data tokens lexing

semicolon
2024-11-21 12:03:53 +05:00 · 2024-11-21 09:48:54 +05:00
3 changed files with 104 additions and 15 deletions
--- a/src/compiler/compiler.c
+++ b/src/compiler/compiler.c
@@ -162,7 +162,8 @@ static void readLabel(Compiler* cmp){
            return;
        }

-        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c)){
+        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c) &&
+            c != '_' && c != '.'){
            setError(Error_unexpectedCharacter(c));
            return;
        }
@@ -187,7 +188,7 @@ static void readArguments(Compiler* cmp){
    while(cmp->pos < cmp->code_len){
        c = cmp->code[cmp->pos];
        // end of line
-        if(c == '\r' || c == '\n'){
+        if(c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            if(tok.length > 0)
                List_Token_push(&cmp->tokens, tok);
@@ -222,7 +223,7 @@ static void readInstruction(Compiler* cmp){
    while(cmp->pos < cmp->code_len){
        char c = cmp->code[cmp->pos];
        // end of line
-        if(c == '\r' || c == '\n'){
+        if(c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            List_Token_push(&cmp->tokens, tok);
            // cmp->line will be increased in lex()
@@ -251,6 +252,83 @@ static void readInstruction(Compiler* cmp){
    List_Token_push(&cmp->tokens, tok);
 }

+static void readChar(Compiler* cmp){ // lexes one single-quoted literal beginning at cmp->pos and pushes it as a TokenType_Char token
+    Token tok = Token_construct(TokenType_Char, cmp->pos, 0); // token starts at the current position; length is filled in once the closing quote is found
+    cmp->pos++; // step over the first character unexamined (presumably the opening quote: lex() dispatches here on '\'')
+    cmp->column++;
+    // scan forward until the closing quote, a line break, or the end of the input
+    while(cmp->pos < cmp->code_len){
+        char c = cmp->code[cmp->pos];
+        // end of line: a line break before the closing quote is reported as an error
+        if(c == '\r' || c == '\n'){
+            setError(Error_unexpectedCharacter(cmp->code[--cmp->pos])); // moves pos back by one and reports the character just before the line break
+            return;
+        }
+
+        if(c == '\''){ // NOTE(review): any ' ends the literal; no escape sequences are recognized here, so an escaped quote would end the token early
+            tok.length = cmp->pos - tok.begin + 1; // +1 so the token text includes the closing quote as well as the first character
+            List_Token_push(&cmp->tokens, tok);
+            return; // pos and column are left on the closing quote, not past it
+        }
+
+        cmp->column++;
+        cmp->pos++;
+    }
+    // reached only when the loop runs out of input without seeing a closing quote
+    // end of file
+    setError(Error_endOfFile);
+}
+
+static void readString(Compiler* cmp){ // lexes one double-quoted literal beginning at cmp->pos and pushes it as a TokenType_String token
+    Token tok = Token_construct(TokenType_String, cmp->pos, 0); // token starts at the current position; length is filled in once the closing quote is found
+    cmp->pos++; // step over the first character unexamined (presumably the opening quote: lex() dispatches here on '"')
+    cmp->column++;
+    // scan forward until the closing quote, a line break, or the end of the input
+    while(cmp->pos < cmp->code_len){
+        char c = cmp->code[cmp->pos];
+        // end of line: a line break before the closing quote is reported as an error (strings cannot span lines here)
+        if(c == '\r' || c == '\n'){
+            setError(Error_unexpectedCharacter(cmp->code[--cmp->pos])); // moves pos back by one and reports the character just before the line break
+            return;
+        }
+
+        if(c == '"'){ // NOTE(review): any " ends the string; no escape sequences are recognized here, so an escaped quote would end the token early
+            tok.length = cmp->pos - tok.begin + 1; // +1 so the token text includes the closing quote as well as the first character
+            List_Token_push(&cmp->tokens, tok);
+            return; // pos and column are left on the closing quote, not past it
+        }
+
+        cmp->column++;
+        cmp->pos++;
+    }
+    // reached only when the loop runs out of input without seeing a closing quote
+    // end of file
+    setError(Error_endOfFile);
+}
+
+static void readNumber(Compiler* cmp){ // lexes a run of characters beginning at cmp->pos and pushes it as a TokenType_Number token
+    Token tok = Token_construct(TokenType_Number, cmp->pos, 0); // token starts at the current position; length is filled in when a delimiter or the end of input is reached
+    cmp->pos++; // the first character is consumed unconditionally (lex() dispatches here when isDigit(c) holds)
+    cmp->column++;
+    // NOTE(review): nothing below checks that the characters are digits; everything up to the next delimiter becomes part of the token
+    while(cmp->pos < cmp->code_len){
+        char c = cmp->code[cmp->pos];
+
+        if(c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == ',' || c == ';'){ // line break, space, tab, comma or semicolon ends the number
+            tok.length = cmp->pos - tok.begin; // the delimiter itself is not part of the token
+            List_Token_push(&cmp->tokens, tok);
+            return; // pos is left on the delimiter
+        }
+
+        cmp->column++;
+        cmp->pos++;
+    }
+    // unlike readChar/readString, running out of input is not an error: the token extends to the end of the code
+    // end of file
+    tok.length = cmp->pos - tok.begin;
+    List_Token_push(&cmp->tokens, tok);
+}
+
 static bool lex(Compiler* cmp){
    returnErrorIf_auto(cmp->state != CompilerState_Initial);
    cmp->state = CompilerState_Lexing;
@@ -270,13 +348,20 @@ static bool lex(Compiler* cmp){
            case '.':
                readLabel(cmp);
                break;
+            case '"':
+                readString(cmp);
+                break;
+            case '\'':
+                readChar(cmp);
+                break;
            default:
                // try read instruction
-                if(isAlphabeticalLower(c) || isAlphabeticalUpper(c)){
+                if(isAlphabeticalLower(c) || isAlphabeticalUpper(c))
                    readInstruction(cmp);
-                    break;
-                }
+                else if(isDigit(c))
+                    readNumber(cmp);
                else returnError(Error_unexpectedCharacter(c));
+                break;
        }

        if(cmp->state == CompilerState_Error)
@@ -358,9 +443,8 @@ bool Compiler_compileTasm(Compiler* cmp, cstr source_file_name, cstr out_file_na
            char* tokstr = malloc(4096);
            strncpy(tokstr, cmp->code + t.begin, t.length);
            tokstr[t.length] = 0;
-            printf("[l:%u, c:%u](pos:%u, size:%u) %s '%s'\n", 
+            printf("[l:%3u, c:%3u] %s '%s'\n", 
                pos.line, pos.column,
-                t.begin, t.length,
                TokenType_toString(t.type), tokstr);
            free(tokstr);
        }
--- a/src/compiler/token.c
+++ b/src/compiler/token.c
@@ -1,15 +1,19 @@
 #include "token.h"

-static cstr TokenType_str[] = {
+static cstr _TokenType_str[] = { // display names indexed by TokenType value (see TokenType_toString); NOTE(review): identifiers starting with '_' + uppercase letter are reserved in C, consider another name
    "Unset",
    "SingleLineComment",
    "MultiLineComment",
    "Label",
    "Instruction",
    "Argument",
-    "Data",
+    "Number", // the three entries below are listed in the same order as TokenType_Number/Char/String in token.h
+    "Char",
+    "String",
 };

 cstr TokenType_toString(TokenType t){ // returns the display name of t, or "!!INDEX_ERROR!!" when t has no entry in the name table
-    return TokenType_str[t];
+    if(t >= sizeof(_TokenType_str) / sizeof(cstr)) // right-hand side is the number of entries in the name table
+        return "!!INDEX_ERROR!!";
+    return _TokenType_str[t];
 }
--- a/src/compiler/token.h
+++ b/src/compiler/token.h
@@ -8,16 +8,17 @@ typedef enum TokenType {
    TokenType_Label,
    TokenType_Instruction,
    TokenType_Argument,
-    TokenType_Data,
-    /* there is a place for 2 values left (TokenType must occupy 4 bits) */
+    TokenType_Number,
+    TokenType_Char,
+    TokenType_String,
 } TokenType;

 cstr TokenType_toString(TokenType t);

 typedef struct Token { // one lexed token: where it starts in the source text, how long it is, and its kind
    u32 begin;          // index in Compiler->code of the token's first character
-    u32 length : 28;    // length in characters (28 bits)
-    TokenType type : 4; // type of token (4 bits)
+    u32 length : 24;    // length in characters (24-bit bit-field)
+    TokenType type : 8; // type of token (8-bit bit-field); NOTE(review): an enum-typed bit-field is implementation-defined in C, confirm the target compilers accept it
 } Token;

 #define Token_construct(TYPE, BEGIN, END) ((Token){ .type = TYPE, .begin = BEGIN, .length = END }) // builds a Token value from a compound literal; NOTE(review): the third argument is stored in .length, so END is really a length, not an end index
Author	SHA1	Message	Date
Timerix	bd8215fd73	data tokens lexing	2024-11-21 12:03:53 +05:00
Timerix	ad232f187a	semicolon	2024-11-21 09:48:54 +05:00