parseDataDefinition

2025-01-27 09:40:49 +05:00
parent e43a987e1e
commit 46e5eb1887
8 changed files with 311 additions and 15 deletions
--- a/src/compiler/AST.h
+++ b/src/compiler/AST.h
@@ -32,10 +32,9 @@ List_declare(Operation);


 typedef struct DataDefinition {
-    u32 element_size;
-    u32 count;
    cstr name;
-    void* data;
+    List_u8 data;       // bytes of all values, in the order the parser appends them
+    u32 element_size;   // size of one element in bytes (declared bit width / 8)
 } DataDefinition;

 List_declare(DataDefinition);
--- a/src/compiler/Lexer.c
+++ b/src/compiler/Lexer.c
@@ -139,7 +139,7 @@ static void readArguments(Compiler* cmp){
            }
        }

-        // end of line
+        // end of operation
        else if(c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            if(tok.length > 0)
@@ -195,6 +195,8 @@ static void readInstruction(Compiler* cmp){
        if(c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            List_Token_push(&cmp->tokens, tok);
+            tok = Token_construct(TokenType_OperationEnd, cmp->pos, 1);
+            List_Token_push(&cmp->tokens, tok);
            // cmp->line will be increased in lex()
            return;
        }
@@ -204,6 +206,8 @@ static void readInstruction(Compiler* cmp){
            tok.length = cmp->pos - tok.begin;
            List_Token_push(&cmp->tokens, tok);
            readArguments(cmp);
+            tok = Token_construct(TokenType_OperationEnd, cmp->pos, 1);
+            List_Token_push(&cmp->tokens, tok);
            return;
        }

@@ -219,6 +223,8 @@ static void readInstruction(Compiler* cmp){
    // end of file
    tok.length = cmp->pos - tok.begin;
    List_Token_push(&cmp->tokens, tok);
+    tok = Token_construct(TokenType_OperationEnd, cmp->pos, 1);
+    List_Token_push(&cmp->tokens, tok);
 }

 bool Compiler_lex(Compiler* cmp){
--- a/src/compiler/Parser.c
+++ b/src/compiler/Parser.c
@@ -7,13 +7,136 @@

 #define setError_unexpectedToken(T) {\
    char* tok_str = Compiler_extractTokenStr(cmp, T);\
-    setError("unexpected character '%s'", tok_str);\
+    cmp->pos = (T).begin; /* move the cursor to the token start before reporting */\
+    Compiler_setError(cmp, "unexpected token '%s'", tok_str);\
    free(tok_str);\
 }

+#define setError_unexpectedTokenChar(T, I) do { /* reports the character at offset I inside token T */\
+    cmp->pos = (T).begin + (I);\
+    Compiler_setError(cmp, "unexpected token '%c'", cmp->code[cmp->pos]);\
+} while(0)

-static void parseDataDefinition(Compiler* cmp, char* instr_name, DataDefinition* dataDefPtr){

+#define Error_TokenUnset "token of undefined type" // reported when a token has type TokenType_Unset
+#define Error_BitSize "invalid size in bits"       // reported when the width after `const` is missing or rejected by isVarSizeBits
+
+static void List_u8_pushBytes(List_u8* l, void* value, u32 startIndex, u32 count){
+    // Appends bytes [startIndex, startIndex + count) of `value` to `l`, in order.
+    const u8* bytes = value;
+    for(u32 pos = startIndex, end = startIndex + count; pos < end; pos++)
+        List_u8_push(l, bytes[pos]);
+}
+
+static inline bool isVarSizeBits(u32 B) { return (B == 8 || B == 16 || B == 32 || B == 64); } // true only for the supported widths; the former && chain could never be true
+
+static NULLABLE(u8*) resolveEscapeSequences(Compiler* cmp, cstr src){
+    // Copies src into a new heap buffer, replacing every backslash escape
+    // (backslash, 0, n, r, t, e) with the byte it denotes.
+    // Returns the NUL-terminated buffer, owned by the caller, or NULL after
+    // reporting an error for an unknown escape.
+    // NOTE: a NUL produced by the 0 escape makes strlen() on the result stop early.
+    u32 len = strlen(src);
+    // + 1 for the terminator appended below (assuming the argument is a capacity)
+    List_u8 resolved = List_u8_alloc(len + 1);
+    bool escaped = false;
+    for(u32 i = 0; i < len; i++){
+        char c = src[i];
+        if(!escaped && c == '\\'){
+            escaped = true;
+            continue;
+        }
+        if(!escaped){
+            List_u8_push(&resolved, c);
+            continue;
+        }
+
+        // c follows a backslash; the escape covers this one character only
+        escaped = false;
+        switch(c){
+            case '\\': List_u8_push(&resolved, '\\'); break;
+            case '0':  List_u8_push(&resolved, '\0'); break;
+            case 'n':  List_u8_push(&resolved, '\n'); break;
+            case 'r':  List_u8_push(&resolved, '\r'); break;
+            case 't':  List_u8_push(&resolved, '\t'); break;
+            case 'e':  List_u8_push(&resolved, 0x1B); break; // ESC; '\e' is not standard C
+            default:
+                // callers pass the token text without its opening quote,
+                // so src[i] sits at offset i + 1 inside the token
+                setError_unexpectedTokenChar(cmp->tokens.data[cmp->tok_i], i + 1);
+                free(resolved.data);
+                return NULL;
+        }
+    }
+
+    // terminate the buffer: callers measure it with strlen()
+    List_u8_push(&resolved, '\0');
+    return resolved.data;
+}
+
+static void parseDataDefinition(Compiler* cmp, char* instr_name, DataDefinition* ddf){
+    // Parses one data definition: "const<bits>" in instr_name gives the element size, the next token
+    // becomes ddf->name, and value tokens are appended to ddf->data until the operation ends.
+    i32 _element_size_bits;
+    if(sscanf(instr_name, "const%i", &_element_size_bits) != 1 || !isVarSizeBits(_element_size_bits)){
+        setError(Error_BitSize);
+        return;
+    }
+    ddf->element_size = _element_size_bits / 8;
+
+    Token tok = cmp->tokens.data[++cmp->tok_i];
+    char* tok_str = Compiler_extractTokenStr(cmp, tok);
+    u8* processed_str = NULL;
+    i32 len = 0;
+    ddf->name = tok_str; // ddf keeps this string; it is not freed here
+    while(++cmp->tok_i < cmp->tokens.len){
+        tok = cmp->tokens.data[cmp->tok_i]; // load the token under the cursor; without this the name token is examined forever
+        switch(tok.type){
+            case TokenType_Unset:
+                setError(Error_TokenUnset);
+                return;
+            case TokenType_SingleLineComment:
+            case TokenType_MultiLineComment:
+                // skip comments
+                break;
+            case TokenType_Number:
+                tok_str = Compiler_extractTokenStr(cmp, tok);
+                // NOTE(review): offset 8 - element_size picks the low-order bytes of the i64 only on a big-endian host — confirm byte order
+                if(cstr_seekChar(tok_str, '.', 0, -1) != -1){
+                    f64 f = atof(tok_str);
+                    List_u8_pushBytes(&ddf->data, &f, 8 - ddf->element_size, ddf->element_size);
+                }
+                else {
+                    i64 i = atoll(tok_str);
+                    List_u8_pushBytes(&ddf->data, &i, 8 - ddf->element_size, ddf->element_size);
+                }
+                free(tok_str);
+                break;
+            case TokenType_Char:
+            case TokenType_String:
+                tok.begin += 1;  // drop the opening quote (tok is a local copy)
+                tok.length -= 2; // and exclude both quotes from the length
+                tok_str = Compiler_extractTokenStr(cmp, tok);
+                processed_str = resolveEscapeSequences(cmp, tok_str);
+                free(tok_str);
+                if(processed_str == NULL)
+                    return; // resolveEscapeSequences has already reported the error
+                len = strlen((char*)processed_str);
+                if(tok.type == TokenType_Char && len != ddf->element_size){
+                    setError("can't fit char of size %i in %u bit variable", len, _element_size_bits);
+                    free(processed_str);
+                    return;
+                }
+                List_u8_pushBytes(&ddf->data, processed_str, 0, len);
+                free(processed_str);
+                break;
+            case TokenType_OperationEnd:
+                return;
+            default:
+                setError_unexpectedToken(tok);
+                return;
+        }
+    }
+}


@@ -31,7 +154,7 @@ bool Compiler_parse(Compiler* cmp){
        tok = cmp->tokens.data[cmp->tok_i];
        switch(tok.type){
            case TokenType_Unset:
-                returnError("token of undefined type");
+                returnError(Error_TokenUnset);
            case TokenType_SingleLineComment:
            case TokenType_MultiLineComment:
                // skip comments
@@ -42,6 +165,8 @@ bool Compiler_parse(Compiler* cmp){
                Section_init(sec, Compiler_extractTokenStr(cmp, tok));
                break;
            case TokenType_Instruction:
+                if(sec == NULL)
+                    returnError("no section");
                char* instr_name = Compiler_extractTokenStr(cmp, tok);
                // data definition starts with const
                if(cstr_seek(instr_name, "const", 0, 1)){
--- a/src/compiler/Token.h
+++ b/src/compiler/Token.h
@@ -13,7 +13,8 @@ typedef enum TokenType {
    TokenType_String,               // "aaaa"
    TokenType_Name,                 // xyz
    TokenType_NamedDataPointer,     // @xyz
-    TokenType_NamedDataSize         // #xyz
+    TokenType_NamedDataSize,        // #xyz
+    TokenType_OperationEnd,         // EOL or EOF or ;
 } TokenType;

 cstr TokenType_toString(TokenType t);
@@ -26,4 +27,4 @@ typedef struct Token {

 List_declare(Token);

-#define Token_construct(TYPE, BEGIN, END) ((Token){ .type = TYPE, .begin = BEGIN, .length = END })
+#define Token_construct(TYPE, BEGIN, LEN) ((Token){ .type = TYPE, .begin = BEGIN, .length = LEN }) // builds a Token value; LEN is stored as the token length
--- a/src/compiler/parser.c
+++ b/src/compiler/parser.c
@@ -7,13 +7,136 @@

 #define setError_unexpectedToken(T) {\
    char* tok_str = Compiler_extractTokenStr(cmp, T);\
-    setError("unexpected character '%s'", tok_str);\
+    cmp->pos = (T).begin; /* move the cursor to the token start before reporting */\
+    Compiler_setError(cmp, "unexpected token '%s'", tok_str);\
    free(tok_str);\
 }

+#define setError_unexpectedTokenChar(T, I) do { /* reports the character at offset I inside token T */\
+    cmp->pos = (T).begin + (I);\
+    Compiler_setError(cmp, "unexpected token '%c'", cmp->code[cmp->pos]);\
+} while(0)

-static void parseDataDefinition(Compiler* cmp, char* instr_name, DataDefinition* dataDefPtr){

+#define Error_TokenUnset "token of undefined type" // reported when a token has type TokenType_Unset
+#define Error_BitSize "invalid size in bits"       // reported when the width after `const` is missing or rejected by isVarSizeBits
+
+static void List_u8_pushBytes(List_u8* l, void* value, u32 startIndex, u32 count){
+    // Appends bytes [startIndex, startIndex + count) of `value` to `l`, in order.
+    const u8* bytes = value;
+    for(u32 pos = startIndex, end = startIndex + count; pos < end; pos++)
+        List_u8_push(l, bytes[pos]);
+}
+
+static inline bool isVarSizeBits(u32 B) { return (B == 8 || B == 16 || B == 32 || B == 64); } // true only for the supported widths; the former && chain could never be true
+
+static NULLABLE(u8*) resolveEscapeSequences(Compiler* cmp, cstr src){
+    // Copies src into a new heap buffer, replacing every backslash escape
+    // (backslash, 0, n, r, t, e) with the byte it denotes.
+    // Returns the NUL-terminated buffer, owned by the caller, or NULL after
+    // reporting an error for an unknown escape.
+    // NOTE: a NUL produced by the 0 escape makes strlen() on the result stop early.
+    u32 len = strlen(src);
+    // + 1 for the terminator appended below (assuming the argument is a capacity)
+    List_u8 resolved = List_u8_alloc(len + 1);
+    bool escaped = false;
+    for(u32 i = 0; i < len; i++){
+        char c = src[i];
+        if(!escaped && c == '\\'){
+            escaped = true;
+            continue;
+        }
+        if(!escaped){
+            List_u8_push(&resolved, c);
+            continue;
+        }
+
+        // c follows a backslash; the escape covers this one character only
+        escaped = false;
+        switch(c){
+            case '\\': List_u8_push(&resolved, '\\'); break;
+            case '0':  List_u8_push(&resolved, '\0'); break;
+            case 'n':  List_u8_push(&resolved, '\n'); break;
+            case 'r':  List_u8_push(&resolved, '\r'); break;
+            case 't':  List_u8_push(&resolved, '\t'); break;
+            case 'e':  List_u8_push(&resolved, 0x1B); break; // ESC; '\e' is not standard C
+            default:
+                // callers pass the token text without its opening quote,
+                // so src[i] sits at offset i + 1 inside the token
+                setError_unexpectedTokenChar(cmp->tokens.data[cmp->tok_i], i + 1);
+                free(resolved.data);
+                return NULL;
+        }
+    }
+
+    // terminate the buffer: callers measure it with strlen()
+    List_u8_push(&resolved, '\0');
+    return resolved.data;
+}
+
+static void parseDataDefinition(Compiler* cmp, char* instr_name, DataDefinition* ddf){
+    // Parses one data definition: "const<bits>" in instr_name gives the element size, the next token
+    // becomes ddf->name, and value tokens are appended to ddf->data until the operation ends.
+    i32 _element_size_bits;
+    if(sscanf(instr_name, "const%i", &_element_size_bits) != 1 || !isVarSizeBits(_element_size_bits)){
+        setError(Error_BitSize);
+        return;
+    }
+    ddf->element_size = _element_size_bits / 8;
+
+    Token tok = cmp->tokens.data[++cmp->tok_i];
+    char* tok_str = Compiler_extractTokenStr(cmp, tok);
+    u8* processed_str = NULL;
+    i32 len = 0;
+    ddf->name = tok_str; // ddf keeps this string; it is not freed here
+    while(++cmp->tok_i < cmp->tokens.len){
+        tok = cmp->tokens.data[cmp->tok_i]; // load the token under the cursor; without this the name token is examined forever
+        switch(tok.type){
+            case TokenType_Unset:
+                setError(Error_TokenUnset);
+                return;
+            case TokenType_SingleLineComment:
+            case TokenType_MultiLineComment:
+                // skip comments
+                break;
+            case TokenType_Number:
+                tok_str = Compiler_extractTokenStr(cmp, tok);
+                // NOTE(review): offset 8 - element_size picks the low-order bytes of the i64 only on a big-endian host — confirm byte order
+                if(cstr_seekChar(tok_str, '.', 0, -1) != -1){
+                    f64 f = atof(tok_str);
+                    List_u8_pushBytes(&ddf->data, &f, 8 - ddf->element_size, ddf->element_size);
+                }
+                else {
+                    i64 i = atoll(tok_str);
+                    List_u8_pushBytes(&ddf->data, &i, 8 - ddf->element_size, ddf->element_size);
+                }
+                free(tok_str);
+                break;
+            case TokenType_Char:
+            case TokenType_String:
+                tok.begin += 1;  // drop the opening quote (tok is a local copy)
+                tok.length -= 2; // and exclude both quotes from the length
+                tok_str = Compiler_extractTokenStr(cmp, tok);
+                processed_str = resolveEscapeSequences(cmp, tok_str);
+                free(tok_str);
+                if(processed_str == NULL)
+                    return; // resolveEscapeSequences has already reported the error
+                len = strlen((char*)processed_str);
+                if(tok.type == TokenType_Char && len != ddf->element_size){
+                    setError("can't fit char of size %i in %u bit variable", len, _element_size_bits);
+                    free(processed_str);
+                    return;
+                }
+                List_u8_pushBytes(&ddf->data, processed_str, 0, len);
+                free(processed_str);
+                break;
+            case TokenType_OperationEnd:
+                return;
+            default:
+                setError_unexpectedToken(tok);
+                return;
+        }
+    }
+}


@@ -31,7 +154,7 @@ bool Compiler_parse(Compiler* cmp){
        tok = cmp->tokens.data[cmp->tok_i];
        switch(tok.type){
            case TokenType_Unset:
-                returnError("token of undefined type");
+                returnError(Error_TokenUnset);
            case TokenType_SingleLineComment:
            case TokenType_MultiLineComment:
                // skip comments
@@ -42,6 +165,8 @@ bool Compiler_parse(Compiler* cmp){
                Section_init(sec, Compiler_extractTokenStr(cmp, tok));
                break;
            case TokenType_Instruction:
+                if(sec == NULL)
+                    returnError("no section");
                char* instr_name = Compiler_extractTokenStr(cmp, tok);
                // data definition starts with const
                if(cstr_seek(instr_name, "const", 0, 1)){
--- a/src/compiler/token.h
+++ b/src/compiler/token.h
@@ -13,7 +13,8 @@ typedef enum TokenType {
    TokenType_String,               // "aaaa"
    TokenType_Name,                 // xyz
    TokenType_NamedDataPointer,     // @xyz
-    TokenType_NamedDataSize         // #xyz
+    TokenType_NamedDataSize,        // #xyz
+    TokenType_OperationEnd,         // EOL or EOF or ;
 } TokenType;

 cstr TokenType_toString(TokenType t);
@@ -26,4 +27,4 @@ typedef struct Token {

 List_declare(Token);

-#define Token_construct(TYPE, BEGIN, END) ((Token){ .type = TYPE, .begin = BEGIN, .length = END })
+#define Token_construct(TYPE, BEGIN, LEN) ((Token){ .type = TYPE, .begin = BEGIN, .length = LEN }) // builds a Token value; LEN is stored as the token length
--- a/src/cstr.c
+++ b/src/cstr.c
@@ -89,4 +89,32 @@ i32 cstr_seekReverse(const char* src, const char* fragment, u32 startIndex, u32
        }
    }
    return -1;
-}
+}
+
+
+i32 cstr_seekChar(const char* src, char fragment, u32 startIndex, u32 seekLength){
+    // Forward scan from startIndex over at most seekLength chars, stopping at the terminator.
+    // Returns the index of the first char equal to fragment, or -1.
+    if(*src == 0 || fragment == 0) return -1;
+    for(u32 offset = 0; offset < seekLength; offset++){
+        char cur = src[startIndex + offset];
+        if(cur == fragment) return startIndex + offset;
+        if(cur == 0) break;
+    }
+    return -1;
+}
+
+i32 cstr_seekCharReverse(const char* src, char fragment, u32 startIndex, u32 seekLength){
+    // Backward scan from startIndex over at most seekLength chars; startIndex == -1 starts
+    // from the last char of src. Returns the index of the first match met, or -1.
+    if(*src == 0 || fragment == 0)
+        return -1;
+    if(startIndex == (u32)-1)
+        startIndex = strlen(src) - 1;
+    // count visited chars so the window is measured from startIndex, not from the string end
+    for(u32 si = startIndex, seen = 0; si != (u32)-1 && seen < seekLength; si--, seen++){
+        if(src[si] == fragment)
+            return si;
+    }
+    return -1;
+}
--- a/src/std.h
+++ b/src/std.h
@@ -27,6 +27,7 @@ typedef u8 bool;
 typedef const char* cstr;

 #define ARRAY_SIZE(A) sizeof(A)/sizeof(A[0])
+#define ALIGN_TO(_SIZE,_ALIGN)        (((_SIZE) + ((_ALIGN) - 1)) & ~((_ALIGN) - 1)) // rounds _SIZE up to a multiple of _ALIGN; correct only when _ALIGN is a power of two

 #define __count_args( \
    a0, a1, a2, a3, a4, a5, a6, a7 , a8, a9, a10,a11,a12,a13,a14,a15, \
@@ -67,3 +68,13 @@ i32 cstr_seek(const char* src, const char* fragment, u32 startIndex, u32 seekLen
 /// @param seekLength 0 ... -1
 /// @return pos of first <fragment> inclusion in <src> or -1 if not found
 i32 cstr_seekReverse(const char* src, const char* fragment, u32 startIndex, u32 seekLength);
+
+/// @param startIndex 0 ... src length
+/// @param seekLength 0 ... -1 (max number of chars to examine; -1 means no limit)
+/// @return pos of the first <fragment> char in <src> at or after startIndex, or -1 if not found
+i32 cstr_seekChar(const char* src, char fragment, u32 startIndex, u32 seekLength);
+
+/// @param startIndex -1 ... src length (-1 starts from the last char of <src>)
+/// @param seekLength 0 ... -1 (-1 means no limit)
+/// @return pos of the nearest <fragment> char in <src> at or before startIndex, or -1 if not found
+i32 cstr_seekCharReverse(const char* src, char fragment, u32 startIndex, u32 seekLength);