started working on parser

example
compiler
2025-01-21 01:17:30 +05:00 · 2025-01-20 22:52:44 +05:00 · 2025-01-20 22:52:23 +05:00 · 2025-01-20 22:42:32 +05:00
19 changed files with 724 additions and 557 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -7,7 +7,7 @@
            "request": "launch",
            "program": "${workspaceFolder}/bin/tcpu",
            "windows": { "program": "${workspaceFolder}/bin/tcpu.exe" },
-            "args": [ "-c", "s.tasm", "o.bin" ],
+            "args": [ "-c", "../examples/s.tasm", "o.bin", "--debug" ],
            "cwd": "${workspaceFolder}/bin",
            "preLaunchTask": "build_exec_dbg",
            "stopAtEntry": false,
--- a/examples/s.tasm
+++ b/examples/s.tasm
@@ -0,0 +1,16 @@
+/*
+"hello world" program in my assembly language
+*/
+
+.data:
+// named array of 8-bit values
+const8 msg "Hello, World :3\0"
+
+.main:
+push ax 1; // sys_write
+push bx 1; // stdout
+push cx @msg; // address of msg data
+push dx #msg; // size of msg data
+sys
+push ax 0
+exit
--- a/src/VM/VM.c
+++ b/src/VM/VM.c
@@ -49,7 +49,7 @@ i32 VM_boot(VM* vm){
    while (vm->current_pos < vm->data_size){
        u8 opcode = vm->data[vm->current_pos];

-        const Instruction* instr = Instruction_getFromOpcode(opcode);
+        const Instruction* instr = Instruction_getByOpcode(opcode);
        // printfe("[at 0x%x] %02X %s\n", (u32)vm->current_pos, opcode, instr->name);
        if(instr == NULL){
            VM_setError(vm, "unknown opcode %02X", opcode);
--- a/src/collections/List.h
+++ b/src/collections/List.h
@@ -16,22 +16,27 @@
        return List_##T##_construct((T*)(len > 0 ? malloc(len * sizeof(T)) : NULL), 0, 0);\
    }\
    \
-    void List_##T##_push(List_##T* ptr, T value);
+    T* List_##T##_expand(List_##T* ptr);\
+    void List_##T##_push(List_##T* ptr, T value);\


 #define List_define(T)\
-    void List_##T##_push(List_##T* ptr, T value){\
-        u32 max_len = ptr->max_len;\
-        if(ptr->len == max_len){\
-            max_len = max_len * 1.5;\
+    T* List_##T##_expand(List_##T* ptr){\
+        if(ptr->len == ptr->max_len){\
+            u32 max_len = ptr->max_len * 1.5;\
            max_len += __List_padding_in_sizeof_T(T);\
            /* branchless version of max(max_len, __List_min_size) */\
            max_len += (max_len < __List_min_size) * (__List_min_size - max_len);\
            ptr->data = (T*)realloc(ptr->data, max_len * sizeof(T));\
            ptr->max_len = max_len;\
        }\
-        ptr->data[ptr->len++] = value;\
-    }
+        return &ptr->data[ptr->len++];\
+    }\
+    \
+    void List_##T##_push(List_##T* ptr, T value){\
+        T* empty_cell_ptr = List_##T##_expand(ptr);\
+        *empty_cell_ptr = value;\
+    }\

 #define __List_min_size 16

--- a/src/compiler/AST.c
+++ b/src/compiler/AST.c
@@ -0,0 +1,45 @@
+#include "AST.h"
+
+List_define(Argument);
+List_define(Operation);
+List_define(DataDefinition);
+
+static cstr _ArgumentType_str[] = {
+    "Unset",
+    "Register",
+    "ConstValue",
+    "DataName",
+    "NamedDataPointer",
+    "NamedDataSize",
+};
+
+cstr ArgumentType_toString(ArgumentType t){
+    if(t >= ARRAY_SIZE(_ArgumentType_str))
+        return "!!INDEX_ERROR!!";
+    return _ArgumentType_str[t];
+}
+
+
+void Section_init(Section* sec, char* name){
+    sec->name = name;
+    sec->data = List_DataDefinition_alloc(256);
+    sec->code = List_Operation_alloc(1024);
+}
+
+void Section_free(Section* sec){
+    free(sec->name); // the name pointer stored by Section_init() is released here
+    free(sec->data.data);
+    free(sec->code.data); // NOTE(review): only the list storage is freed; Operation.args.data and DataDefinition.data are not — confirm ownership
+}
+
+
+void AST_init(AST* ast){
+    ast->sections = List_Section_alloc(32);
+}
+
+void AST_free(AST* ast){
+    for(u32 i = 0; i != ast->sections.len; i++){
+        Section_free(&ast->sections.data[i]);
+    }
+    free(ast->sections.data);
+}
--- a/src/compiler/AST.h
+++ b/src/compiler/AST.h
@@ -4,36 +4,57 @@
 #include "../collections/List.h"

 typedef enum ArgumentType {
-    ArgumentType_NoArgument,
+    ArgumentType_Unset,
    ArgumentType_Register,
    ArgumentType_ConstValue,
-    ArgumentType_Name,
+    ArgumentType_DataName,
    ArgumentType_NamedDataPointer,
    ArgumentType_NamedDataSize,
-};
+} ArgumentType;
+
+cstr ArgumentType_toString(ArgumentType t);
+

 typedef struct Argument {
    ArgumentType type;
    u32 value;
 } Argument;

+List_declare(Argument);
+
+
+typedef struct Operation {
+    List_Argument args;
+    Opcode op;
+} Operation;
+
+List_declare(Operation);
+
+
 typedef struct DataDefinition {
-    u8 element_size;
-    u16 size;
-    void* data;
+    u32 element_size;
+    u32 count;
    cstr name;
-};
+    void* data;
+} DataDefinition;

 List_declare(DataDefinition);

+
+typedef struct Section {
+    char* name;
+    List_DataDefinition data;
+    List_Operation code;
+} Section;
+
+List_declare(Section);
+
+void Section_init(Section* Section, char* name);
+void Section_free(Section* Section);
+
 typedef struct AST {
-    DataDefinition
+    List_Section sections;
 } AST;

-/*
-d8 name ''
-
-goto label
-.label:
-code...
-*/
+void AST_init(AST* ast);
+void AST_free(AST* ast);
--- a/src/compiler/Compiler.c
+++ b/src/compiler/Compiler.c
@@ -0,0 +1,149 @@
+#include "Compiler_internal.h"
+
+
+void Compiler_init(Compiler* cmp){
+    memset(cmp, 0, sizeof(Compiler)); // zero-fills the whole struct (pos, tok_i, code, error_message, ...)
+    cmp->state = CompilerState_Initial;
+    cmp->tokens = List_Token_alloc(4096);
+    cmp->line_lengths = List_u32_alloc(1024);
+    AST_init(&cmp->ast);
+}
+
+void Compiler_free(Compiler* cmp){
+    free(cmp->code); // source text buffer assigned in Compiler_compile()
+    free(cmp->tokens.data);
+    free(cmp->line_lengths.data);
+    AST_free(&cmp->ast); // NOTE(review): cmp->error_message is not released in this function — confirm who frees it
+}
+
+CodePos Compiler_getLineAndColumn(Compiler* cmp, u32 pos){ // maps a buffer index to a 1-based line/column; (0, 0) means "not found"
+    u32 prev_lines_len = 0; // summed lengths of the lines preceding line i
+    if(pos >= cmp->code_len)
+        return CodePos_create(0, 0);
+    
+    for(u32 i = 0; i < cmp->line_lengths.len; i++){
+        u32 line_len = cmp->line_lengths.data[i];
+        if(prev_lines_len + line_len > pos) // pos falls inside line i
+            return CodePos_create(i + 1, pos + 1 - prev_lines_len);
+        prev_lines_len += line_len;
+    }
+
+    return CodePos_create(0, 0); // pos lies beyond the lines recorded so far
+}
+
+void _Compiler_setError(Compiler* cmp, cstr context, cstr format, ...){
+    // happens at the end of file
+    if(cmp->pos >= cmp->code_len)
+        cmp->pos = cmp->code_len - 1;
+    char position_str[32];
+    CodePos code_pos = Compiler_getLineAndColumn(cmp, cmp->pos);
+    sprintf(position_str, "[at %u:%u][", code_pos.line, code_pos.column);
+    char* real_format = strcat_malloc(position_str, context, "] ", format);
+    va_list argv;
+    va_start(argv, format);
+    char* NULLABLE(buf) = vsprintf_malloc(512, real_format, argv);
+    va_end(argv);
+    free(real_format);
+    if(buf == NULL){
+        buf = malloc(16);
+        strcpy(buf, "SPRINTF FAILED");
+    }
+    cmp->state = CompilerState_Error;
+    cmp->error_message = buf;
+}
+
+#define setError(FORMAT, ...) {\
+    Compiler_setError(cmp, FORMAT, ##__VA_ARGS__);\
+}
+
+char* Compiler_extractTokenStr(Compiler* cmp, Token t){ // returns a malloc'ed, NUL-terminated copy of the token text; the caller frees it
+    char* s = malloc(t.length + 1);
+    memcpy(s, cmp->code + t.begin, t.length); // copy from the token's own offset, not from the start of the buffer
+    s[t.length] = 0;
+    return s;
+}
+
+static bool compileFile(Compiler* cmp, FILE* f){
+    returnErrorIf_auto(cmp->state != CompilerState_Parsing);
+    cmp->state = CompilerState_Compiling;
+
+    return true;
+}
+
+bool Compiler_compile(Compiler* cmp, cstr source_file_name, cstr out_file_name, bool debug_log){ // load source -> lex -> parse -> compile; false on any failure
+    FILE* f = fopen(source_file_name, "rb");
+    if(f == NULL)
+        returnError("ERROR: can't open file '%s'", source_file_name);
+    // read the whole source file into a growable byte buffer
+    List_u8 buf = List_u8_alloc(64 * 1024);
+    int ret;
+    while((ret = fgetc(f)) != EOF) {
+        List_u8_push(&buf, ret);
+    }
+    if(ferror(f)){
+        free(buf.data);
+        fclose(f);
+        returnError("can't read file '%s'", source_file_name);
+    }
+    fclose(f);
+
+    if(buf.len == 0){
+        free(buf.data);
+        // f has already been closed above; a second fclose() would be undefined behavior
+        returnError("source file is empty");
+    }
+
+    cmp->code_len = buf.len; // length without the terminator pushed on the next line
+    List_u8_push(&buf, 0);
+    cmp->code = (char*)buf.data; // read only after the push: growing the list may move the buffer
+
+    f = fopen(out_file_name, "wb");
+    if(f == NULL){
+        // the buffer now belongs to cmp->code and is released by Compiler_free(), so it is not freed here
+        returnError("ERROR: can't open file '%s'", out_file_name);
+    }
+
+    if(debug_log){
+        printf("----------------------------------[%s]---------------------------------\n", source_file_name);
+        fputs(cmp->code, stdout);
+        fputc('\n', stdout);
+    }
+    
+    bool success = Compiler_lex(cmp);
+    if(debug_log){
+        printf("------------------------------------[lines]-----------------------------------\n");
+        for(u32 i = 0; i < cmp->line_lengths.len; i++){
+            printf("[%u] length: %u\n", i+1, cmp->line_lengths.data[i]);
+        }
+        printf("------------------------------------[tokens]-----------------------------------\n");
+        for(u32 i = 0; i < cmp->tokens.len; i++){
+            Token t = cmp->tokens.data[i];
+            CodePos pos = Compiler_getLineAndColumn(cmp, t.begin);
+            char* tokstr = malloc(t.length + 1); // sized to the token: long comments can exceed any fixed buffer
+            memcpy(tokstr, cmp->code + t.begin, t.length);
+            tokstr[t.length] = 0;
+            printf("[l:%3u, c:%3u] %s '%s'\n", 
+                pos.line, pos.column,
+                TokenType_toString(t.type), tokstr);
+            free(tokstr);
+        }
+    }
+    if(!success){
+        fclose(f);
+        return false;
+    }
+
+    success = Compiler_parse(cmp);
+    if(!success){
+        fclose(f);
+        return false;
+    }
+
+    success = compileFile(cmp, f);
+    fclose(f);
+    if(success){
+        cmp->state = CompilerState_Success;
+    }
+    
+    return success;
+}
--- a/src/compiler/Lexer.c
+++ b/src/compiler/Lexer.c
@@ -0,0 +1,263 @@
+#include "Compiler_internal.h"
+
+#define setError(FORMAT, ...) {\
+    completeLine(cmp);\
+    Compiler_setError(cmp, FORMAT, ##__VA_ARGS__);\
+}
+
+#define Error_unexpectedCharacter(C) "unexpected character '%c'", C
+#define Error_endOfFile "unexpected end of file"
+
+static void completeLine(Compiler* cmp){
+    List_u32_push(&cmp->line_lengths, cmp->column);
+    cmp->column = 0;
+}
+
+static void readCommentSingleLine(Compiler* cmp){
+    char c; // '/'
+    Token tok = Token_construct(TokenType_SingleLineComment, cmp->pos - 1, 0);
+    cmp->column++;
+    cmp->pos++;
+    
+    while(cmp->pos < cmp->code_len){
+        c = cmp->code[cmp->pos];
+        // end of line
+        if(c == '\r' || c == '\n'){
+            tok.length = cmp->pos - tok.begin;
+            List_Token_push(&cmp->tokens, tok);
+            // cmp->line will be increased in lex()
+            return;
+        }
+        
+        cmp->column++;
+        cmp->pos++;
+    }
+    
+    // end of file
+    tok.length = cmp->pos - tok.begin;
+    List_Token_push(&cmp->tokens, tok);
+}
+
+static void readCommentMultiLine(Compiler* cmp){
+    char c; // '*'
+    Token tok = Token_construct(TokenType_MultiLineComment, cmp->pos - 1, 0);
+    cmp->column++;
+    cmp->pos++;
+    
+    while(cmp->pos < cmp->code_len){
+        c = cmp->code[cmp->pos];
+        // closing comment
+        if(cmp->pos > tok.begin + 3 && c == '/' && cmp->code[cmp->pos - 1] == '*') {
+            tok.length = cmp->pos - tok.begin + 1;
+            List_Token_push(&cmp->tokens, tok);
+            return;
+        }
+
+        if(c == '\n')
+            completeLine(cmp);
+        cmp->column++;
+        cmp->pos++;
+    }
+    
+    // end of file
+    setError(Error_endOfFile);
+}
+
+static void readComment(Compiler* cmp){
+    char c; // '/'
+    if(cmp->pos + 1 == cmp->code_len){
+        setError(Error_endOfFile);
+        return;
+    }
+
+    c = cmp->code[cmp->pos + 1];
+    if(c == '\r' || c == '\n'){
+        setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
+        return;
+    }
+
+    cmp->pos++;
+    cmp->column++;
+    if(c == '/')
+        readCommentSingleLine(cmp);
+    else if(c == '*')
+        readCommentMultiLine(cmp);
+    else setError(Error_unexpectedCharacter(c));
+}
+
+static void readLabel(Compiler* cmp){
+    char c; // '.'
+    cmp->pos++;
+    cmp->column++;
+    Token tok = Token_construct(TokenType_Label, cmp->pos, 0);
+    
+    while(cmp->pos < cmp->code_len){
+        c = cmp->code[cmp->pos];
+        // end of line
+        if(c == ':' || c == '\r' || c == '\n'){
+            tok.length = cmp->pos - tok.begin;
+            if(tok.length > 0)
+                List_Token_push(&cmp->tokens, tok);
+            else setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
+            // cmp->line will be increased in lex()
+            return;
+        }
+
+        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c) &&
+            c != '_' && c != '.'){
+            setError(Error_unexpectedCharacter(c));
+            return;
+        }
+        
+        cmp->column++;
+        cmp->pos++;
+    }
+    
+    // end of file
+    tok.length = cmp->pos - tok.begin;
+    if(tok.length > 0)
+        List_Token_push(&cmp->tokens, tok);
+    else setError(Error_endOfFile);
+}
+
+static void readArguments(Compiler* cmp){
+    char c; // space
+    Token tok = Token_construct(TokenType_Unset, cmp->pos, 0);
+    char quot = '\0'; // quotation character of a string value
+
+    while(cmp->pos < cmp->code_len){
+        c = cmp->code[cmp->pos];
+
+        // string argument reading
+        if(quot != '\0'){
+            if(c == quot && cmp->code[cmp->pos - 1] != '\\'){
+                quot = '\0';
+            }
+            else if(c == '\r' || c == '\n'){
+                setError("line end reached but string hasn't been closed yet");
+                return;
+            }
+        }
+
+        // end of line
+        else if(c == '\r' || c == '\n' || c == ';'){
+            tok.length = cmp->pos - tok.begin;
+            if(tok.length > 0)
+                List_Token_push(&cmp->tokens, tok);
+            // cmp->line will be increased in lex()
+            return;
+        }
+
+        // new argument begins
+        else if(c == ' ' || c == '\t'){
+            tok.length = cmp->pos - tok.begin;
+            if(tok.length > 0)
+                List_Token_push(&cmp->tokens, tok);
+            tok = Token_construct(TokenType_Unset, cmp->pos + 1, 0);
+        }
+
+        else if(tok.type == TokenType_Unset){
+            if(c == '\''){
+                tok.type = TokenType_Char;
+                quot = c;
+            }
+            else if(c == '"'){
+                tok.type = TokenType_String;
+                quot = c;
+            }
+            else if(c == '@')
+                tok.type = TokenType_NamedDataPointer;
+            else if(c == '#')
+                tok.type = TokenType_NamedDataSize;
+            else if(isDigit(c))
+                tok.type = TokenType_Number;
+            else tok.type = TokenType_Name;
+        }
+
+        cmp->column++;
+        cmp->pos++;
+    }
+    
+    // end of file
+    tok.length = cmp->pos - tok.begin;
+    if(tok.length > 0)
+        List_Token_push(&cmp->tokens, tok);
+}
+
+static void readInstruction(Compiler* cmp){
+    Token tok = Token_construct(TokenType_Instruction, cmp->pos, 0);
+    cmp->pos++;
+    cmp->column++;
+    
+    while(cmp->pos < cmp->code_len){
+        char c = cmp->code[cmp->pos];
+        // end of line
+        if(c == '\r' || c == '\n' || c == ';'){
+            tok.length = cmp->pos - tok.begin;
+            List_Token_push(&cmp->tokens, tok);
+            // cmp->line will be increased in lex()
+            return;
+        }
+
+        // arguments begin
+        if(c == ' ' || c == '\t'){
+            tok.length = cmp->pos - tok.begin;
+            List_Token_push(&cmp->tokens, tok);
+            readArguments(cmp);
+            return;
+        }
+
+        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c)){
+            setError(Error_unexpectedCharacter(c));
+            return;
+        }
+
+        cmp->column++;
+        cmp->pos++;
+    }
+    
+    // end of file
+    tok.length = cmp->pos - tok.begin;
+    List_Token_push(&cmp->tokens, tok);
+}
+
+bool Compiler_lex(Compiler* cmp){
+    returnErrorIf_auto(cmp->state != CompilerState_Initial);
+    cmp->state = CompilerState_Lexing;
+    cmp->column = 1;
+
+    while(cmp->pos < cmp->code_len){
+        char c = cmp->code[cmp->pos];
+        switch(c){
+            // skip blank characters
+            case ' ': case '\t': case '\r': case '\n':
+                break;
+            // try read comment
+            case '/':
+                readComment(cmp);
+                break;
+            // try read label
+            case '.':
+                readLabel(cmp);
+                break;
+            default:
+                // try read instruction
+                if(isAlphabeticalLower(c) || isAlphabeticalUpper(c))
+                    readInstruction(cmp);
+                else returnError(Error_unexpectedCharacter(c));
+                break;
+        }
+
+        if(cmp->state == CompilerState_Error)
+            return false;
+        
+        c = cmp->code[cmp->pos];
+        if(c == '\n')
+            completeLine(cmp);
+        cmp->column++;
+        cmp->pos++;
+    }
+
+    completeLine(cmp);
+    return true;
+}
--- a/src/compiler/compiler.c
+++ b/src/compiler/compiler.c
@@ -1,470 +0,0 @@
-#include "Lexer.h"
-
-List_define(Token);
-
-CodePos Lexer_getLineAndColumn(Lexer* cmp, u32 pos){
-    u32 prev_lines_len = 0;
-    if(pos >= cmp->code_len)
-        return CodePos_create(0, 0);
-    
-    for(u32 i = 0; i < cmp->line_lengths.len; i++){
-        u32 line_len = cmp->line_lengths.data[i];
-        if(prev_lines_len + line_len > pos)
-            return CodePos_create(i + 1, pos + 1 - prev_lines_len);
-        prev_lines_len += line_len;
-    }
-
-    return CodePos_create(0, 0);
-}
-
-static void completeLine(Lexer* cmp){
-    List_u32_push(&cmp->line_lengths, cmp->column);
-    cmp->column = 0;
-}
-
-void _Lexer_setError(Lexer* cmp, cstr context, cstr format, ...){
-    completeLine(cmp);
-    // happens at the end of file
-    if(cmp->pos >= cmp->code_len)
-        cmp->pos = cmp->code_len - 1;
-    char position_str[32];
-    CodePos code_pos = Lexer_getLineAndColumn(cmp, cmp->pos);
-    sprintf(position_str, "[at %u:%u][", code_pos.line, code_pos.column);
-    char* real_format = strcat_malloc(position_str, context, "] ", format);
-    va_list argv;
-    va_start(argv, format);
-    char* NULLABLE(buf) = vsprintf_malloc(512, real_format, argv);
-    va_end(argv);
-    free(real_format);
-    if(buf == NULL){
-        buf = malloc(16);
-        strcpy(buf, "SPRINTF FAILED");
-    }
-    cmp->state = LexerState_Error;
-    cmp->error_message = buf;
-}
-
-
-#define setError(FORMAT, ...) Lexer_setError(cmp, FORMAT, ##__VA_ARGS__)
-
-#define returnError(FORMAT, ...) {\
-    setError(FORMAT, ##__VA_ARGS__);\
-    return false;\
-}
-
-#define returnErrorIf(STATEMENT, FORMAT, ...) if(STATEMENT) returnError(FORMAT, ##__VA_ARGS__)
-
-#define returnErrorIf_auto(STATEMENT) returnErrorIf(STATEMENT, #STATEMENT)
-
-#define Error_unexpectedCharacter(C) "unexpected character '%c'", C
-#define Error_endOfFile "unexpected end of file"
-
-void Lexer_init(Lexer* cmp){
-    memset(cmp, 0, sizeof(Lexer));
-    cmp->state = LexerState_Initial;
-    cmp->tokens = List_Token_alloc(4096);
-    cmp->line_lengths = List_u32_alloc(1024);
-}
-
-void Lexer_free(Lexer* cmp){
-    free(cmp->code);
-    free(cmp->tokens.data);
-    free(cmp->line_lengths.data);
-}
-
-static void readCommentSingleLine(Lexer* cmp){
-    char c; // '/'
-    Token tok = Token_construct(TokenType_SingleLineComment, cmp->pos - 1, 0);
-    cmp->column++;
-    cmp->pos++;
-    
-    while(cmp->pos < cmp->code_len){
-        c = cmp->code[cmp->pos];
-        // end of line
-        if(c == '\r' || c == '\n'){
-            tok.length = cmp->pos - tok.begin;
-            List_Token_push(&cmp->tokens, tok);
-            // cmp->line will be increased in lex()
-            return;
-        }
-        
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    tok.length = cmp->pos - tok.begin;
-    List_Token_push(&cmp->tokens, tok);
-}
-
-static void readCommentMultiLine(Lexer* cmp){
-    char c; // '*'
-    Token tok = Token_construct(TokenType_MultiLineComment, cmp->pos - 1, 0);
-    cmp->column++;
-    cmp->pos++;
-    
-    while(cmp->pos < cmp->code_len){
-        c = cmp->code[cmp->pos];
-        // closing comment
-        if(cmp->pos > tok.begin + 3 && c == '/' && cmp->code[cmp->pos - 1] == '*') {
-            tok.length = cmp->pos - tok.begin + 1;
-            List_Token_push(&cmp->tokens, tok);
-            return;
-        }
-
-        if(c == '\n')
-            completeLine(cmp);
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    setError(Error_endOfFile);
-}
-
-static void readComment(Lexer* cmp){
-    char c; // '/'
-    if(cmp->pos + 1 == cmp->code_len){
-        setError(Error_endOfFile);
-        return;
-    }
-
-    c = cmp->code[cmp->pos + 1];
-    if(c == '\r' || c == '\n'){
-        setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
-        return;
-    }
-
-    cmp->pos++;
-    cmp->column++;
-    if(c == '/')
-        readCommentSingleLine(cmp);
-    else if(c == '*')
-        readCommentMultiLine(cmp);
-    else setError(Error_unexpectedCharacter(c));
-}
-
-static void readLabel(Lexer* cmp){
-    char c; // '.'
-    cmp->pos++;
-    cmp->column++;
-    Token tok = Token_construct(TokenType_Label, cmp->pos, 0);
-    
-    while(cmp->pos < cmp->code_len){
-        c = cmp->code[cmp->pos];
-        // end of line
-        if(c == ':' || c == '\r' || c == '\n'){
-            tok.length = cmp->pos - tok.begin;
-            if(tok.length > 0)
-                List_Token_push(&cmp->tokens, tok);
-            else setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
-            // cmp->line will be increased in lex()
-            return;
-        }
-
-        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c) &&
-            c != '_' && c != '.'){
-            setError(Error_unexpectedCharacter(c));
-            return;
-        }
-        
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    tok.length = cmp->pos - tok.begin;
-    if(tok.length > 0)
-        List_Token_push(&cmp->tokens, tok);
-    else setError(Error_endOfFile);
-}
-
-static void readArguments(Lexer* cmp){
-    char c; // space
-    cmp->pos++;
-    cmp->column++;
-    Token tok = Token_construct(TokenType_Argument, cmp->pos, 0);
-    
-    while(cmp->pos < cmp->code_len){
-        c = cmp->code[cmp->pos];
-        // end of line
-        if(c == '\r' || c == '\n' || c == ';'){
-            tok.length = cmp->pos - tok.begin;
-            if(tok.length > 0)
-                List_Token_push(&cmp->tokens, tok);
-            // cmp->line will be increased in lex()
-            return;
-        }
-
-        // new argument begins
-        if(c == ' ' || c == '\t'){
-            tok.length = cmp->pos - tok.begin;
-            if(tok.length > 0)
-                List_Token_push(&cmp->tokens, tok);
-            tok.begin = cmp->pos + 1;
-        }
-        
-
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    tok.length = cmp->pos - tok.begin;
-    if(tok.length > 0)
-        List_Token_push(&cmp->tokens, tok);
-}
-
-static void readInstruction(Lexer* cmp){
-    Token tok = Token_construct(TokenType_Instruction, cmp->pos, 0);
-    cmp->pos++;
-    cmp->column++;
-    
-    while(cmp->pos < cmp->code_len){
-        char c = cmp->code[cmp->pos];
-        // end of line
-        if(c == '\r' || c == '\n' || c == ';'){
-            tok.length = cmp->pos - tok.begin;
-            List_Token_push(&cmp->tokens, tok);
-            // cmp->line will be increased in lex()
-            return;
-        }
-
-        // arguments begin
-        if(c == ' ' || c == '\t'){
-            tok.length = cmp->pos - tok.begin;
-            List_Token_push(&cmp->tokens, tok);
-            readArguments(cmp);
-            return;
-        }
-
-        if(!isAlphabeticalLower(c) && !isAlphabeticalUpper(c) && !isDigit(c)){
-            setError(Error_unexpectedCharacter(c));
-            return;
-        }
-
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    tok.length = cmp->pos - tok.begin;
-    List_Token_push(&cmp->tokens, tok);
-}
-
-static void readChar(Lexer* cmp){
-    Token tok = Token_construct(TokenType_Char, cmp->pos, 0);
-    cmp->pos++;
-    cmp->column++;
-    
-    while(cmp->pos < cmp->code_len){
-        char c = cmp->code[cmp->pos];
-        // end of line
-        if(c == '\r' || c == '\n'){
-            setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
-            return;
-        }
-
-        if(c == '\''){
-            tok.length = cmp->pos - tok.begin + 1;
-            List_Token_push(&cmp->tokens, tok);
-            return;
-        }
-
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    setError(Error_endOfFile);
-}
-
-static void readString(Lexer* cmp){
-    Token tok = Token_construct(TokenType_String, cmp->pos, 0);
-    cmp->pos++;
-    cmp->column++;
-    
-    while(cmp->pos < cmp->code_len){
-        char c = cmp->code[cmp->pos];
-        // end of line
-        if(c == '\r' || c == '\n'){
-            setError(Error_unexpectedCharacter(cmp->code[--cmp->pos]));
-            return;
-        }
-
-        if(c == '"'){
-            tok.length = cmp->pos - tok.begin + 1;
-            List_Token_push(&cmp->tokens, tok);
-            return;
-        }
-
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    setError(Error_endOfFile);
-}
-
-static void readNumber(Lexer* cmp){
-    Token tok = Token_construct(TokenType_Number, cmp->pos, 0);
-    cmp->pos++;
-    cmp->column++;
-    
-    while(cmp->pos < cmp->code_len){
-        char c = cmp->code[cmp->pos];
-
-        if(c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == ',' || c == ';'){
-            tok.length = cmp->pos - tok.begin;
-            List_Token_push(&cmp->tokens, tok);
-            return;
-        }
-
-        cmp->column++;
-        cmp->pos++;
-    }
-    
-    // end of file
-    tok.length = cmp->pos - tok.begin;
-    List_Token_push(&cmp->tokens, tok);
-}
-
-static bool lex(Lexer* cmp){
-    returnErrorIf_auto(cmp->state != LexerState_Initial);
-    cmp->state = LexerState_Lexing;
-    cmp->column = 1;
-
-    while(cmp->pos < cmp->code_len){
-        char c = cmp->code[cmp->pos];
-        switch(c){
-            // skip blank characters
-            case ' ': case '\t': case '\r': case '\n':
-                break;
-            // try read comment
-            case '/':
-                readComment(cmp);
-                break;
-            // try read label
-            case '.':
-                readLabel(cmp);
-                break;
-            case '"':
-                readString(cmp);
-                break;
-            case '\'':
-                readChar(cmp);
-                break;
-            default:
-                // try read instruction
-                if(isAlphabeticalLower(c) || isAlphabeticalUpper(c))
-                    readInstruction(cmp);
-                else if(isDigit(c))
-                    readNumber(cmp);
-                else returnError(Error_unexpectedCharacter(c));
-                break;
-        }
-
-        if(cmp->state == LexerState_Error)
-            return false;
-        
-        c = cmp->code[cmp->pos];
-        if(c == '\n')
-            completeLine(cmp);
-        cmp->column++;
-        cmp->pos++;
-    }
-
-    completeLine(cmp);
-    return true;
-}
-
-static bool parse(Lexer* cmp){
-    returnErrorIf_auto(cmp->state != LexerState_Lexing);
-    cmp->state = LexerState_Parsing;
-
-    return true;
-}
-static bool compile(Lexer* cmp, FILE* f){
-    returnErrorIf_auto(cmp->state != LexerState_Parsing);
-    cmp->state = LexerState_Compiling;
-
-    return true;
-}
-
-bool Lexer_compileTasm(Lexer* cmp, cstr source_file_name, cstr out_file_name, bool debug){
-    FILE* f = fopen(source_file_name, "rb");
-    if(f == NULL)
-        returnError("ERROR: can't open file '%s'", source_file_name);
-    
-    List_u8 buf = List_u8_alloc(64 * 1024);
-    int ret;
-    while((ret = fgetc(f)) != EOF) {
-        List_u8_push(&buf, ret);
-    }
-    if(ferror(f)){
-        free(buf.data);
-        fclose(f);
-        returnError("can't read file '%s'", source_file_name);
-    }
-    fclose(f);
-
-    if(buf.len == 0){
-        free(buf.data);
-        fclose(f);
-        returnError("soucre file is empty");
-    }
-
-    cmp->code = (char*)buf.data;
-    cmp->code_len = buf.len;
-    List_u8_push(&buf, 0);
-
-    f = fopen(out_file_name, "wb");
-    if(f == NULL){
-        free(buf.data);
-        returnError("ERROR: can't open file '%s'", out_file_name);
-    }
-
-    if(debug){
-        printf("----------------------------------[%s]---------------------------------\n", source_file_name);
-        fputs(cmp->code, stdout);
-        fputc('\n', stdout);
-    }
-    
-    bool success = lex(cmp);
-    if(debug){
-        printf("------------------------------------[lines]-----------------------------------\n");
-        for(u32 i = 0; i < cmp->line_lengths.len; i++){
-            printf("[%u] length: %u\n", i+1, cmp->line_lengths.data[i]);
-        }
-        printf("------------------------------------[tokens]-----------------------------------\n");
-        for(u32 i = 0; i < cmp->tokens.len; i++){
-            Token t = cmp->tokens.data[i];
-            CodePos pos = Lexer_getLineAndColumn(cmp, t.begin);
-            char* tokstr = malloc(4096);
-            strncpy(tokstr, cmp->code + t.begin, t.length);
-            tokstr[t.length] = 0;
-            printf("[l:%3u, c:%3u] %s '%s'\n", 
-                pos.line, pos.column,
-                TokenType_toString(t.type), tokstr);
-            free(tokstr);
-        }
-    }
-    if(!success){
-        fclose(f);
-        return false;
-    }
-
-    success = parse(cmp);
-    if(!success){
-        fclose(f);
-        return false;
-    }
-
-    success = compile(cmp, f);
-    fclose(f);
-    if(success){
-        cmp->state = LexerState_Success;
-    }
-    
-    return success;
-}
--- a/src/compiler/compiler.h
+++ b/src/compiler/compiler.h
@@ -1,48 +1,34 @@
 #pragma once
 #include "../std.h"
 #include "../collections/List.h"
-#include "token.h"
+#include "Token.h"
 #include "AST.h"

-List_declare(Token);
+typedef enum CompilerState {
+    CompilerState_Initial,
+    CompilerState_Lexing,
+    CompilerState_Parsing,
+    CompilerState_Compiling,
+    CompilerState_Error,
+    CompilerState_Success
+} CompilerState;

-typedef enum LexerState {
-    LexerState_Initial,
-    LexerState_Lexing,
-    LexerState_Parsing,
-    LexerState_Compiling,
-    LexerState_Error,
-    LexerState_Success
-} LexerState;
-
-typedef struct Lexer {
+typedef struct Compiler {
    char* code;
    u32 code_len;
    u32 column;     // > 0 if code parsing started
    u32 pos;
-    LexerState state;
+    CompilerState state;
    NULLABLE(char* error_message);
    List_Token tokens;
    List_u32 line_lengths;
-} Lexer;
+    AST ast;
+    u32 tok_i;
+} Compiler;

-void Lexer_init(Lexer* cmp);
-void Lexer_free(Lexer* cmp);
+void Compiler_init(Compiler* cmp);
+void Compiler_free(Compiler* cmp);

 /// @brief compile assembly language code to machine code
 /// @return true if no errors, false if any error occured (check cmp->error_message)
-bool Lexer_compileTasm(Lexer* cmp, cstr source_file_name, cstr out_file_name, bool debug);
-
-#define Lexer_setError(cmp, format, ...) _Lexer_setError(cmp, __func__, format ,##__VA_ARGS__)
-void _Lexer_setError(Lexer* cmp, cstr context, cstr format, ...) __attribute__((__format__(__printf__, 3, 4)));
-
-typedef struct CodePos {
-    u32 line; // 0 on error
-    u32 column; // 0 on error
-} CodePos;
-
-#define CodePos_create(L, C) ((CodePos){ .line = L, .column = C })
-
-/// @param pos index in code buffer
-CodePos Lexer_getLineAndColumn(Lexer* cmp, u32 pos);
-
+bool Compiler_compile(Compiler* cmp, cstr source_file_name, cstr out_file_name, bool debug);
--- a/src/compiler/compiler_internal.h
+++ b/src/compiler/compiler_internal.h
@@ -0,0 +1,29 @@
+#include "Compiler.h"
+
+void _Compiler_setError(Compiler* cmp, cstr context, cstr format, ...) __attribute__((__format__(__printf__, 3, 4)));
+
+#define Compiler_setError(cmp, format, ...) _Compiler_setError(cmp, __func__, format ,##__VA_ARGS__)
+
+/// Error-return helpers for functions that return bool.
+/// They expand to `setError(...)`, which is NOT defined in this header:
+/// the including source file has to define its own setError macro before using them.
+
+/// @brief report an error through setError and return false from the current function
+/// NOTE: expands to a bare { } block, so `if(x) returnError("..."); else ...` does not compile
+#define returnError(FORMAT, ...) {\
+    setError(FORMAT, ##__VA_ARGS__);\
+    return false;\
+}
+
+/// @brief returnError(FORMAT, ...) if STATEMENT is true
+/// NOTE: expands to an `if` without braces around it, so an `else` written after it binds to this `if`
+#define returnErrorIf(STATEMENT, FORMAT, ...) if(STATEMENT) returnError(FORMAT, ##__VA_ARGS__)
+
+/// @brief returnErrorIf that uses the source text of STATEMENT as the error message
+/// The text is passed as FORMAT, so STATEMENT should not contain '%'
+#define returnErrorIf_auto(STATEMENT) returnErrorIf(STATEMENT, #STATEMENT)
+
+/// @brief line and column of a position in the code buffer
+/// (the type returned by Compiler_getLineAndColumn)
+typedef struct CodePos {
+    u32 line; // 0 on error
+    u32 column; // 0 on error
+} CodePos;
+
+/// @brief make a CodePos value from line L and column C
+#define CodePos_create(L, C) ((CodePos){ .line = L, .column = C })
+
+/// @param pos index in code buffer
+CodePos Compiler_getLineAndColumn(Compiler* cmp, u32 pos);
+
+char* Compiler_extractTokenStr(Compiler* cmp, Token t);
+
+bool Compiler_lex(Compiler* cmp);
+bool Compiler_parse(Compiler* cmp);
--- a/src/compiler/parser.c
+++ b/src/compiler/parser.c
@@ -1,3 +1,68 @@
-#include "compiler.h"
+#include "Compiler_internal.h"

-List_define(DataDefinition);
+/// @brief set compiler error and move cmp->pos to the beginning of the token being parsed
+/// Requires a `Compiler* cmp` variable in scope.
+/// cmp->pos is left unchanged when cmp->tok_i is not a valid token index
+/// (for example when an error is reported before any token exists),
+/// so tokens.data is never read out of bounds.
+#define setError(FORMAT, ...) {\
+    if(cmp->tok_i < cmp->tokens.len)\
+        cmp->pos = cmp->tokens.data[cmp->tok_i].begin;\
+    Compiler_setError(cmp, FORMAT, ##__VA_ARGS__);\
+}
+
+/// @brief set "unexpected character" error that quotes the text of token T
+/// The string returned by Compiler_extractTokenStr is freed right after the setError call.
+#define setError_unexpectedToken(T) {\
+    char* tok_str = Compiler_extractTokenStr(cmp, T);\
+    setError("unexpected character '%s'", tok_str);\
+    free(tok_str);\
+}
+
+
+/// @brief parse a data definition (instruction whose name starts with "const") into *dataDefPtr
+/// Not implemented yet: the body is empty.
+/// @param instr_name text of the instruction token, allocated by Compiler_extractTokenStr
+/// @param dataDefPtr new element of the current section's data list
+/// NOTE(review): nothing frees instr_name at the moment, decide whether this function takes ownership
+static void parseDataDefinition(Compiler* cmp, char* instr_name, DataDefinition* dataDefPtr){
+
+}
+
+
+/// @brief parse an operation (any instruction that is not a data definition) into *operPtr
+/// Not implemented yet: the body is empty.
+/// @param instr_name text of the instruction token, allocated by Compiler_extractTokenStr
+/// @param operPtr new element of the current section's code list
+/// NOTE(review): nothing frees instr_name at the moment, decide whether this function takes ownership
+static void parseOperation(Compiler* cmp, char* instr_name, Operation* operPtr){
+
+}
+
+/// @brief build cmp->ast from the tokens stored in cmp->tokens
+/// Comments are skipped, every label opens a new section, every instruction is added
+/// to the current section as a data definition (name starts with "const") or as an operation.
+/// Must be called while cmp->state is CompilerState_Lexing.
+/// @return true on success, false on error (check cmp->error_message)
+bool Compiler_parse(Compiler* cmp){
+    returnErrorIf_auto(cmp->state != CompilerState_Lexing);
+    cmp->state = CompilerState_Parsing;
+    Section* sec = NULL; // section opened by the most recent label
+
+    while(cmp->tok_i < cmp->tokens.len){
+        Token tok = cmp->tokens.data[cmp->tok_i];
+        switch(tok.type){
+            case TokenType_Unset:
+                returnError("token of undefined type");
+            case TokenType_SingleLineComment:
+            case TokenType_MultiLineComment:
+                // skip comments
+                break;
+            case TokenType_Label:
+                // create new section
+                sec = List_Section_expand(&cmp->ast.sections);
+                Section_init(sec, Compiler_extractTokenStr(cmp, tok));
+                break;
+            // braces are required: before C23 a declaration can't directly follow a case label
+            case TokenType_Instruction: {
+                // sec stays NULL until the first label, it must not be dereferenced
+                returnErrorIf(sec == NULL, "instruction outside of any section");
+                char* instr_name = Compiler_extractTokenStr(cmp, tok);
+                // data definition starts with const
+                if(strncmp(instr_name, "const", sizeof("const") - 1) == 0){
+                    DataDefinition* dataDefPtr = List_DataDefinition_expand(&sec->data);
+                    parseDataDefinition(cmp, instr_name, dataDefPtr);
+                }
+                else {
+                    Operation* operPtr = List_Operation_expand(&sec->code);
+                    parseOperation(cmp, instr_name, operPtr);
+                }
+                // NOTE(review): instr_name is not freed here, the parse functions are expected
+                // to either store or free it once they are implemented
+                break;
+            }
+            default:
+                setError_unexpectedToken(tok);
+                return false;
+        }
+
+        // parseDataDefinition and parseOperation report errors through cmp->state
+        if(cmp->state == CompilerState_Error)
+            return false;
+
+        cmp->tok_i++;
+    }
+
+    return true;
+}
--- a/src/compiler/token.c
+++ b/src/compiler/token.c
@@ -1,19 +1,19 @@
-#include "token.h"
+#include "Token.h"
+
+List_define(Token);

+// NOTE(review): entries presumably have to stay in the same order as enum TokenType
+// (the body of TokenType_toString is not visible here, confirm how it indexes this table)
 static cstr _TokenType_str[] = {
    "Unset",
    "SingleLineComment",
    "MultiLineComment",
+    "Instruction",
    "Label",
-    "DataDefinition",
    "Number",
    "Char",
    "String",
-    "Instruction",
-    "Register",
-    "DataType",
-    "DataPointer",
-    "DataSize"
+    "Name",
+    "NamedDataPointer",
+    "NamedDataSize"
 };

 cstr TokenType_toString(TokenType t){
--- a/src/compiler/token.h
+++ b/src/compiler/token.h
@@ -1,20 +1,19 @@
 #pragma once
 #include "../std.h"
+#include "../collections/List.h"

 typedef enum TokenType {
-    TokenType_Unset,
-    TokenType_SingleLineComment,
-    TokenType_MultiLineComment,
-    TokenType_Label,
-    TOkenType_DataDefinition,
-    TokenType_Number,
-    TokenType_Char,
-    TokenType_String,
-    TokenType_Instruction,
-    TokenType_Register,
-    TokenType_DataType,
-    TokenType_DataPointer,
-    TokenType_DataSize
+    TokenType_Unset,                // initial value
+    TokenType_SingleLineComment,    // //comment
+    TokenType_MultiLineComment,     // /* comment */
+    TokenType_Instruction,          // abc
+    TokenType_Label,                // .abc:
+    TokenType_Number,               // 0123
+    TokenType_Char,                 // 'A'
+    TokenType_String,               // "aaaa"
+    TokenType_Name,                 // xyz
+    TokenType_NamedDataPointer,     // @xyz
+    TokenType_NamedDataSize         // #xyz
 } TokenType;

 cstr TokenType_toString(TokenType t);
@@ -25,4 +24,6 @@ typedef struct Token {
    TokenType type : 8; // type of token (8 bits)
 } Token;

+List_declare(Token);
+
 /// NOTE(review): the third argument is stored in .length although the parameter is named END,
 /// confirm that callers pass a length and not an end position
 #define Token_construct(TYPE, BEGIN, END) ((Token){ .type = TYPE, .begin = BEGIN, .length = END })
--- a/src/cstr.c
+++ b/src/cstr.c
@@ -50,3 +50,43 @@ char* NULLABLE(vsprintf_malloc)(size_t buffer_size, cstr format, va_list argv){
    }
    return buf;
 }
+
+/// @brief find the first occurrence of <fragment> in <src>
+/// @param startIndex index in <src> of the first candidate match position
+/// @param seekLength max number of candidate positions to try, (u32)-1 for no limit
+/// @return index in <src> where the match begins, or -1 if there is no match,
+///         <src> or <fragment> is empty, or startIndex is out of range
+i32 cstr_seek(const char* src, const char* fragment, u32 startIndex, u32 seekLength){
+    u32 src_len = (u32)strlen(src);
+    u32 fr_len = (u32)strlen(fragment);
+    if(src_len == 0 || fr_len == 0)
+        return -1;
+    // fragment doesn't fit between startIndex and the end of src
+    if(startIndex > src_len || fr_len > src_len - startIndex)
+        return -1;
+
+    // a match can't begin after this index
+    u32 last_start = src_len - fr_len;
+    // tried positions are counted because startIndex + seekLength could overflow
+    u32 fr_start = startIndex;
+    for(u32 tried = 0; tried < seekLength && fr_start <= last_start; tried++, fr_start++){
+        // every candidate is compared starting from fragment[0]
+        if(memcmp(src + fr_start, fragment, fr_len) == 0)
+            return (i32)fr_start;
+    }
+    return -1;
+}
+
+/// @brief find the last occurrence of <fragment> in <src>, searching backwards
+/// @param startIndex index in <src> of the last character of the first candidate match;
+///        (u32)-1 or any index past the end means the last character of <src>
+/// @param seekLength max number of candidate positions to try, (u32)-1 for no limit
+/// @return index in <src> where the match begins, or -1 if there is no match
+///         or <src> or <fragment> is empty
+i32 cstr_seekReverse(const char* src, const char* fragment, u32 startIndex, u32 seekLength){
+    u32 src_len = (u32)strlen(src);
+    u32 fr_len = (u32)strlen(fragment);
+    if(src_len == 0 || fr_len == 0)
+        return -1;
+    // also covers startIndex == (u32)-1
+    if(startIndex >= src_len)
+        startIndex = src_len - 1;
+
+    // fr_end is the index of the last character of the candidate match;
+    // the loop stops when fewer than fr_len characters remain at or before fr_end
+    // (this also ends the loop after fr_end wraps around below zero)
+    u32 fr_end = startIndex;
+    for(u32 tried = 0; tried < seekLength && fr_end + 1 >= fr_len; tried++, fr_end--){
+        u32 fr_start = fr_end + 1 - fr_len;
+        // fragment is compared in its natural order, not reversed
+        if(memcmp(src + fr_start, fragment, fr_len) == 0)
+            return (i32)fr_start;
+    }
+    return -1;
+}
--- a/src/instructions/instructions.c
+++ b/src/instructions/instructions.c
@@ -29,7 +29,7 @@ const Instruction instructions[] = {
    // Instruction_construct(CALL),
 };

-const Instruction* Instruction_getFromOpcode(Opcode opcode){
+const Instruction* Instruction_getByOpcode(Opcode opcode){
    if(opcode >= ARRAY_SIZE(instructions))
        return NULL;
    
--- a/src/instructions/instructions.h
+++ b/src/instructions/instructions.h
@@ -33,4 +33,4 @@ typedef struct Instruction {
 /// @brief get instruction info from table
 /// @param opcode any byte
 /// @return ptr to struct or NULL
-const Instruction* NULLABLE(Instruction_getFromOpcode)(Opcode opcode);
+const Instruction* NULLABLE(Instruction_getByOpcode)(Opcode opcode);
--- a/src/main.c
+++ b/src/main.c
@@ -1,11 +1,11 @@
 #include "VM/VM.h"
 #include "instructions/instructions.h"
 #include "collections/List.h"
-#include "compiler/compiler.h"
+#include "compiler/Compiler.h"

 #define arg_is(STR) (strcmp(argv[argi], STR) == 0)

-i32 compileSources(cstr source_file, cstr out_file);
+i32 compileSources(cstr source_file, cstr out_file, bool debug_log);
 i32 bootFromImage(cstr image_file);

 i32 main(const i32 argc, cstr* argv){
@@ -21,6 +21,8 @@ i32 main(const i32 argc, cstr* argv){
    cstr NULLABLE(out_file) = NULL;
    cstr NULLABLE(source_file) = NULL;

+    bool debug_log = false;
+
    for(i32 argi = 1; argi < argc; argi++){
        if(arg_is("-h") || arg_is("--help")){
            printf(
@@ -28,12 +30,13 @@ i32 main(const i32 argc, cstr* argv){
                "-op, --opcodes                             Show list of all instructions.\n"
                "-i, --image [FILE]                         Boot VM using image file.\n"
                "-c, --compile [SOURCE_FILE] [OUT_FILE]     Compile assembly source files to machine code.\n"
+                "-d, --debug                                Enable debug log.\n"
            );
            return 0;
        }
        else if(arg_is("-op") || arg_is("--opcodes")){
            for(u8 opcode = 0; opcode < 255; opcode++){
-                const Instruction* instr = Instruction_getFromOpcode(opcode);
+                const Instruction* instr = Instruction_getByOpcode(opcode);
                if(instr != NULL){
                    printf("%02X %s\n", opcode, instr->name);
                }
@@ -72,6 +75,9 @@ i32 main(const i32 argc, cstr* argv){
            }
            out_file = argv[argi];
        }
+        else if(arg_is("-d") || arg_is("--debug")){
+            debug_log = true;
+        }
        else {
            printfe("ERROR: unknown argument '%s'\n", argv[argi]);
            return 1;
@@ -80,7 +86,7 @@ i32 main(const i32 argc, cstr* argv){

    i32 exit_code = 0;
    if(compile){
-        exit_code = compileSources(source_file, out_file);
+        exit_code = compileSources(source_file, out_file, debug_log);
    }
    if(exit_code == 0 && boot){
        exit_code = bootFromImage(image_file);
@@ -131,19 +137,20 @@ i32 bootFromImage(cstr image_file){
    return exit_code;
 }

+/// @brief compile source_file to out_file and print the compiler error, if any
+/// @param debug_log passed to Compiler_compile as its debug flag
+/// @return 0 on success, 111 if compilation failed
-i32 compileSources(cstr source_file, cstr out_file){
+i32 compileSources(cstr source_file, cstr out_file, bool debug_log){
    Compiler cmp;
    Compiler_init(&cmp);
-    bool success = Compiler_compileTasm(&cmp, source_file, out_file, true);
-    Compiler_free(&cmp);
+    bool success = Compiler_compile(&cmp, source_file, out_file, debug_log);
    if(!success){
        if(cmp.error_message){
            printfe("COMPILER ERROR: %s\n", cmp.error_message);
            free(cmp.error_message);
+            // cmp is still passed to Compiler_free below: don't leave a dangling pointer in it
+            cmp.error_message = NULL;
        }
        else printfe("COMPILER ERROR: unknown (error_message is null)\n");
+        Compiler_free(&cmp);
        return 111;
    }
-
+    
+    Compiler_free(&cmp);
    return 0;
 }
--- a/src/std.h
+++ b/src/std.h
@@ -57,3 +57,13 @@ char* NULLABLE(vsprintf_malloc)(size_t buffer_size, cstr format, va_list argv);
 static inline bool isAlphabeticalLower(char c) { return 'a' <= c && c <= 'z'; }
 static inline bool isAlphabeticalUpper(char c) { return 'A' <= c && c <= 'Z'; }
 static inline bool isDigit(char c) { return '0' <= c && c <= '9'; }
+
+/// @param startIndex 0 ... src length
+/// @param seekLength 0 ... -1
+/// @return pos of first <fragment> inclusion in <src> or -1 if not found
+i32 cstr_seek(const char* src, const char* fragment, u32 startIndex, u32 seekLength);
+
+/// @param startIndex -1 ... src length
+/// @param seekLength 0 ... -1
+/// @return pos of first <fragment> inclusion in <src> or -1 if not found
+i32 cstr_seekReverse(const char* src, const char* fragment, u32 startIndex, u32 seekLength);
Author	SHA1	Message	Date
Timerix	83e28c9022	started working on parser	2025-01-21 01:17:30 +05:00
Timerix	5c9197436f	example	2025-01-20 22:52:44 +05:00
Timerix	f710aa4199	compiler	2025-01-20 22:52:23 +05:00
Timerix	facacc90f8	argument parsing	2025-01-20 22:42:32 +05:00