TCPU/src/compiler/Lexer.c

#include "Compiler_internal.h"

// Reports a lexing error: completes the current line with completeLine()
// and then forwards the printf-style FORMAT and its optional arguments
// to Compiler_setError().
// Requires a `Compiler* cmp` variable to be visible at the call site.
// The body is wrapped in do { } while(0) so that `setError(...);` is exactly
// one statement and stays valid inside unbraced if/else branches.
#define setError(FORMAT, ...) do {\
    completeLine(cmp);\
    Compiler_setError(cmp, FORMAT, ##__VA_ARGS__);\
} while(0)

// Error message templates meant to be passed to setError() / returnError().
// Error_unexpectedCharacter expands to TWO comma-separated arguments:
// the format string and the offending character.
#define Error_unexpectedCharacter(C) "unexpected character '%c'", (C)
// Error_endOfFile expands to a plain message without format arguments.
#define Error_endOfFile "unexpected end of file"

// Finishes the current line: appends the current column counter
// to cmp->line_lengths and resets the counter to 0.
// Called on every '\n', at the end of Compiler_lex() and by setError().
static void completeLine(Compiler* cmp){
    // the counter must be stored before it is reset
    List_u32_push(&cmp->line_lengths, cmp->column);
    cmp->column = 0;
}

// Reads a `//` comment and pushes it as a TokenType_SingleLineComment token.
// The token starts at the first '/' and ends right before the line break
// (or at the end of file).
// Expects cmp->pos to point at the second '/' of the comment opening.
static void readCommentSingleLine(Compiler* cmp){
    // the first '/' is located one character behind cmp->pos
    Token comment = Token_construct(TokenType_SingleLineComment, cmp->pos - 1, 0);

    // step over the second '/' and then over every character that is not a line break
    do {
        cmp->pos++;
        cmp->column++;
    } while(cmp->pos < cmp->code.len
        && cmp->code.data[cmp->pos] != '\r'
        && cmp->code.data[cmp->pos] != '\n');

    // cmp->pos stopped at the line break or at the end of file;
    // the line break itself is handled in Compiler_lex()
    comment.length = cmp->pos - comment.begin;
    List_Token_push(&cmp->tokens, comment);
}

// Reads a `/* ... */` comment and pushes it as a TokenType_MultiLineComment token
// that spans from the opening '/' to the closing '/' inclusive.
// Expects cmp->pos to point at the '*' of the comment opening.
// On success cmp->pos is left on the closing '/'.
// Sets an error if the end of file is reached before the comment is closed.
static void readCommentMultiLine(Compiler* cmp){
    char c; // '*'
    Token tok = Token_construct(TokenType_MultiLineComment, cmp->pos - 1, 0);
    cmp->column++;
    cmp->pos++;

    while(cmp->pos < cmp->code.len){
        c = cmp->code.data[cmp->pos];
        // closing comment
        // The '*' of the closing pair must not be the '*' of the opening pair
        // (which is at tok.begin + 1), so the earliest valid closing '/' is at
        // tok.begin + 3: "/*/" does not close the comment, but "/**/" does.
        if(cmp->pos > tok.begin + 2 && c == '/' && cmp->code.data[cmp->pos - 1] == '*') {
            tok.length = cmp->pos - tok.begin + 1;
            List_Token_push(&cmp->tokens, tok);
            return;
        }

        // line lengths are tracked inside of the comment too
        if(c == '\n')
            completeLine(cmp);
        cmp->column++;
        cmp->pos++;
    }

    // end of file
    setError(Error_endOfFile);
}

// Decides which kind of comment begins at cmp->pos and reads it.
// Expects cmp->pos to point at a '/'.
// "//" is read by readCommentSingleLine(), "/*" by readCommentMultiLine().
// A '/' followed by anything else (or by the end of file) sets an error.
static void readComment(Compiler* cmp){
    // '/' is the last character of the file
    if(cmp->pos + 1 >= cmp->code.len){
        setError(Error_endOfFile);
        return;
    }

    char next = cmp->code.data[cmp->pos + 1];
    if(next == '\r' || next == '\n'){
        // A lone '/' at the end of a line: report the '/' itself instead of the line break.
        // cmp->pos has not been advanced yet, so it must not be decremented here
        // (that would read the character before '/' and underflow at offset 0).
        setError(Error_unexpectedCharacter(cmp->code.data[cmp->pos]));
        return;
    }

    // step onto the second character of the comment opening
    cmp->pos++;
    cmp->column++;
    if(next == '/')
        readCommentSingleLine(cmp);
    else if(next == '*')
        readCommentMultiLine(cmp);
    else setError(Error_unexpectedCharacter(next));
}

// Reads a label name that follows a leading '.' and pushes it as a TokenType_Label token.
// The token covers only the name, without the '.' and without the terminator.
// The name ends at ':', a line break or the end of file and may contain only
// characters accepted by isAlphabeticalLower(), isAlphabeticalUpper(), isDigit(), '_' and '.'.
// An empty name or any other character sets an error.
// Expects cmp->pos to point at the leading '.'.
static void readLabel(Compiler* cmp){
    // step over the '.'
    cmp->pos++;
    cmp->column++;
    Token label = Token_construct(TokenType_Label, cmp->pos, 0);

    for(; cmp->pos < cmp->code.len; cmp->column++, cmp->pos++){
        char c = cmp->code.data[cmp->pos];

        // ':' or a line break terminates the name
        if(c == ':' || c == '\r' || c == '\n'){
            label.length = cmp->pos - label.begin;
            if(label.length > 0){
                List_Token_push(&cmp->tokens, label);
                // the terminator is left for Compiler_lex() to handle
                return;
            }
            // nothing was read after the '.': step back and report the '.' itself
            setError(Error_unexpectedCharacter(cmp->code.data[--cmp->pos]));
            return;
        }

        if(!(isAlphabeticalLower(c) || isAlphabeticalUpper(c) || isDigit(c) ||
            c == '_' || c == '.')){
            setError(Error_unexpectedCharacter(c));
            return;
        }
    }

    // end of file
    label.length = cmp->pos - label.begin;
    if(label.length > 0){
        List_Token_push(&cmp->tokens, label);
        return;
    }
    setError(Error_endOfFile);
}

// Reads the arguments of an instruction and pushes one token per argument.
// Arguments are separated by ' ' or '\t'; the list ends at a line break, ';'
// or the end of file (the terminator is not consumed).
// The token type is chosen by the first character of the argument:
//   '\'' -> TokenType_Char, '"' -> TokenType_String,
//   '@' -> TokenType_NamedDataPointer, '#' -> TokenType_NamedDataSize,
//   digit -> TokenType_Number, anything else -> TokenType_Name.
// Inside of a quoted value separators and ';' are part of the value and
// a backslash escapes the next character, so `\"` does not close the string
// while the quote in `\\"` does.
// Sets an error if a quoted value is not closed before the line break or the end of file.
// Expects cmp->pos to point at the blank character that follows the instruction name.
static void readArguments(Compiler* cmp){
    char c; // space
    Token tok = Token_construct(TokenType_Unset, cmp->pos, 0);
    char quot = '\0'; // quotation character of a string value
    bool escaped = false; // true if the previous character of a string value was an unescaped '\\'

    while(cmp->pos < cmp->code.len){
        c = cmp->code.data[cmp->pos];

        // string argument reading
        if(quot != '\0'){
            if(c == '\r' || c == '\n'){
                setError("line end reached but string hasn't been closed yet");
                return;
            }

            // Escapes are tracked with a flag instead of looking at the previous byte,
            // otherwise the closing quote after an escaped backslash would be missed.
            if(escaped)
                escaped = false;
            else if(c == '\\')
                escaped = true;
            else if(c == quot)
                quot = '\0';
        }

        // end of operation
        else if(c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            if(tok.length > 0)
                List_Token_push(&cmp->tokens, tok);
            // cmp->line will be increased in lex()
            return;
        }

        // new argument begins
        else if(c == ' ' || c == '\t'){
            tok.length = cmp->pos - tok.begin;
            // empty tokens appear between consecutive blanks and are skipped
            if(tok.length > 0)
                List_Token_push(&cmp->tokens, tok);
            tok = Token_construct(TokenType_Unset, cmp->pos + 1, 0);
        }

        // first character of an argument defines its type
        else if(tok.type == TokenType_Unset){
            if(c == '\''){
                tok.type = TokenType_Char;
                quot = c;
            }
            else if(c == '"'){
                tok.type = TokenType_String;
                quot = c;
            }
            else if(c == '@')
                tok.type = TokenType_NamedDataPointer;
            else if(c == '#')
                tok.type = TokenType_NamedDataSize;
            else if(isDigit(c))
                tok.type = TokenType_Number;
            else tok.type = TokenType_Name;
        }

        cmp->column++;
        cmp->pos++;
    }

    // end of file
    if(quot != '\0'){
        // a quoted value must be closed even if it is the last thing in the file
        setError(Error_endOfFile);
        return;
    }
    tok.length = cmp->pos - tok.begin;
    if(tok.length > 0)
        List_Token_push(&cmp->tokens, tok);
}

// Reads an instruction name and, if a blank follows it, its arguments.
// Pushes a TokenType_Instruction token, the argument tokens produced by readArguments()
// and a closing TokenType_OperationEnd token of length 1 located at the final cmp->pos.
// The name ends at a blank, a line break, ';' or the end of file; any character
// rejected by isAlphabeticalLower(), isAlphabeticalUpper() and isDigit() sets an error.
// Expects cmp->pos to point at the first character of the name.
static void readInstruction(Compiler* cmp){
    Token tok = Token_construct(TokenType_Instruction, cmp->pos, 0);

    // the first character is accepted without a check, so the scan starts from the second one
    for(cmp->pos++, cmp->column++; cmp->pos < cmp->code.len; cmp->column++, cmp->pos++){
        char c = cmp->code.data[cmp->pos];

        // the name is terminated by a blank, a line break or ';'
        if(c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == ';'){
            tok.length = cmp->pos - tok.begin;
            List_Token_push(&cmp->tokens, tok);

            // only a blank can be followed by arguments
            if(c == ' ' || c == '\t')
                readArguments(cmp);

            // the terminator itself is left for Compiler_lex() to handle
            tok = Token_construct(TokenType_OperationEnd, cmp->pos, 1);
            List_Token_push(&cmp->tokens, tok);
            return;
        }

        if(!(isAlphabeticalLower(c) || isAlphabeticalUpper(c) || isDigit(c))){
            setError(Error_unexpectedCharacter(c));
            return;
        }
    }

    // end of file: the instruction has no arguments and no explicit terminator
    tok.length = cmp->pos - tok.begin;
    List_Token_push(&cmp->tokens, tok);
    tok = Token_construct(TokenType_OperationEnd, cmp->pos, 1);
    List_Token_push(&cmp->tokens, tok);
}

// Runs the lexing stage: walks through cmp->code starting from cmp->pos and
// dispatches to the read*() functions, which push tokens to cmp->tokens.
// Line lengths are collected in cmp->line_lengths along the way.
// The compiler must be in CompilerState_Initial (checked by returnErrorIf_auto).
// Returns true on success and false if a reader has set CompilerState_Error.
bool Compiler_lex(Compiler* cmp){
    returnErrorIf_auto(cmp->state != CompilerState_Initial);
    cmp->state = CompilerState_Lexing;
    cmp->column = 1;

    while(cmp->pos < cmp->code.len){
        char c = cmp->code.data[cmp->pos];
        switch(c){
            // skip blank characters
            case ' ': case '\t': case '\r': case '\n':
                break;
            // try read comment
            case '/':
                readComment(cmp);
                break;
            // try read label
            case '.':
                readLabel(cmp);
                break;
            default:
                // try read instruction
                if(isAlphabeticalLower(c) || isAlphabeticalUpper(c))
                    readInstruction(cmp);
                else returnError(Error_unexpectedCharacter(c));
                break;
        }

        if(cmp->state == CompilerState_Error)
            return false;

        // A reader may have consumed everything up to the end of file.
        // In this case there is no terminator character left to look at,
        // reading cmp->code.data[cmp->pos] would go out of bounds.
        if(cmp->pos >= cmp->code.len)
            break;

        // readers stop ON their terminator character, so it is handled here
        c = cmp->code.data[cmp->pos];
        if(c == '\n')
            completeLine(cmp);
        cmp->column++;
        cmp->pos++;
    }

    // the last line may have no trailing line break
    completeLine(cmp);
    return true;
}