diff --git a/lexer.c b/lexer.c index 0e34d89..eb53f75 100644 --- a/lexer.c +++ b/lexer.c @@ -1,26 +1,13 @@ #include -#include + #include #include #include #include #include -#include #include -#define NB_IMPLEMENTATION -#include "nb.h" -int str_to_int(char *strint){ - int new_int = atoi(strint); - return new_int; -} -float str_to_float(char *strif){ - char *fptr; - float new_int = strtof(strif, &fptr); - return new_int; -} - -typedef enum{ +typedef enum { TOKEN_PLUS, TOKEN_MINUS, TOKEN_INTEGER, @@ -37,7 +24,7 @@ typedef enum{ TOKEN_COMMA } symbols; -typedef enum{ +typedef enum { BHV_STACK, BHV_UNDEFINED, BHV_NUMBER, @@ -45,491 +32,178 @@ typedef enum{ BHV_FLOAT, } symbol_bhv; -typedef struct{ - symbols type; - char* text; - size_t text_len; - symbol_bhv behaviour; - uint cursor_skip; - symbols previous_token; + +typedef struct { + symbols *type; + char **text; + size_t *text_len; + symbol_bhv *behaviour; + unsigned int *cursor_skip; + symbols *previous_token; + size_t capacity; + size_t size; } Token; -typedef struct{ - Token* unit; - size_t size; - size_t capacity; -} TokenArr; -typedef struct{ - char *content; - // size_t cursor; - // size_t line; -} Lexer; +void token_init(Token *tok, size_t capacity) { + tok->capacity = capacity; + tok->size = 0; -typedef enum{ - AST_NUMBER, - AST_BINARY_OP, -} ASTNodeType; + tok->type = malloc(sizeof(symbols) * capacity); + tok->text = malloc(sizeof(char *) * capacity); + tok->text_len = malloc(sizeof(size_t) * capacity); + tok->behaviour = malloc(sizeof(symbol_bhv) * capacity); + tok->cursor_skip = malloc(sizeof(unsigned int) * capacity); + tok->previous_token = malloc(sizeof(symbols) * capacity); -typedef struct ASTNode ASTNode; - -struct ASTNode { - ASTNodeType type; - union { - struct { double value; } number; - struct { - char op; - ASTNode* left; - ASTNode* right; - } binary; - struct { - char *name; - ASTNode** args; - size_t arg_count; - } func_call; - } data; -}; - -typedef struct{ - Token* tokens; - size_t cursor; -} parser; - -// Lexer -void lexer_new(char *content, size_t content_len){ - (void) content; - (void) content_len; -} -// Token -void lexer_next(Lexer *mylexer){ - (void) mylexer; + assert(tok->type && tok->text && tok->text_len && + tok->behaviour && tok->cursor_skip && tok->previous_token); } -Token parser_peek(parser* p){ - return p->tokens[p->cursor]; +void token_grow(Token *tok) { + size_t new_capacity = (tok->capacity == 0 ? 8 : tok->capacity * 2); + + tok->type = realloc(tok->type, new_capacity * sizeof(symbols)); + tok->text = realloc(tok->text, new_capacity * sizeof(char *)); + tok->text_len = realloc(tok->text_len, new_capacity * sizeof(size_t)); + tok->behaviour = realloc(tok->behaviour, new_capacity * sizeof(symbol_bhv)); + tok->cursor_skip = realloc(tok->cursor_skip, new_capacity * sizeof(unsigned int)); + tok->previous_token = realloc(tok->previous_token, new_capacity * sizeof(symbols)); + + assert(tok->type && tok->text && tok->text_len && + tok->behaviour && tok->cursor_skip && tok->previous_token); + + tok->capacity = new_capacity; } -Token parser_advance(parser* p){ - return p->tokens[p->cursor++]; +void token_push(Token *tok, symbols type, const char *text, + symbol_bhv behaviour, size_t cursor_skip) { + if (tok->size >= tok->capacity) { + token_grow(tok); + } + + size_t i = tok->size; + + tok->type[i] = type; + tok->text[i] = strdup(text); + tok->text_len[i] = strlen(text); + tok->behaviour[i] = behaviour; + tok->cursor_skip[i] = cursor_skip; + + if (i > 0) + tok->previous_token[i] = tok->type[i - 1]; + else + tok->previous_token[i] = TOKEN_UNKNOWN; + + tok->size++; } -bool parser_match(parser* p, symbols tokent){ - if (parser_peek(p).type == tokent){ - parser_advance(p); - return true; +void token_free(Token *tok) { + for (size_t i = 0; i < tok->size; i++) { + free(tok->text[i]); + } + free(tok->type); + free(tok->text); + free(tok->text_len); + free(tok->behaviour); + free(tok->cursor_skip); + free(tok->previous_token); +} + + +int str_to_int(char *strint) { return atoi(strint); } +float str_to_float(char *strif) { return strtof(strif, NULL); } + +char *token_type_to_string(symbols type) { + switch (type) { + case TOKEN_PLUS: return "TOKEN_PLUS"; + case TOKEN_MINUS: return "TOKEN_MINUS"; + case TOKEN_INTEGER: return "TOKEN_INTEGER"; + case TOKEN_FLOAT: return "TOKEN_FLOAT"; + case TOKEN_SPACE: return "TOKEN_SPACE"; + case TOKEN_STRING: return "TOKEN_STRING"; + case TOKEN_MUL: return "TOKEN_MUL"; + case TOKEN_DIV: return "TOKEN_DIV"; + case TOKEN_LPAREN: return "TOKEN_LPAREN"; + case TOKEN_RPAREN: return "TOKEN_RPAREN"; + case TOKEN_COMMA: return "TOKEN_COMMA"; + case TOKEN_EOF: return "TOKEN_EOF"; + case TOKEN_NEWLINE: return "TOKEN_NEWLINE"; + case TOKEN_UNKNOWN: return "TOKEN_UNKNOWN"; + default: return "UNKNOWN_SYMBOL"; + } +} + + +size_t read_from_tok(Token *tok, const char *input, size_t cursor) { + char buf[64]; + size_t start = cursor; + size_t i = 0; + + if (isdigit(input[cursor])) { + int dots_seen = 0; + while (isdigit(input[cursor]) || input[cursor] == '.') { + if (input[cursor] == '.') dots_seen++; + buf[i++] = input[cursor++]; + } + buf[i] = '\0'; + if (dots_seen == 0) { + token_push(tok, TOKEN_INTEGER, buf, BHV_NUMBER, cursor - start); + } else { + token_push(tok, TOKEN_FLOAT, buf, BHV_FLOAT, cursor - start); + } + } else if (isalpha(input[cursor])) { + while (isalpha(input[cursor])) { + buf[i++] = input[cursor++]; + } + buf[i] = '\0'; + token_push(tok, TOKEN_STRING, buf, BHV_STRING, cursor - start); } else { - return false; - } -} - -ASTNode* ast_new_number(double val){ - ASTNode* node = malloc(sizeof(ASTNode)); - node->type = AST_NUMBER; - node->data.number.value = val; - return node; -} - -ASTNode* ast_new_binary(char op, ASTNode* l, ASTNode* r){ - ASTNode* node = malloc(sizeof(ASTNode)); - node->type = AST_BINARY_OP; - node->data.binary.op = op; - node->data.binary.left = l; - node->data.binary.right = r; - // maybe need to fix - return node; -} - -ASTNode* parse_factor(parser* p) { - Token tok = parser_peek(p); - if (tok.type == TOKEN_EOF){ - fprintf(stderr, "Unexpected end of input in factor\n"); - exit(EXIT_FAILURE); - } - if (tok.type == TOKEN_INTEGER || tok.type == TOKEN_FLOAT){ - parser_advance(p); - double v = atof(tok.text); - return ast_new_number(v); - } - // if (tok.type == TOKEN_STRING){ - // parser_advance(p); - // char* func_name = tok.text; - // if (parser_match(p, TOKEN_LPAREN)){ - // size_t argc_count = 0; - // - // } - // } - fprintf(stderr, "Unexpected token '%s' in factor\n", tok.text); - exit(EXIT_FAILURE); -} - -ASTNode* parse_term(parser* p) { - ASTNode* node = parse_factor(p); - while (true) { - Token tok = parser_peek(p); - if (tok.type == TOKEN_MUL || tok.type == TOKEN_DIV) { - parser_advance(p); - ASTNode* right = parse_factor(p); - node = ast_new_binary(tok.text[0], node, right); - } else { - break; - } + buf[0] = input[cursor]; + buf[1] = '\0'; + switch (input[cursor]) { + case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break; + case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break; + case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break; + case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break; + case ' ': token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1); break; + case '\n': token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1); break; + case '(': token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); break; + case ')': token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); break; + case ',': token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); break; + default: token_push(tok, TOKEN_UNKNOWN, buf, BHV_UNDEFINED, 1); break; } - return node; -} - -ASTNode* parse_expression(parser* p) { - ASTNode* node = parse_term(p); - while (true) { - Token tok = parser_peek(p); - if (tok.type == TOKEN_PLUS || tok.type == TOKEN_MINUS) { - parser_advance(p); - ASTNode* right = parse_term(p); - node = ast_new_binary(tok.text[0], node, right); - } else { - break; - } - } - return node; -} - -double eval_ast(ASTNode* node) { - if (node->type == AST_NUMBER) { - return node->data.number.value; - } - double L = eval_ast(node->data.binary.left); - double R = eval_ast(node->data.binary.right); - switch (node->data.binary.op) { - case '+': return L + R; - case '-': return L - R; - case '*': return L * R; - case '/': return L / R; - default: - fprintf(stderr, "Unknown op '%c'\n", node->data.binary.op); - exit(EXIT_FAILURE); - } -} - -Token read_from_tok(char* text, uint cursor){ - Token mytoks; - - static char buf[64]; - size_t i = 0; - mytoks.cursor_skip = 1; - - if (isdigit(text[cursor])) { - size_t start = cursor; - int dots_seen = 0; - while ( isdigit(text[cursor]) || text[cursor] == '.') { - if (text[cursor] == '.') { - dots_seen +=1; - assert(dots_seen < 2); - } - buf[i++] = text[cursor++]; - } - - buf[i] = '\0'; - - if (!dots_seen){ - mytoks.type = TOKEN_INTEGER; - mytoks.behaviour = BHV_NUMBER; - } else { - mytoks.type = TOKEN_FLOAT; - mytoks.behaviour = BHV_FLOAT; - } - - - mytoks.cursor_skip = cursor - start; - mytoks.text = strdup(buf); - mytoks.text_len = i; - } - else if (isalpha(text[cursor])){ - size_t start = cursor; - while (isalpha(text[cursor])) { - buf[i++] = text[cursor++]; - } - buf[i] = '\0'; - mytoks.type = TOKEN_STRING; - mytoks.behaviour = BHV_STRING; - mytoks.cursor_skip = cursor - start; - mytoks.text = strdup(buf); - mytoks.text_len = i; - } - - else { - buf[0] = text[cursor]; - buf[1] = '\0'; - - switch (text[cursor]) - { - case '+': - mytoks.type = TOKEN_PLUS; - mytoks.text = strdup("+"); - mytoks.behaviour = BHV_STACK; - break; - case '-': - mytoks.type = TOKEN_MINUS; - mytoks.text = strdup("-"); - mytoks.behaviour = BHV_STACK; - break; - case ' ': - mytoks.type = TOKEN_SPACE; - mytoks.text = strdup("space"); - break; - case '*': - mytoks.type = TOKEN_MUL; - mytoks.text = strdup("*"); - mytoks.behaviour = BHV_STACK; - break; - case '/': - mytoks.type = TOKEN_DIV; - mytoks.text = strdup("/"); - mytoks.behaviour = BHV_STACK; - break; - case '\n': - mytoks.type = TOKEN_NEWLINE; - mytoks.text = strdup("newline"); - mytoks.cursor_skip = 1; - break; - case '(': - mytoks.type = TOKEN_LPAREN; - mytoks.text = strdup("("); - mytoks.behaviour = BHV_STACK; - break; - case ')': - mytoks.type = TOKEN_RPAREN; - mytoks.text = strdup(")"); - mytoks.behaviour = BHV_STACK; - break; - case ',': - mytoks.type = TOKEN_COMMA; - mytoks.text = strdup(","); - mytoks.behaviour = BHV_STACK; - break; - default: - mytoks.type = TOKEN_UNKNOWN; - mytoks.behaviour = BHV_UNDEFINED; - mytoks.text = strdup(buf); - - - } - } - return mytoks; -} - - -void tokenarr_push(TokenArr* arr, Token tok) { - if (arr->size >= arr->capacity) { - arr->capacity = arr->capacity ? arr->capacity * 2 : 8; - arr->unit = realloc(arr->unit, arr->capacity * sizeof(Token)); - assert(arr->unit != NULL); - } - arr->unit[arr->size++] = tok; -} - -TokenArr tokenize_all(const char* input) { - TokenArr arr = {NULL, 0, 0}; - size_t i = 0; - size_t len = strlen(input); - while (i < len) { - Token tok = read_from_tok((char*)input, i); - i += tok.cursor_skip; - if (tok.type == TOKEN_SPACE || tok.type == TOKEN_NEWLINE) { - free(tok.text); - continue; - } - tokenarr_push(&arr, tok); - } - Token eof = {0}; - eof.type = TOKEN_EOF; - eof.text = strdup("EOF"); - eof.text_len = 3; - eof.behaviour = BHV_UNDEFINED; - eof.cursor_skip = 0; - tokenarr_push(&arr, eof); - return arr; -} - - - -// Token* c - -void token_parser(Token mytok, char* input){ - int length1 = strlen(input); - int i=0; - - - - while (i < length1) { - mytok = read_from_tok(input, i); - - printf("Text: %s\n", mytok.text); - printf("Behaviour: %d\n", mytok.behaviour); - if (mytok.behaviour == BHV_STACK){ - printf("this is stack lil bro\n"); - } - i++; - } -} - - -// operators accepted in int/digit or whatever type def only when they have a digit before AND after them - - -/* -int main(){ - Token newtok; - char* input = "8"; - - parser(newtok, input); -} -*/ - -char* token_type_to_string(symbols type) { - switch (type) { - case TOKEN_PLUS: return "TOKEN_PLUS"; - case TOKEN_MINUS: return "TOKEN_MINUS"; - case TOKEN_INTEGER: return "TOKEN_INTEGER"; - case TOKEN_FLOAT: return "TOKEN_FLOAT"; - case TOKEN_SPACE: return "TOKEN_SPACE"; - case TOKEN_STRING: return "TOKEN_STRING"; - case TOKEN_UNKNOWN: return "TOKEN_UNKNOWN"; - default: return "UNKNOWN_SYMBOL"; - } -} - -// void main2() { -// char* input = "323.23 + Hello world 102102"; -// int length1 = strlen(input); -// int i = 0; -// printf("input: %s\n\n", input); -// while (i < length1) { -// Token result = read_from_tok(input, i); -// printf("text: %s\ntype: %u (%s)\n\n", result.text, result.type, token_type_to_string(result.type)); -// i += result.cursor_skip; -// } -// } - - - -void mathparser(const char* input) { - TokenArr stack = tokenize_all(input); - float result = 0; - float current = 0; - float sign = 1; - float op = 0; - - for (size_t i = 0; i < stack.size; ++i) { - switch (stack.unit[i].type) { - case TOKEN_INTEGER: - { - float value = str_to_float(stack.unit[i].text); - if (op == 1) { - current *= value; - op = 0; - } else if (op == 2) { - current /= value; - op = 0; - } else { - current = value; - } - break; - } - - case TOKEN_FLOAT: - { - float value = str_to_float(stack.unit[i].text); - if (op == 1) { - current *= value; - op = 0; - } else if (op == 2) { - current /= value; - op = 0; - } else { - current = value; - } - break; - } - case TOKEN_PLUS: - result += sign * current; - sign = 1; - op = 0; - break; - case TOKEN_MINUS: - result += sign * current; - sign = -1; - op = 0; - break; - case TOKEN_MUL: - op = 1; - break; - case TOKEN_DIV: - op = 2; - break; - default: - break; - } - } - result += sign * current; - printf("%f\n", result); - for (size_t j = 0; j < stack.size; ++j) { - free(stack.unit[j].text); - } - free(stack.unit); -} - - -// int main4() { -// char* input = "print(5) hello"; -// printf("input: %s\n\n", input); -// -// TokenArr arr = tokenize_all(input); -// -// for (size_t j = 0; j < arr.size; ++j) { -// Token* result = &arr.unit[j]; -// printf("text: %s\ntype: %u (%s)\n\n", result->text, result->type, token_type_to_string(result->type)); -// } -// -// printf("================ Tokenized =================\n"); -// -// for (size_t j = 0; j < arr.size; ++j) { -// Token* result = &arr.unit[j]; -// printf("text: %s, type: %u (%s) || ", result->text, result->type, token_type_to_string(result->type)); -// } -// printf("\n"); -// for (size_t j = 0; j < arr.size; ++j) { -// free(arr.unit[j].text); -// } -// free(arr.unit); -// return 0; -// } - - - - - -// int main5(){ -// char* input = "40/2.3 * 10 + 400"; -// printf("input: %s\n", input); -// mathparser(input); -// return 0; -// } - - -int main(int argc, char** argv) { - if (argc > 1){ - char* input = nb_read_file(argv[1]); - // printf("Input: %s\n", input); - - TokenArr toks = tokenize_all(input); - - parser p = { toks.unit, 0 }; - ASTNode* root = parse_expression(&p); - - double result = eval_ast(root); - printf("%f\n", result); - } else { - printf("Usage: %s \n", argv[0]); + cursor++; } - return 0; + return cursor - start; +} + +Token tokenize_all(const char *input) { + Token tok; + token_init(&tok, 8); + + size_t i = 0; + size_t length = strlen(input); + + while (i < length) { + i += read_from_tok(&tok, input, i); + } + + token_push(&tok, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0); + return tok; +} + + +int main() { + char *expr = "1 + 2 * 3"; + + Token tokens = tokenize_all(expr); + + for (size_t i = 0; i < tokens.size; i++) { + printf("[%s] \"%s\"\n", token_type_to_string(tokens.type[i]), tokens.text[i]); + } + + token_free(&tokens); + return 0; } diff --git a/nb.h b/nb.h index f450a5e..0147c0a 100644 --- a/nb.h +++ b/nb.h @@ -86,7 +86,7 @@ void nb_init(nb_arr *newarr, int initial_capacity){ void nb_append(nb_arr *newarr, char *newval){ if (newarr->value == NULL){ newarr->capacity =16; - if (newarr->capacity > 16 | newarr->arrsize > newarr->capacity) { + if ((newarr->capacity > 16) | (newarr->arrsize > newarr->capacity)) { newarr->capacity *=2; } newarr->value = (char**)realloc(newarr->value, sizeof(char*) * newarr->capacity); @@ -178,9 +178,9 @@ void nb_com(nb_arr *newarr){ } -void append_c_file(FILE *filepointer){ - -} +// void append_c_file(FILE *filepointer){ +// filepointer = NULL; +// } void nb_copy_file(char* old_file_name, char* new_file_name){ // old name shouldnt be nobuild.c. it should be the name of the current file. nb_file old_file;