From 595fdbe653a917470382c251c48cb73a822ffe09 Mon Sep 17 00:00:00 2001 From: shabani005 Date: Wed, 5 Nov 2025 23:14:59 +0300 Subject: [PATCH] basic VM working --- builder.c | 6 +- lexer.h | 119 +++++++++++++++++++---------- parser.h | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ parser3.c | 194 +++++++++++++++++++++++++++++++++++++++++++++-- vm.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 690 insertions(+), 52 deletions(-) create mode 100644 parser.h create mode 100644 vm.c diff --git a/builder.c b/builder.c index 9b7be5d..49fb867 100644 --- a/builder.c +++ b/builder.c @@ -8,12 +8,12 @@ int main(int argc, char **argv){ nb_append(&cmd, "gcc"); nb_append(&cmd, "-Wall -Wextra"); - nb_append(&cmd, "lexer.c"); - nb_append(&cmd, "-o lex"); + nb_append(&cmd, "vm.c"); + nb_append(&cmd, "-o vm"); nb_cmd(&cmd); - nb_append(&cmd, "./lex"); + nb_append(&cmd, "./vm"); for (int i=1; i= sizeof(buf) - 1) break; } buf[i] = '\0'; - if (dots_seen == 0) { - token_push(tok, TOKEN_INTEGER, buf, BHV_NUMBER, cursor - start); - } else { - token_push(tok, TOKEN_FLOAT, buf, BHV_FLOAT, cursor - start); - } - } else if (input[cursor] == '"'){ - cursor++; - while(input[cursor] != '"' && input[cursor] != '\0'){ - buf[i++] = input[cursor++]; - if (i >= sizeof(buf) - 1) break; - } - buf[i] = '\0'; - if (input[cursor] == '"') cursor ++; - token_push(tok, TOKEN_STRING, buf, BHV_STRING, cursor - start); - } else if (isalpha(input[cursor])) { // should be after checking for strlit - while (isalpha(input[cursor])) { - buf[i++] = input[cursor++]; - } - buf[i] = '\0'; - token_push(tok, TOKEN_IDENTIFIER, buf, BHV_IDENT, cursor - start); - //refactor into separate function to use in parsing functions and definitions - } else { - buf[0] = input[cursor]; - buf[1] = '\0'; - switch (input[cursor]) { - case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break; - case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break; - case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break; - case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break; - case ' ': token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1); break; - case '\n': token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1); break; - case '(': token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); break; - case ')': token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); break; - case ',': token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); break; - default: token_push(tok, TOKEN_UNKNOWN, buf, BHV_UNDEFINED, 1); break; - } - cursor++; + token_push(tok, dots_seen == 0 ? TOKEN_INTEGER : TOKEN_FLOAT, + buf, dots_seen == 0 ? BHV_NUMBER : BHV_FLOAT, + cursor - start); + return cursor - start; // all digits handled } + else if (input[cursor] == '"') { + cursor++; // skip opening quote + while (input[cursor] != '"' && input[cursor] != '\0') { + buf[i++] = input[cursor++]; + if (i >= sizeof(buf) - 1) break; + } + buf[i] = '\0'; + if (input[cursor] == '"') cursor++; // skip closing quote + token_push(tok, TOKEN_STRING, buf, BHV_STRING, cursor - start); + return cursor - start; + } + + else if (isalpha((unsigned char)input[cursor])) { + while (isalpha((unsigned char)input[cursor])) { + buf[i++] = input[cursor++]; + if (i >= sizeof(buf) - 1) break; + } + buf[i] = '\0'; + token_push(tok, TOKEN_IDENTIFIER, buf, BHV_IDENT, cursor - start); + return cursor - start; + } + + // Single-character tokens and symbols + switch (input[cursor]) { + case '+': token_push(tok, TOKEN_PLUS, "+", BHV_STACK, 1); break; + case '-': token_push(tok, TOKEN_MINUS, "-", BHV_STACK, 1); break; + case '*': token_push(tok, TOKEN_MUL, "*", BHV_STACK, 1); break; + case '/': token_push(tok, TOKEN_DIV, "/", BHV_STACK, 1); break; + case '{': token_push(tok, TOKEN_LCURLY, "{", BHV_STACK, 1); break; + case '}': token_push(tok, TOKEN_RCURLY, "}", BHV_STACK, 1); break; + case ';': token_push(tok, TOKEN_SEMI, ";", BHV_STACK, 1); break; + case ':': token_push(tok, TOKEN_COLON, ":", BHV_STACK, 1); break; + + case '(': + token_push(tok, TOKEN_LPAREN, "(", BHV_STACK, 1); + break; + case ')': + token_push(tok, TOKEN_RPAREN, ")", BHV_STACK, 1); + break; + case ',': + token_push(tok, TOKEN_COMMA, ",", BHV_STACK, 1); + break; + case ' ': + // you can skip space tokens if you don't need them + token_push(tok, TOKEN_SPACE, " ", BHV_UNDEFINED, 1); + break; + case '\n': + token_push(tok, TOKEN_NEWLINE, "\\n", BHV_UNDEFINED, 1); + break; + case '\0': + return 0; // end of input + default: { + buf[0] = input[cursor]; + buf[1] = '\0'; + token_push(tok, TOKEN_UNKNOWN, buf, BHV_UNDEFINED, 1); + break; + } + } + + cursor++; // move forward exactly one char for symbol cases return cursor - start; } + Token tokenize_all(const char *input) { Token tok; token_init(&tok, 8); diff --git a/parser.h b/parser.h new file mode 100644 index 0000000..6d8ceef --- /dev/null +++ b/parser.h @@ -0,0 +1,219 @@ +#include "./lexer.h" +#define NB_IMPLEMENTATION +#include "./nb.h" + +int get_prec(symbols op){ + switch (op) { + case TOKEN_MUL: + case TOKEN_DIV: + return 2; break; + case TOKEN_PLUS: + case TOKEN_MINUS: + return 1; break; + default: return 0; + } +} +// parse + +bool is_left_asc(symbols op){ + switch (op) { + case TOKEN_MUL: + case TOKEN_DIV: + case TOKEN_PLUS: + case TOKEN_MINUS: + return true; break; + default: return false; + } +} + +Token *global_tok = NULL; + +typedef enum { + SYM_VAR, + SYM_FUNC, +} SymbolKind; + +typedef struct { + const char* name; + size_t ret_count; + size_t arg_count; + symbols arg_types[16]; + symbols ret_type; + SymbolKind symbol_kind; + bool builtin; +} Symbol; + + +static Symbol builtins[] = { + { "print", 1, 1, { TOKEN_UNKNOWN }, TOKEN_EOF, SYM_FUNC, true }, +}; + + +typedef struct { + Symbol *symbols; + size_t size; + size_t capacity; +} SymbolTable; + + +static int builtin_num = sizeof(builtins)/sizeof(builtins[0]); + +static SymbolTable global_env = { + .size = sizeof(builtins)/sizeof(builtins[0]), + .capacity = sizeof(builtins)/sizeof(builtins[0]), + .symbols = builtins}; + + +Symbol *symbol_lookup(SymbolTable *table, const char *n){ + for (size_t i=0; isize; ++i){ + if(strcmp(n, table->symbols[i].name) == 0){ + return &table->symbols[i]; + } + } + return NULL; +} + +// fn add(x: int, y: int) int { +// return x+y; +// } + + +void symbol_table_init(SymbolTable *table, size_t initial_capacity) { + table->symbols = malloc(sizeof(Symbol) * initial_capacity); + if (!table->symbols) { + fprintf(stderr, "symbol_table_init: malloc failed\n"); + exit(1); + } + table->size = 0; + table->capacity = initial_capacity; +} + +void symbol_table_add(SymbolTable *table, Symbol sym) { + if (table->size >= table->capacity) { + table->capacity = (table->capacity == 0) ? 8 : table->capacity * 2; + table->symbols = realloc(table->symbols, sizeof(Symbol) * table->capacity); + if (!table->symbols) { + fprintf(stderr, "symbol_table_add: realloc failed\n"); + exit(1); + } + } + table->symbols[table->size++] = sym; +} + + +void symbol_table_free(SymbolTable *table) { + free(table->symbols); + table->symbols = NULL; + table->size = 0; + table->capacity = 0; +} + + +Token build_rpn(Token *inp, SymbolTable *symtab) { + Token output; + Token stack; + + token_init(&output, 16); + token_init(&stack, 16); + + for (size_t i = 0; i < inp->size; ++i) { + symbols type = inp->type[i]; + const char *text = inp->text[i]; + + if (type == TOKEN_IDENTIFIER && i + 1 < inp->size && inp->type[i + 1] == TOKEN_LPAREN) { + Symbol *found = symbol_lookup(symtab, text); + if (!found) { + Symbol sym = { + .name = strdup(text), + .arg_count = 0, + .ret_type = TOKEN_EOF, + .symbol_kind = SYM_FUNC, + .builtin = false + }; + symbol_table_add(symtab, sym); + } + token_push(&stack, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_IDENTIFIER) { + Symbol *found = symbol_lookup(symtab, text); + if (!found) { + Symbol sym = { + .name = strdup(text), + .arg_count = 0, + .ret_type = TOKEN_UNKNOWN, + .symbol_kind = SYM_VAR, + .builtin = false + }; + symbol_table_add(symtab, sym); + } + token_push(&output, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_LPAREN) { + token_push(&stack, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_RPAREN) { + while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_LPAREN) + stack.size--; + if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_IDENTIFIER) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + } else if (type == TOKEN_INTEGER || type == TOKEN_FLOAT || type == TOKEN_STRING) { + token_push(&output, type, text, inp->behaviour[i], 0); + } else if (is_left_asc(type)) { + while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN && + (get_prec(stack.type[stack.size - 1]) > get_prec(type) || + get_prec(stack.type[stack.size - 1]) == get_prec(type)) && + is_left_asc(type)) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + token_push(&stack, type, text, inp->behaviour[i], 0); + } + } + + while (stack.size > 0) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + + token_push(&output, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0); + return output; +} + +void print_token(Token *tk){ + for (size_t i=0; isize; ++i){ + printf("TokenNum: %zu Type: %s Value: %s\n", i, tk->tktype[i], tk->text[i]); + } +} + + + + +// int main(int argc, char **argv){ +// if (argc < 2) return -1; +// const char ts[] = "\"hello\" hi + 2 2.312"; +// const char math[] = "print(((1+2)*6)/18)"; // = 1 +// const char print[] = "print(\"hello\")"; +// const char simple[] = "1 + ( 3 + 3 )/4+4*3"; + + +// char* read = nb_read_file(argv[1]); +// Token tk = tokenize_all(read); +// printf("INPUT: %s\n", read); +// SymbolTable table = {0}; +// symbol_table_init(&table, 32); + + +// Token rpn = build_rpn(&tk, &table); +// print_token(&rpn); +// } diff --git a/parser3.c b/parser3.c index fd768e8..d266465 100644 --- a/parser3.c +++ b/parser3.c @@ -1,4 +1,6 @@ #include "./lexer.h" +#define NB_IMPLEMENTATION +#include "./nb.h" int get_prec(symbols op){ switch (op) { @@ -24,18 +26,194 @@ bool is_left_asc(symbols op){ } } -void build_rpn(); +Token *global_tok = NULL; + +typedef enum { + SYM_VAR, + SYM_FUNC, +} SymbolKind; + +typedef struct { + const char* name; + size_t ret_count; + size_t arg_count; + symbols arg_types[16]; + symbols ret_type; + SymbolKind symbol_kind; + bool builtin; +} Symbol; +static Symbol builtins[] = { + { "print", 1, 1, { TOKEN_UNKNOWN }, TOKEN_EOF, SYM_FUNC, true }, +}; +typedef struct { + Symbol *symbols; + size_t size; + size_t capacity; +} SymbolTable; -int main(void){ - const char ts[] = "\"hello\" hi + 2"; - const char math[] = "((1+2)*6)/18"; // = 1 - Token tk = tokenize_all(math); - for (size_t i=0; isize; ++i){ + if(strcmp(n, table->symbols[i].name) == 0){ + return &table->symbols[i]; + } } - // printf("token count: %zu\n", tk.size); + return NULL; +} + +// fn add(x: int, y: int) int { +// return x+y; +// } + + +void symbol_table_init(SymbolTable *table, size_t initial_capacity) { + table->symbols = malloc(sizeof(Symbol) * initial_capacity); + if (!table->symbols) { + fprintf(stderr, "symbol_table_init: malloc failed\n"); + exit(1); + } + table->size = 0; + table->capacity = initial_capacity; +} + +void symbol_table_add(SymbolTable *table, Symbol sym) { + if (table->size >= table->capacity) { + table->capacity = (table->capacity == 0) ? 8 : table->capacity * 2; + table->symbols = realloc(table->symbols, sizeof(Symbol) * table->capacity); + if (!table->symbols) { + fprintf(stderr, "symbol_table_add: realloc failed\n"); + exit(1); + } + } + table->symbols[table->size++] = sym; +} + + +void symbol_table_free(SymbolTable *table) { + free(table->symbols); + table->symbols = NULL; + table->size = 0; + table->capacity = 0; +} + + +Token build_rpn(Token *inp, SymbolTable *symtab) { + Token output; + Token stack; + + token_init(&output, 16); + token_init(&stack, 16); + + for (size_t i = 0; i < inp->size; ++i) { + symbols type = inp->type[i]; + const char *text = inp->text[i]; + + if (type == TOKEN_IDENTIFIER && i + 1 < inp->size && inp->type[i + 1] == TOKEN_LPAREN) { + Symbol *found = symbol_lookup(symtab, text); + if (!found) { + Symbol sym = { + .name = strdup(text), + .arg_count = 0, + .ret_type = TOKEN_EOF, + .symbol_kind = SYM_FUNC, + .builtin = false + }; + symbol_table_add(symtab, sym); + } + token_push(&stack, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_IDENTIFIER) { + Symbol *found = symbol_lookup(symtab, text); + if (!found) { + Symbol sym = { + .name = strdup(text), + .arg_count = 0, + .ret_type = TOKEN_UNKNOWN, + .symbol_kind = SYM_VAR, + .builtin = false + }; + symbol_table_add(symtab, sym); + } + token_push(&output, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_LPAREN) { + token_push(&stack, type, text, inp->behaviour[i], 0); + } else if (type == TOKEN_RPAREN) { + while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_LPAREN) + stack.size--; + if (stack.size > 0 && stack.type[stack.size - 1] == TOKEN_IDENTIFIER) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + } else if (type == TOKEN_INTEGER || type == TOKEN_FLOAT || type == TOKEN_STRING) { + token_push(&output, type, text, inp->behaviour[i], 0); + } else if (is_left_asc(type)) { + while (stack.size > 0 && stack.type[stack.size - 1] != TOKEN_LPAREN && + (get_prec(stack.type[stack.size - 1]) > get_prec(type) || + get_prec(stack.type[stack.size - 1]) == get_prec(type)) && + is_left_asc(type)) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + token_push(&stack, type, text, inp->behaviour[i], 0); + } + } + + while (stack.size > 0) { + token_push(&output, stack.type[stack.size - 1], + stack.text[stack.size - 1], + stack.behaviour[stack.size - 1], 0); + stack.size--; + } + + token_push(&output, TOKEN_EOF, "EOF", BHV_UNDEFINED, 0); + return output; +} + +void print_token(Token *tk){ + for (size_t i=0; isize; ++i){ + printf("TokenNum: %zu Type: %s Value: %s\n", i, tk->tktype[i], tk->text[i]); + } +} + + + + +int main(int argc, char **argv){ + if (argc < 2) return -1; + const char ts[] = "\"hello\" hi + 2 2.312"; + const char math[] = "print(((1+2)*6)/18)"; // = 1 + const char print[] = "print(\"hello\")"; + const char simple[] = "1 + ( 3 + 3 )/4+4*3"; + + + char* read = nb_read_file(argv[1]); + Token tk = tokenize_all(read); + printf("INPUT: %s\n", read); + SymbolTable table = {0}; + symbol_table_init(&table, 32); + + + Token rpn = build_rpn(&tk, &table); + print_token(&rpn); } diff --git a/vm.c b/vm.c new file mode 100644 index 0000000..30a3a2c --- /dev/null +++ b/vm.c @@ -0,0 +1,204 @@ +#include "parser.h" +#include + +typedef enum { + OP_PUSH_INT, + OP_PUSH_FLOAT, + OP_PUSH_STRING, + OP_ADD, + OP_SUB, + OP_MUL, + OP_DIV, + OP_PRINT, + OP_HALT +} OPcode; + +typedef struct { + OPcode op; + double num; + char *strlit; +} instruct; + +typedef enum { + VAL_INT, + VAL_FLOAT, + VAL_STRING, +} ValueType; + +typedef struct { + ValueType type; + union { + long i; + double f; + char *s; + }; +} Value; + +typedef struct { + instruct *program; + size_t inst_p; + size_t program_size; + Value stack[256]; + size_t st_p; + bool running; +} VM; + +instruct *rpn_to_bytecode(Token *rpn, size_t *out){ + size_t cap = 64; + size_t size = 0; + + instruct *prog = malloc(sizeof(instruct) * cap); + + for (size_t i=0; isize; ++i){ + symbols t = rpn->type[i]; + const char *text = rpn->text[i]; + + instruct ins = {0}; + + switch (t){ + case TOKEN_INTEGER: ins.op = OP_PUSH_INT; ins.num = atof(text); break; + case TOKEN_FLOAT: ins.op = OP_PUSH_FLOAT; ins.num = atof(text); break; + case TOKEN_STRING: ins.op = OP_PUSH_STRING; ins.strlit = strdup(text); break; + case TOKEN_PLUS: ins.op = OP_ADD; break; + case TOKEN_MINUS: ins.op = OP_SUB; break; + case TOKEN_MUL: ins.op = OP_MUL; break; + case TOKEN_DIV: ins.op = OP_DIV; break; + + case TOKEN_IDENTIFIER: + if (strcmp(text, "print") == 0) { + ins.op = OP_PRINT; + } else { + printf("[WARNING] Uknown Identifier '%s'\n", text); + } + break; //TODO: unhardcode this + case TOKEN_EOF: ins.op = OP_HALT; break; + default: continue; + } + if (size >= cap){ + cap*=2; + prog = realloc(prog, sizeof(instruct)*cap); + } + prog[size++] = ins; + } + *out = size; + return prog; +} + +void vm_run(VM *vm) { + vm->running = true; + vm->inst_p = 0; + vm->st_p = 0; + + while (vm->running && vm->inst_p < vm->program_size) { + instruct ins = vm->program[vm->inst_p++]; + + switch (ins.op) { + case OP_PUSH_INT: { + Value v = { .type = VAL_INT, .i = ins.num }; + vm->stack[vm->st_p++] = v; + } break; + + case OP_PUSH_FLOAT: { + Value v = { .type = VAL_FLOAT, .f = ins.num }; + vm->stack[vm->st_p++] = v; + } break; + + case OP_PUSH_STRING: { + Value v = { .type = VAL_STRING, .s = strdup(ins.strlit) }; + vm->stack[vm->st_p++] = v; + } break; + + case OP_ADD: + case OP_SUB: + case OP_MUL: + case OP_DIV: { + if (vm->st_p < 2) { + fprintf(stderr, "not enough values on stack.\n"); + vm->running = false; + break; + } + + Value b = vm->stack[--vm->st_p]; + Value a = vm->stack[--vm->st_p]; + + double av = (a.type == VAL_INT) ? a.i : a.f; + double bv = (b.type == VAL_INT) ? b.i : b.f; + double result = 0; + + switch (ins.op) { + case OP_ADD: result = av + bv; break; + case OP_SUB: result = av - bv; break; + case OP_MUL: result = av * bv; break; + case OP_DIV: + if (bv == 0) { + fprintf(stderr, "division by zero.\n"); + vm->running = false; + } else result = av / bv; + break; + default: break; + } + + Value v = { .type = VAL_FLOAT, .f = result }; + vm->stack[vm->st_p++] = v; + } break; + + case OP_PRINT: { + if (vm->st_p == 0) { + fprintf(stderr, "cant print an empty stack\n"); + vm->running = false; + break; + } + + Value v = vm->stack[--vm->st_p]; + switch (v.type) { + case VAL_INT: printf("%ld\n", v.i); break; + case VAL_FLOAT: printf("%g\n", v.f); break; + case VAL_STRING: + printf("%s\n", v.s); + free(v.s); + break; + } + } break; + + case OP_HALT: + vm->running = false; + break; + + default: + fprintf(stderr, "unknown opcode %d\n", ins.op); + vm->running = false; + break; + } + } +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + char* read = nb_read_file(argv[1]); + //printf("INPUT: %s\n", read); + + Token tk = tokenize_all(read); + SymbolTable table = {0}; + symbol_table_init(&table, 32); + + Token rpn = build_rpn(&tk, &table); + //print_token(&rpn); + + size_t prog_size = 0; + instruct *prog = rpn_to_bytecode(&rpn, &prog_size); + VM vm = { + .program = prog, + .program_size = prog_size, + .inst_p = 0, + .st_p = 0, + .running = true, + }; + + vm_run(&vm); + + return 0; +}