Files
hlang/lexer.c

196 lines
4.2 KiB
C
Raw Normal View History

#include <assert.h>
2025-07-20 22:24:25 +03:00
#include <ctype.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
/* Thin wrapper around a single C-string pointer.
 * Not referenced by the rest of the visible code. */
typedef struct {
    char *mstr; /* pointer to the character data */
} mstring;
/* Thin wrapper around a single int value.
 * Not referenced by the rest of the visible code. */
typedef struct {
    int mint; /* the wrapped integer */
} mint;
/* Thin wrapper around a single float value.
 * Not referenced by the rest of the visible code. */
typedef struct {
    float myfloat; /* the wrapped floating-point value */
} mfloat;
/*
 * Token kinds produced by the lexer.
 *
 * The enumerators keep their original order, so their numeric values
 * (which main prints) are unchanged.
 */
typedef enum {
    TOKEN_PLUS,     /* '+' */
    TOKEN_MINUS,    /* '-' */
    TOKEN_INTEGER,  /* run of digits without a '.' */
    TOKEN_FLOAT,    /* run of digits containing one '.' */
    TOKEN_SPACE,    /* a single ' ' */
    TOKEN_STRING,   /* run of alphabetic characters */
    intdef,         /* not produced by any visible code */
    TOKEN_UNKNOWN,  /* any character the lexer does not recognise */
} symbols;
2025-07-21 13:34:20 +03:00
/*
 * How a token should be treated by later stages.
 *
 * The enumerators keep their original order, so their numeric values
 * (which parser prints) are unchanged.
 */
typedef enum {
    BHV_STACK,      /* assigned to the '+' and '-' operator tokens */
    BHV_UNDEFINED,  /* assigned to unrecognised characters */
    BHV_NUMBER,     /* assigned to integer tokens */
    BHV_STRING,     /* assigned to alphabetic tokens */
    BHV_FLOAT,      /* assigned to float tokens */
} symbol_bhv;
2025-07-20 22:24:25 +03:00
typedef struct{
symbols type;
char* text;
size_t text_len;
2025-07-21 13:34:20 +03:00
symbol_bhv behaviour;
uint cursor_skip;
symbols previous_token;
2025-07-20 22:24:25 +03:00
} Token;
/*
 * Lexer state. Only the input pointer exists so far; position tracking
 * is sketched out below but not enabled yet.
 */
typedef struct {
    char *content; /* text being lexed */
    /* size_t cursor; */
    /* size_t line; */
} Lexer;
// Lexer
/*
 * Placeholder for the Lexer constructor: currently does nothing.
 *
 * content      input text (unused for now)
 * content_len  length of the input text (unused for now)
 */
void lexer_new(char *content, size_t content_len)
{
    (void)content;
    (void)content_len;
}
// Token
/*
 * Placeholder for "advance the lexer by one token": currently does nothing.
 *
 * mylexer  lexer state (unused for now)
 */
void lexer_next(Lexer *mylexer)
{
    (void)mylexer;
}
2025-07-20 22:49:50 +03:00
// A stack for arithmetic will be implemented later. Open question: compiler or interpreter? Since this is a learning experience, I'm going to do the easier thing first.
2025-07-20 22:24:25 +03:00
Token read_from_tok(char* text, uint cursor){
Token mytoks;
2025-07-21 13:34:20 +03:00
2025-07-20 22:24:25 +03:00
static char buf[64];
size_t i = 0;
2025-07-21 13:34:20 +03:00
mytoks.cursor_skip = 1;
// integer logic. will have to somehow detect "." for floats but it will be hard to do because the way I wrote this code is shit
// ie: checking for . depends on the switch statement. so I will have to maybe add previous_token to the token struct. Actually a feasible idea.
// will I need to set previous_token to the current token? maybe.
2025-07-20 22:42:44 +03:00
if (isdigit(text[cursor])) {
2025-07-20 22:24:25 +03:00
size_t start = cursor;
int dots_seen = 0;
while ( isdigit(text[cursor]) || text[cursor] == '.') {
if (text[cursor] == '.') {
dots_seen +=1;
assert(dots_seen < 2);
}
buf[i++] = text[cursor++];
2025-07-20 22:24:25 +03:00
}
// recheck this assert later
2025-07-20 22:24:25 +03:00
buf[i] = '\0';
if (!dots_seen){
mytoks.type = TOKEN_INTEGER;
mytoks.behaviour = BHV_NUMBER;
} else {
mytoks.type = TOKEN_FLOAT;
mytoks.behaviour = BHV_FLOAT;
}
2025-07-21 13:34:20 +03:00
mytoks.cursor_skip = cursor - start;
mytoks.text = buf;
mytoks.text_len = i;
}
// string logic
else if (isalpha(text[cursor])){
2025-07-21 13:34:20 +03:00
size_t start = cursor;
while (isalpha(text[cursor])) {
buf[i++] = text[cursor++];
}
buf[i] = '\0';
mytoks.type = TOKEN_STRING;
mytoks.behaviour = BHV_STRING;
mytoks.cursor_skip = cursor - start;
2025-07-20 22:24:25 +03:00
mytoks.text = buf;
mytoks.text_len = i;
2025-07-20 22:42:44 +03:00
}
2025-07-21 13:34:20 +03:00
2025-07-20 22:42:44 +03:00
else {
2025-07-20 22:24:25 +03:00
buf[0] = text[cursor];
buf[1] = '\0';
mytoks.text = buf;
switch (text[cursor]){
case '+':
mytoks.type = TOKEN_PLUS;
2025-07-21 13:34:20 +03:00
// asigning text is not really needed unless for debug. could however be useful for codegen later.
2025-07-20 22:24:25 +03:00
mytoks.text = "+";
2025-07-21 13:34:20 +03:00
mytoks.behaviour = BHV_STACK;
break;
2025-07-20 22:24:25 +03:00
case '-':
mytoks.type = TOKEN_MINUS;
mytoks.text = "-";
2025-07-21 13:34:20 +03:00
mytoks.behaviour = BHV_STACK;
2025-07-20 22:24:25 +03:00
break;
2025-07-20 22:40:11 +03:00
case ' ':
mytoks.type = TOKEN_SPACE;
mytoks.text = "space";
break;
2025-07-20 22:24:25 +03:00
default:
2025-07-20 22:40:11 +03:00
mytoks.type = TOKEN_UNKNOWN;
2025-07-21 13:34:20 +03:00
mytoks.behaviour = BHV_UNDEFINED;
2025-07-20 22:24:25 +03:00
}
}
return mytoks;
}
// Token* c
2025-07-21 13:34:20 +03:00
/*
 * Debug driver: lexes `input` token by token and prints each token's text
 * and behaviour, flagging tokens whose behaviour is BHV_STACK.
 *
 * mytok  scratch Token; it is passed by value and overwritten on every
 *        iteration, so the caller's copy is never read or modified
 * input  NUL-terminated text to lex
 *
 * The cursor advances by each token's cursor_skip so that a multi-character
 * lexeme is reported once instead of being re-lexed from every interior
 * offset.
 */
void parser(Token mytok, char* input){
    size_t length1 = strlen(input);
    size_t i = 0;

    while (i < length1) {
        mytok = read_from_tok(input, (unsigned int)i);
        printf("Text: %s\n", mytok.text);
        printf("Behaviour: %d\n", (int)mytok.behaviour);
        if (mytok.behaviour == BHV_STACK){
            printf("this is stack lil bro\n");
        }
        i += mytok.cursor_skip;
    }
}
2025-07-20 22:24:25 +03:00
// Operators are accepted in an int/digit (or whatever the type ends up being called) definition only when they have a digit both before AND after them.
2025-07-21 13:34:20 +03:00
/*
2025-07-20 22:24:25 +03:00
int main(){
Token newtok;
2025-07-21 13:34:20 +03:00
char* input = "8";
parser(newtok, input);
}
*/
/*
 * Demo entry point: lexes a fixed sample string and prints the text and
 * numeric type of every token, advancing by each token's cursor_skip.
 */
int main(void){
    char* input = "323.23 + Hello world";
    size_t length1 = strlen(input);
    size_t i = 0;

    while (i < length1) {
        Token result = read_from_tok(input, (unsigned int)i);
        /* cast so the argument matches %u regardless of the enum's underlying type */
        printf("text: %s\ntype: %u\n\n", result.text, (unsigned int)result.type);
        i += result.cursor_skip;
    }
    return 0;
}