2025-07-22 15:57:51 +03:00
# include <assert.h>
2025-07-20 22:24:25 +03:00
# include <ctype.h>
# include <limits.h>
# include <stddef.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <sys/types.h>
/* Record wrapping a single C-string pointer. */
typedef struct {
    char *mstr; /* the wrapped string pointer */
} mstring;
/* Record wrapping a single int value. */
typedef struct {
    int mint; /* the wrapped integer (member name matches the typedef) */
} mint;
/* Record wrapping a single float value. */
typedef struct {
    float myfloat; /* the wrapped floating-point value */
} mfloat;
2025-07-28 17:30:45 +03:00
/*
 * Convert a base-10 string to an int.
 *
 * Leading whitespace and an optional sign are accepted, and parsing stops at
 * the first character that is not a digit (same as atoi for valid input).
 *
 * strint: NUL-terminated text to convert; NULL is treated as "no number".
 *
 * Returns the parsed value, 0 when strint is NULL or contains no digits, and
 * INT_MAX / INT_MIN when the value does not fit in an int (atoi has undefined
 * behaviour in that case).
 */
int str_to_int(char *strint) {
    if (strint == NULL) {
        return 0;
    }

    /* strtol saturates at LONG_MAX/LONG_MIN, so clamping covers overflow. */
    long parsed = strtol(strint, NULL, 10);
    if (parsed > INT_MAX) {
        return INT_MAX;
    }
    if (parsed < INT_MIN) {
        return INT_MIN;
    }
    return (int)parsed;
}
2025-07-20 22:24:25 +03:00
/*
 * Token kinds. The numeric values are the ones the compiler would assign
 * implicitly; they are spelled out because they are printed with %u.
 */
typedef enum {
    TOKEN_PLUS    = 0, /* '+' */
    TOKEN_MINUS   = 1, /* '-' */
    TOKEN_INTEGER = 2, /* digit run without a '.' */
    TOKEN_FLOAT   = 3, /* digit run containing a '.' */
    TOKEN_SPACE   = 4, /* a single ' ' */
    TOKEN_STRING  = 5, /* run of alphabetic characters */
    intdef        = 6, /* not produced by the lexer in this file */
    TOKEN_UNKNOWN = 7, /* any other character */
} symbols;
2025-07-21 13:34:20 +03:00
/*
 * Behaviour class attached to each token. Values match the implicit
 * numbering and are printed with %d elsewhere in this file.
 */
typedef enum {
    BHV_STACK     = 0, /* assigned to '+' and '-' tokens */
    BHV_UNDEFINED = 1, /* assigned to unrecognised characters */
    BHV_NUMBER    = 2, /* assigned to integer tokens */
    BHV_STRING    = 3, /* assigned to alphabetic tokens */
    BHV_FLOAT     = 4, /* assigned to float tokens */
} symbol_bhv;
2025-07-23 18:32:45 +03:00
2025-07-20 22:24:25 +03:00
typedef struct {
symbols type ;
char * text ;
size_t text_len ;
2025-07-21 13:34:20 +03:00
symbol_bhv behaviour ;
uint cursor_skip ;
2025-07-22 15:57:51 +03:00
symbols previous_token ;
2025-07-20 22:24:25 +03:00
} Token ;
2025-07-28 17:30:45 +03:00
// Now that tokenize_all exists, previous_token is not really needed: the token array can be walked directly, without each individual token carrying all that data.
2025-07-23 18:32:45 +03:00
/*
 * Growable array of tokens. Field order matters: tokenize_all initialises
 * it positionally with { NULL, 0, 0 }.
 */
typedef struct {
    Token *unit;     /* storage, grown with realloc by tokenarr_push */
    size_t size;     /* number of tokens currently stored */
    size_t capacity; /* number of tokens `unit` has room for */
} TokenArr;
2025-07-20 22:24:25 +03:00
/* Lexer state; at the moment it only carries a pointer to the input text. */
typedef struct {
    char *content; /* text to be lexed */
    // size_t cursor;
    // size_t line;
} Lexer;
2025-07-24 16:17:10 +03:00
// The AST will not necessarily be used; it is kept because it could be useful in the future.
/* Discriminator for the payload stored in an ASTNode. */
typedef enum {
    AST_NUMBER,
    AST_BINARY_OP,
} ASTNodeType;

/*
 * Expression tree node.
 *
 * The struct carries the tag `ASTNode` so that the `left`/`right` child
 * pointers refer to this very type. Without the tag, `struct ASTNode` would
 * name a separate incomplete type and the children could not be
 * dereferenced.
 *
 * NOTE(review): presumably `number` is the active member for AST_NUMBER and
 * `binary` for AST_BINARY_OP; nothing in this file reads the node yet.
 */
typedef struct ASTNode {
    ASTNodeType type;
    union {
        struct {
            double value;
        } number;
        struct {
            char op;
            struct ASTNode *left;
            struct ASTNode *right;
        } binary;
    };
} ASTNode;
/* Parser state: a token array plus an index into it. */
typedef struct {
    TokenArr *tokens; /* tokens being parsed */
    size_t cursor;    /* index into the token array */
} parser;
2025-07-20 22:24:25 +03:00
// Lexer
/*
 * Placeholder: currently does nothing.
 *
 * content:     input text (unused for now)
 * content_len: length of the input (unused for now)
 */
void lexer_new(char *content, size_t content_len) {
    /* Both arguments are deliberately discarded to silence unused warnings. */
    (void)content_len;
    (void)content;
}
// Token
/*
 * Placeholder: currently does nothing.
 *
 * mylexer: lexer state (unused for now)
 */
void lexer_next(Lexer *mylexer) {
    /* Argument deliberately discarded to silence unused warnings. */
    (void)mylexer;
}
2025-07-20 22:49:50 +03:00
// A stack for arithmetic will be implemented later. Compiler or interpreter? Since this is a learning experience, I'm going to do the easier thing first.
2025-07-20 22:24:25 +03:00
Token read_from_tok ( char * text , uint cursor ) {
Token mytoks ;
2025-07-21 13:34:20 +03:00
2025-07-20 22:24:25 +03:00
static char buf [ 64 ] ;
size_t i = 0 ;
2025-07-21 13:34:20 +03:00
mytoks . cursor_skip = 1 ;
2025-07-22 15:57:51 +03:00
// integer logic. will have to somehow detect "." for floats but it will be hard to do because the way I wrote this code is shit
// ie: checking for . depends on the switch statement. so I will have to maybe add previous_token to the token struct. Actually a feasible idea.
// will I need to set previous_token to the current token? maybe.
2025-07-20 22:42:44 +03:00
if ( isdigit ( text [ cursor ] ) ) {
2025-07-20 22:24:25 +03:00
size_t start = cursor ;
2025-07-22 15:57:51 +03:00
int dots_seen = 0 ;
while ( isdigit ( text [ cursor ] ) | | text [ cursor ] = = ' . ' ) {
if ( text [ cursor ] = = ' . ' ) {
dots_seen + = 1 ;
assert ( dots_seen < 2 ) ;
}
buf [ i + + ] = text [ cursor + + ] ;
2025-07-20 22:24:25 +03:00
}
2025-07-22 15:57:51 +03:00
// recheck this assert later
2025-07-20 22:24:25 +03:00
buf [ i ] = ' \0 ' ;
2025-07-22 15:57:51 +03:00
if ( ! dots_seen ) {
mytoks . type = TOKEN_INTEGER ;
mytoks . behaviour = BHV_NUMBER ;
} else {
mytoks . type = TOKEN_FLOAT ;
mytoks . behaviour = BHV_FLOAT ;
}
2025-07-21 13:34:20 +03:00
mytoks . cursor_skip = cursor - start ;
2025-07-24 16:17:10 +03:00
mytoks . text = strdup ( buf ) ;
2025-07-21 13:34:20 +03:00
mytoks . text_len = i ;
2025-07-22 15:57:51 +03:00
}
// string logic
else if ( isalpha ( text [ cursor ] ) ) {
2025-07-21 13:34:20 +03:00
size_t start = cursor ;
while ( isalpha ( text [ cursor ] ) ) {
buf [ i + + ] = text [ cursor + + ] ;
}
buf [ i ] = ' \0 ' ;
mytoks . type = TOKEN_STRING ;
mytoks . behaviour = BHV_STRING ;
mytoks . cursor_skip = cursor - start ;
2025-07-24 16:17:10 +03:00
mytoks . text = strdup ( buf ) ;
2025-07-20 22:24:25 +03:00
mytoks . text_len = i ;
2025-07-20 22:42:44 +03:00
}
2025-07-21 13:34:20 +03:00
2025-07-20 22:42:44 +03:00
else {
2025-07-20 22:24:25 +03:00
buf [ 0 ] = text [ cursor ] ;
buf [ 1 ] = ' \0 ' ;
switch ( text [ cursor ] ) {
case ' + ' :
mytoks . type = TOKEN_PLUS ;
2025-07-21 13:34:20 +03:00
// asigning text is not really needed unless for debug. could however be useful for codegen later.
2025-07-24 16:17:10 +03:00
mytoks . text = strdup ( " + " ) ;
2025-07-21 13:34:20 +03:00
mytoks . behaviour = BHV_STACK ;
break ;
2025-07-20 22:24:25 +03:00
case ' - ' :
mytoks . type = TOKEN_MINUS ;
2025-07-24 16:17:10 +03:00
mytoks . text = strdup ( " - " ) ;
2025-07-21 13:34:20 +03:00
mytoks . behaviour = BHV_STACK ;
2025-07-20 22:24:25 +03:00
break ;
2025-07-20 22:40:11 +03:00
case ' ' :
mytoks . type = TOKEN_SPACE ;
2025-07-24 16:17:10 +03:00
mytoks . text = strdup ( " space " ) ;
2025-07-20 22:40:11 +03:00
break ;
2025-07-20 22:24:25 +03:00
default :
2025-07-20 22:40:11 +03:00
mytoks . type = TOKEN_UNKNOWN ;
2025-07-21 13:34:20 +03:00
mytoks . behaviour = BHV_UNDEFINED ;
2025-07-24 16:19:49 +03:00
mytoks . text = strdup ( buf ) ;
2025-07-21 13:34:20 +03:00
2025-07-20 22:24:25 +03:00
}
}
return mytoks ;
}
2025-07-23 18:32:45 +03:00
/*
 * Append `tok` to `arr`, growing the storage when it is full.
 *
 * Capacity starts at 8 and doubles on each growth. The array stores a
 * shallow copy of the token; `tok.text` is not duplicated.
 *
 * Running out of memory is fatal: a message is written to stderr and the
 * program aborts. This is checked explicitly (not with assert) so the check
 * still exists when NDEBUG is defined.
 */
void tokenarr_push(TokenArr *arr, Token tok) {
    if (arr->size >= arr->capacity) {
        size_t new_capacity = arr->capacity ? arr->capacity * 2 : 8;
        Token *grown = NULL;

        /* Reject capacity wrap-around and byte-count overflow. */
        if (new_capacity > arr->capacity &&
            new_capacity <= (size_t)-1 / sizeof *grown) {
            /* Use a temporary so the old block is not lost on failure. */
            grown = realloc(arr->unit, new_capacity * sizeof *grown);
        }
        if (grown == NULL) {
            fprintf(stderr, "tokenarr_push: out of memory\n");
            abort();
        }
        arr->unit = grown;
        arr->capacity = new_capacity;
    }
    arr->unit[arr->size++] = tok;
}
TokenArr tokenize_all ( const char * input ) {
TokenArr arr = { NULL , 0 , 0 } ;
size_t i = 0 ;
size_t len = strlen ( input ) ;
while ( i < len ) {
Token tok = read_from_tok ( ( char * ) input , i ) ;
tokenarr_push ( & arr , tok ) ;
i + = tok . cursor_skip ;
}
return arr ;
}
2025-07-20 22:24:25 +03:00
// Token* c
2025-07-24 16:17:10 +03:00
/*
 * Debug helper: lex `input` token by token and print each token's text and
 * behaviour, with an extra line for BHV_STACK tokens.
 *
 * mytok: scratch token; its incoming value is ignored and overwritten.
 * input: NUL-terminated text to lex.
 *
 * The position advances by each token's cursor_skip, so a multi-character
 * token is reported once rather than once per character, matching the other
 * lexing loops in this file. Each token's text is freed after printing.
 */
void token_parser(Token mytok, char *input) {
    size_t length1 = strlen(input);
    size_t i = 0;

    while (i < length1) {
        mytok = read_from_tok(input, (unsigned int)i);
        printf(" Text: %s \n ", mytok.text);
        printf(" Behaviour: %d \n ", (int)mytok.behaviour);
        if (mytok.behaviour == BHV_STACK) {
            printf(" this is stack lil bro \n ");
        }
        i += mytok.cursor_skip;
        /* read_from_tok hands over ownership of the text. */
        free(mytok.text);
    }
}
2025-07-20 22:24:25 +03:00
// Operators are accepted in an int/digit (or other type) definition only when they have a digit both before AND after them.
2025-07-21 13:34:20 +03:00
/*
2025-07-20 22:24:25 +03:00
int main ( ) {
Token newtok ;
2025-07-21 13:34:20 +03:00
char * input = " 8 " ;
parser ( newtok , input ) ;
}
*/
2025-07-23 18:32:45 +03:00
/*
 * Map a token kind to its printable name.
 *
 * Returns a pointer to a string literal (never to be freed or modified).
 * Values outside the enum yield " UNKNOWN_SYMBOL ".
 */
char *token_type_to_string(symbols type) {
    /* Indexed by enumerator so the table cannot drift out of order. */
    static char *const names[] = {
        [TOKEN_PLUS]    = " TOKEN_PLUS ",
        [TOKEN_MINUS]   = " TOKEN_MINUS ",
        [TOKEN_INTEGER] = " TOKEN_INTEGER ",
        [TOKEN_FLOAT]   = " TOKEN_FLOAT ",
        [TOKEN_SPACE]   = " TOKEN_SPACE ",
        [TOKEN_STRING]  = " TOKEN_STRING ",
        [intdef]        = " intdef ",
        [TOKEN_UNKNOWN] = " TOKEN_UNKNOWN ",
    };
    const long idx = (long)type;

    if (idx < 0 || idx > (long)TOKEN_UNKNOWN) {
        return " UNKNOWN_SYMBOL ";
    }
    return names[idx];
}
2025-07-23 18:32:45 +03:00
void main2 ( ) {
2025-07-21 13:34:20 +03:00
Token newtok ;
2025-07-22 16:18:37 +03:00
char * input = " 323.23 + Hello world 102102 " ;
2025-07-21 13:34:20 +03:00
int length1 = strlen ( input ) ;
int i = 0 ;
2025-07-22 16:18:37 +03:00
printf ( " input: %s \n \n " , input ) ;
2025-07-21 13:34:20 +03:00
while ( i < length1 ) {
2025-07-22 16:18:37 +03:00
Token result = read_from_tok ( input , i ) ;
printf ( " text: %s \n type: %u (%s) \n \n " , result . text , result . type , token_type_to_string ( result . type ) ) ;
2025-07-21 13:34:20 +03:00
i + = result . cursor_skip ;
2025-07-20 22:24:25 +03:00
}
}
2025-07-23 18:32:45 +03:00
2025-07-28 17:56:35 +03:00
void astparser ( const char * input ) {
TokenArr stack = tokenize_all ( input ) ;
int sum = 0 ;
int sign = 1 ;
for ( size_t i = 0 ; i < stack . size ; + + i ) {
switch ( stack . unit [ i ] . type ) {
case TOKEN_PLUS :
sign = 1 ;
break ;
case TOKEN_MINUS :
sign = - 1 ;
break ;
case TOKEN_INTEGER :
sum + = sign * str_to_int ( stack . unit [ i ] . text ) ;
sign = 1 ;
break ;
default :
break ;
}
2025-07-28 17:30:45 +03:00
}
2025-07-28 17:56:35 +03:00
printf ( " %d \n " , sum ) ;
for ( size_t j = 0 ; j < stack . size ; + + j ) {
free ( stack . unit [ j ] . text ) ;
}
free ( stack . unit ) ;
2025-07-28 17:30:45 +03:00
}
int main4 ( ) {
2025-07-24 16:19:49 +03:00
char * input = " print(5) hello " ;
2025-07-23 18:32:45 +03:00
printf ( " input: %s \n \n " , input ) ;
TokenArr arr = tokenize_all ( input ) ;
2025-07-24 16:17:10 +03:00
2025-07-23 18:32:45 +03:00
for ( size_t j = 0 ; j < arr . size ; + + j ) {
Token * result = & arr . unit [ j ] ;
printf ( " text: %s \n type: %u (%s) \n \n " , result - > text , result - > type , token_type_to_string ( result - > type ) ) ;
}
2025-07-24 16:17:10 +03:00
printf ( " ================ Tokenized ================= \n " ) ;
for ( size_t j = 0 ; j < arr . size ; + + j ) {
Token * result = & arr . unit [ j ] ;
printf ( " text: %s, type: %u (%s) || " , result - > text , result - > type , token_type_to_string ( result - > type ) ) ;
}
printf ( " \n " ) ;
for ( size_t j = 0 ; j < arr . size ; + + j ) {
free ( arr . unit [ j ] . text ) ;
}
2025-07-23 18:32:45 +03:00
free ( arr . unit ) ;
return 0 ;
}
2025-07-28 17:30:45 +03:00
/* Program entry point: echo a fixed sample expression, then evaluate it. */
int main() {
    char *input = " 1+69+3 ";

    printf(" input: %s \n \n ", input);
    astparser(input);
    return 0;
}