feat(lexer): created parser utils

This commit is contained in:
Matteo Flebus 2026-01-21 16:19:10 +01:00
parent b5b40f303c
commit 26ac0ffe05
5 changed files with 296 additions and 276 deletions

View file

@ -2,7 +2,7 @@ lib_LIBRARIES = liblexer.a
liblexer_a_SOURCES = \
lexer.c \
lexer.h
lexer_utils.c
liblexer_a_CPPFLAGS = -I$(top_srcdir)/src

View file

@ -7,10 +7,21 @@
#include <stdlib.h>
#include <string.h>
#include "lexer_utils.h"
#include "../io_backend/io_backend.h"
#include "../utils/string_utils/string_utils.h"
// ######## STATIC FUNCTIONS ##############
/* @brief: tells whether [c] is a character the grammar treats specially.
 * EOF counts as special. '\0' is reported as special too, because strchr()
 * also matches the terminator of the searched string.
 *
 * @return: true if [c] is such a character, false otherwise.
 */
static bool is_special_char(char c)
{
    static const char grammar_specials[] = "\n'\"`;#|&\\$(){}<>*";

    return c == EOF || strchr(grammar_specials, c) != NULL;
}
/* @brief: sets the ctx->current_token to [tok].
* this function is called by token_peek().
@ -29,7 +40,6 @@ static void update_previous_token(struct token *tok, struct lexer_context *ctx)
free_token(&ctx->previous_token);
ctx->previous_token = tok;
}
/* @brief: updates the current position in the stream.
* [stream] += [i]
* Also frees the last sent token, and sets it to ctx->current_token.
@ -45,194 +55,6 @@ static void save_state(char *stream, ssize_t i, struct lexer_context *ctx)
update_current_token(NULL, ctx);
}
/* @brief: checks [c] against EOF and the set of special characters of the
 * grammar.
 *
 * @return: true if [c] is EOF or belongs to that set (the string terminator
 * '\0' included, as strchr() matches it), false otherwise.
 */
static bool is_special_char(char c)
{
    if (c != EOF)
    {
        const char *specials = "\n'\"`;#|&\\$(){}<>*";
        return strchr(specials, c) != NULL;
    }
    return true;
}
/* @brief: classifies a single-character token.
 * When [size] is 1 and [begin][0] is a special character of the grammar,
 * [tok->type] receives the matching token type. In every other case [tok]
 * is left untouched.
 */
static void set_token_spechar(struct token *tok, char *begin, ssize_t size)
{
    // symbol -> token type (stored as int, converted back on assignment)
    static const struct
    {
        int symbol;
        int type;
    } spechars[] = {
        { EOF, TOKEN_EOF },
        { ';', TOKEN_SEMICOLON },
        { '\n', TOKEN_NEWLINE },
        { '\'', TOKEN_QUOTE },
        { '"', TOKEN_DOUBLE_QUOTE },
        { '`', TOKEN_GRAVE },
        { '#', TOKEN_COMMENT },
        { '|', TOKEN_PIPE },
        { '&', TOKEN_AMPERSAND },
        { '\\', TOKEN_BACKSLASH },
        { '$', TOKEN_DOLLAR },
        { '(', TOKEN_LEFT_PAREN },
        { ')', TOKEN_RIGHT_PAREN },
        { '{', TOKEN_LEFT_BRACKET },
        { '}', TOKEN_RIGHT_BRACKET },
        { '<', TOKEN_LESS },
        { '>', TOKEN_GREATER },
        { '*', TOKEN_STAR },
    };

    if (size != 1)
        return;

    // promoted to int exactly like the controlling expression of a switch
    const int symbol = begin[0];
    for (size_t i = 0; i < sizeof(spechars) / sizeof(spechars[0]); i++)
    {
        if (spechars[i].symbol == symbol)
        {
            tok->type = spechars[i].type;
            return;
        }
    }
}
/* @brief: tells whether the [len] characters starting at [begin] spell
 * exactly [keyword]: same length and same characters.
 *
 * @return: non-zero on an exact match, 0 otherwise.
 */
static int keyword_equals(const char *begin, size_t len, const char *keyword)
{
    return strlen(keyword) == len && strncmp(begin, keyword, len) == 0;
}

/* @brief: if the [size] characters at [begin] are exactly a keyword,
 * [tok->type] is set accordingly and [tok->data] receives a newly
 * allocated, NUL-terminated copy of the keyword.
 * Does nothing if [tok->type] is already set or if [size] is not positive.
 * If the copy cannot be allocated, [tok->type] stays set and [tok->data]
 * is NULL.
 */
static void set_token_keyword(struct token *tok, char *begin, ssize_t size)
{
    if (tok->type != TOKEN_NULL || size <= 0)
        return;

    const size_t len = (size_t)size;

    // The length has to match as well: strncmp(begin, "if", size) on its own
    // accepts every prefix of the keyword ("i", "the", "el", ...).
    if (keyword_equals(begin, len, "if"))
        tok->type = TOKEN_IF;
    else if (keyword_equals(begin, len, "fi"))
        tok->type = TOKEN_FI;
    else if (keyword_equals(begin, len, "then"))
        tok->type = TOKEN_THEN;
    else if (keyword_equals(begin, len, "else"))
        tok->type = TOKEN_ELSE;
    else if (keyword_equals(begin, len, "elif"))
        tok->type = TOKEN_ELIF;
    else
        return; // no keywords found.

    tok->data = calloc(len + 1, sizeof(char));
    if (tok->data == NULL)
        return;
    // calloc() zeroed the buffer, so the copy is already NUL-terminated
    memcpy(tok->data, begin, len);
}
/* @brief: fallback classification. A token whose type is still TOKEN_NULL
 * and whose [size] is not 0 becomes a TOKEN_WORD, and [tok->data] receives
 * a newly allocated copy of the [size] characters at [begin].
 * If the allocation fails, the type stays TOKEN_WORD and the data is NULL.
 */
static void set_token_word(struct token *tok, char *begin, ssize_t size)
{
    if (tok->type != TOKEN_NULL || size == 0)
        return;

    tok->type = TOKEN_WORD;

    char *copy = calloc(size + 1, sizeof(char));
    tok->data = copy;
    if (copy != NULL)
        strncpy(copy, begin, size); // the zeroed extra byte terminates it
}
/* @brief: releases the lexer context pointed to by [*ctx] together with the
 * tokens it still holds (their data included), then sets [*ctx] to NULL.
 * Does nothing if [ctx] or [*ctx] is NULL.
 * NOTE(review): [end_previous_token] is not released here; who owns that
 * buffer is not visible from this function - confirm.
 */
void destroy_lexer_context(struct lexer_context **ctx)
{
    if (ctx == NULL || *ctx == NULL)
        return;

    struct lexer_context *context = *ctx;

    // A token referenced by both fields must be released only once.
    if (context->current_token == context->previous_token)
        context->current_token = NULL;

    // free_token() also releases the token's data, which a plain free()
    // of the token would leak.
    free_token(&context->previous_token);
    free_token(&context->current_token);

    free(context);
    *ctx = NULL;
}
/* @brief: allocates a zeroed token and lets the classifiers type it, in
 * this order: special character, keyword, word.
 *
 * @return: NULL if the token cannot be allocated, the token otherwise.
 */
struct token *new_token(char *begin, ssize_t size)
{
    struct token *fresh = calloc(1, sizeof(*fresh));

    if (fresh != NULL)
    {
        set_token_spechar(fresh, begin, size);
        set_token_keyword(fresh, begin, size);
        set_token_word(fresh, begin, size);
    }
    return fresh;
}
/* @brief: releases [*tok] and its data, then sets [*tok] to NULL.
 * Does nothing if [tok] or [*tok] is NULL.
 */
void free_token(struct token **tok)
{
    if (tok != NULL && *tok != NULL)
    {
        struct token *victim = *tok;

        free(victim->data); // free(NULL) is a no-op
        free(victim);
        *tok = NULL;
    }
}
/* @brief: picks the position tokenising starts from, skips the leading
 * blanks and keeps [ctx->remaining_chars] in step.
 *
 * @return: the stream, positioned after the leading blanks.
 */
char *stream_init(struct lexer_context *ctx)
{
    char *raw;

    if (ctx->previous_token != NULL)
        raw = ctx->end_previous_token; // resume after the last token
    else
        ctx->remaining_chars = stream_read(&raw); // at the beginning

    char *start = trim_blank_left(raw);
    ctx->remaining_chars -= start - raw;
    return start;
}
/*
* @brief: Updates the lexing_mode to LEXER_NORMAL
* if the SECOND quote is found at stream[i].
@ -268,6 +90,7 @@ static bool update_lexing_mode(char *stream, ssize_t i,
return *lexing_mode != mode_before_update;
}
struct token *peek_token(struct lexer_context *ctx)
{
// we already created the upcoming token during the previous call to peek()
@ -311,6 +134,12 @@ struct token *peek_token(struct lexer_context *ctx)
}
struct token *tok = new_token(stream, i);
// if token is comment, we don't want it
if (tok->type == TOKEN_COMMENT)
{
// tok = peek_token();
}
update_current_token(tok, ctx);
return tok;
}

View file

@ -1,69 +1,9 @@
#ifndef LEXER_H
#define LEXER_H
#include "lexer_utils.h"
#include <sys/types.h>
/* State the lexer carries from one token request to the next. */
struct lexer_context
{
// position stream_init() resumes from once a previous token exists
char *end_previous_token;
// set from the value returned by stream_read(), then decreased by
// stream_init() by the number of leading characters it skips
ssize_t remaining_chars;
// while this is NULL, stream_init() reads from the IO backend;
// released by destroy_lexer_context()
struct token *previous_token;
// released by destroy_lexer_context()
struct token *current_token;
};
/* @brief: frees the context pointed to by [*ctx] along with its
 * previous_token and current_token, then sets [*ctx] to NULL.
 * Does nothing if [ctx] or [*ctx] is NULL.
 */
void destroy_lexer_context(struct lexer_context **ctx);
/* Quoting state of the lexer.
 * NOTE(review): the meaning of each mode is inferred from its name; the code
 * that switches between them is not visible here - confirm.
 */
enum lexing_mode
{
LEXER_NORMAL, // presumably: outside of any quotes
LEXER_QUOTE, // presumably: between two single quotes
LEXER_DOUBLE_QUOTE // presumably: between two double quotes
};
/* Types a token can be given by new_token(). */
enum token_type
{
// Type of a token that has not been classified (zero-initialised value)
TOKEN_NULL = 0,
TOKEN_EOF, // EOF
TOKEN_WORD, // non-empty text not recognised as a special character or keyword
// Special characters
TOKEN_NEWLINE, // '\n'
// WARNING: quote and double quote should never be used inside a token.
TOKEN_QUOTE, // '\''
TOKEN_DOUBLE_QUOTE, // '"'
TOKEN_GRAVE, // '`'
TOKEN_SEMICOLON, // ';'
TOKEN_COMMENT, // '#'
TOKEN_PIPE, // '|'
TOKEN_AMPERSAND, // '&'
TOKEN_BACKSLASH, // '\\'
TOKEN_DOLLAR, // '$'
TOKEN_LEFT_PAREN, // '('
TOKEN_RIGHT_PAREN, // ')'
TOKEN_LEFT_BRACKET, // '{'
TOKEN_RIGHT_BRACKET, // '}'
TOKEN_LESS, // '<'
TOKEN_GREATER, // '>'
TOKEN_STAR, // '*'
// Keywords
TOKEN_IF, // "if"
TOKEN_THEN, // "then"
TOKEN_ELSE, // "else"
TOKEN_FI, // "fi"
TOKEN_ELIF // "elif"
};
/* A lexed token: its type and, for some types, its text. */
struct token
{
enum token_type type;
// allocated copy of the token text for keywords and words; left NULL for
// special-character tokens. Released by free_token().
char *data;
};
/*
* @brief: returns the next (newly allocated) token without consuming it.
* if end of input is reached, returns a token of type TOKEN_EOF.
@ -88,27 +28,4 @@ struct token *pop_token(struct lexer_context *ctx);
struct token *get_token_str(void);
/*
 * @brief: return a newly allocated token, typed from the [size] characters
 * starting at [begin]: a special character (only when [size] is 1),
 * a keyword, or otherwise a word. Keyword and word tokens carry an
 * allocated copy of those characters in their data.
 * When [size] is 0 the type stays TOKEN_NULL.
 *
 * @return: NULL if the token itself cannot be allocated, a token otherwise.
 */
struct token *new_token(char *begin, ssize_t size);
/* @brief: frees the token pointed to by [*tok] and its data, then sets
 * [*tok] to NULL. Does nothing if [tok] or [*tok] is NULL.
 */
void free_token(struct token **tok);
/*
 * @brief: returns the position tokenising should start from.
 * If [ctx] has no previous token yet, the stream is obtained with
 * stream_read() from IO_backend and [remaining_chars] is set from
 * its return value.
 * If not, it starts from the end of the last token.
 * Also trims left blanks before returning, decreasing
 * [remaining_chars] by the number of characters skipped.
 *
 * @return: char* stream from which we tokenise.
 */
char *stream_init(struct lexer_context *ctx);
#endif /* ! LEXER_H */

183
src/lexer/lexer_utils.c Normal file
View file

@ -0,0 +1,183 @@
#include "lexer_utils.h"
#include <stdlib.h>
#include <string.h>
#include "../io_backend/io_backend.h"
#include "../utils/string_utils/string_utils.h"
/* @brief: gives [tok] the type of the special character found at [begin].
 * Only tokens made of a single character ([size] == 1) are considered;
 * when that character is not special, [tok] is left as it is.
 */
static void set_token_spechar(struct token *tok, char *begin, ssize_t size)
{
    if (size != 1)
        return;

    int kind;
    switch (begin[0])
    {
    case EOF: kind = TOKEN_EOF; break;
    case ';': kind = TOKEN_SEMICOLON; break;
    case '\n': kind = TOKEN_NEWLINE; break;
    case '\'': kind = TOKEN_QUOTE; break;
    case '"': kind = TOKEN_DOUBLE_QUOTE; break;
    case '`': kind = TOKEN_GRAVE; break;
    case '#': kind = TOKEN_COMMENT; break;
    case '|': kind = TOKEN_PIPE; break;
    case '&': kind = TOKEN_AMPERSAND; break;
    case '\\': kind = TOKEN_BACKSLASH; break;
    case '$': kind = TOKEN_DOLLAR; break;
    case '(': kind = TOKEN_LEFT_PAREN; break;
    case ')': kind = TOKEN_RIGHT_PAREN; break;
    case '{': kind = TOKEN_LEFT_BRACKET; break;
    case '}': kind = TOKEN_RIGHT_BRACKET; break;
    case '<': kind = TOKEN_LESS; break;
    case '>': kind = TOKEN_GREATER; break;
    case '*': kind = TOKEN_STAR; break;
    default:
        return; // not a special character
    }
    tok->type = kind;
}
/* @brief: if the [size] characters at [begin] are exactly a keyword,
 * [tok->type] is set accordingly and [tok->data] receives a newly
 * allocated, NUL-terminated copy of the keyword.
 * Does nothing if [tok->type] is already set or if [size] is not positive.
 * If the copy cannot be allocated, [tok->type] stays set and [tok->data]
 * is NULL.
 */
static void set_token_keyword(struct token *tok, char *begin, ssize_t size)
{
    // keyword -> token type (stored as int, converted back on assignment)
    static const struct
    {
        const char *word;
        int type;
    } keywords[] = {
        { "if", TOKEN_IF },
        { "fi", TOKEN_FI },
        { "then", TOKEN_THEN },
        { "else", TOKEN_ELSE },
        { "elif", TOKEN_ELIF },
    };

    if (tok->type != TOKEN_NULL || size <= 0)
        return;

    const size_t len = (size_t)size;

    for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++)
    {
        // The length has to match as well: strncmp() limited to [size]
        // characters accepts every prefix of a keyword ("i", "the", ...).
        if (strlen(keywords[i].word) == len
            && strncmp(begin, keywords[i].word, len) == 0)
        {
            tok->type = keywords[i].type;
            break;
        }
    }

    // no keywords found.
    if (tok->type == TOKEN_NULL)
        return;

    tok->data = calloc(len + 1, sizeof(char));
    if (tok->data == NULL)
        return;
    // calloc() zeroed the buffer, so the copy is already NUL-terminated
    memcpy(tok->data, begin, len);
}
/* @brief: last classifier to run. If no type has been set yet and [size]
 * is not 0, the token is a TOKEN_WORD and its data becomes a newly
 * allocated copy of the [size] characters at [begin].
 * On allocation failure the type is still TOKEN_WORD, with NULL data.
 */
static void set_token_word(struct token *tok, char *begin, ssize_t size)
{
    const int already_typed = tok->type != TOKEN_NULL;

    if (already_typed || size == 0)
        return;

    tok->type = TOKEN_WORD;
    tok->data = calloc(size + 1, sizeof(char));
    if (tok->data != NULL)
        strncpy(tok->data, begin, size);
}
/* @brief: builds a token from the [size] characters starting at [begin].
 * The token starts zeroed, then the special-character, keyword and word
 * classifiers run in that order.
 *
 * @return: NULL if the token cannot be allocated, the token otherwise.
 */
struct token *new_token(char *begin, ssize_t size)
{
    struct token *result = calloc(1, sizeof(*result));

    if (result == NULL)
        return result;

    set_token_spechar(result, begin, size);
    set_token_keyword(result, begin, size);
    set_token_word(result, begin, size);

    return result;
}
/* @brief: frees the lexer context pointed to by [*ctx] and the tokens it
 * still references (with their data), then sets [*ctx] to NULL.
 * Does nothing if [ctx] or [*ctx] is NULL.
 * NOTE(review): [end_previous_token] is left alone; the owner of that
 * buffer cannot be determined from this function - confirm.
 */
void destroy_lexer_context(struct lexer_context **ctx)
{
    if (ctx == NULL || *ctx == NULL)
        return;

    struct lexer_context *lexer = *ctx;

    // If both fields reference the same token, release it a single time.
    if (lexer->previous_token == lexer->current_token)
        lexer->current_token = NULL;

    // Going through free_token() releases the token data as well; a plain
    // free() of the token would leak it.
    free_token(&lexer->previous_token);
    free_token(&lexer->current_token);

    free(lexer);
    *ctx = NULL;
}
/* @brief: frees the token pointed to by [*tok], data included, and resets
 * [*tok] to NULL. A NULL [tok] or [*tok] is ignored.
 */
void free_token(struct token **tok)
{
    struct token *target = (tok != NULL) ? *tok : NULL;

    if (target == NULL)
        return;

    free(target->data); // a NULL data pointer is fine for free()
    free(target);
    *tok = NULL;
}
/* @brief: returns where tokenising starts. Without a previous token the
 * stream comes from stream_read(), whose return value initialises
 * [ctx->remaining_chars]; otherwise it starts at [ctx->end_previous_token].
 * Leading blanks are skipped and subtracted from [ctx->remaining_chars].
 *
 * @return: the stream, positioned after the leading blanks.
 */
char *stream_init(struct lexer_context *ctx)
{
    char *origin;
    const int first_call = (ctx->previous_token == NULL);

    if (first_call)
        ctx->remaining_chars = stream_read(&origin);
    else
        origin = ctx->end_previous_token;

    char *after_blanks = trim_blank_left(origin);
    ctx->remaining_chars -= after_blanks - origin;
    return after_blanks;
}

91
src/lexer/lexer_utils.h Normal file
View file

@ -0,0 +1,91 @@
#ifndef LEXER_UTILS_H
#define LEXER_UTILS_H
#include <sys/types.h>
#include <stddef.h>
/* Lexer state kept between two token requests. */
struct lexer_context
{
// where stream_init() starts from when previous_token is not NULL
char *end_previous_token;
// initialised with the return value of stream_read(); stream_init()
// subtracts the number of leading characters it skips
ssize_t remaining_chars;
// NULL until a first token exists: stream_init() then reads from the
// IO backend. Released by destroy_lexer_context().
struct token *previous_token;
// released by destroy_lexer_context()
struct token *current_token;
};
/* @brief: frees the context pointed to by [*ctx] together with its
 * previous_token and current_token, and sets [*ctx] to NULL.
 * A NULL [ctx] or [*ctx] is ignored.
 */
void destroy_lexer_context(struct lexer_context **ctx);
/* Quoting state of the lexer.
 * NOTE(review): the per-mode descriptions are inferred from the names only;
 * nothing in this file uses them - confirm against the lexer.
 */
enum lexing_mode
{
LEXER_NORMAL, // presumably: not inside quotes
LEXER_QUOTE, // presumably: inside single quotes
LEXER_DOUBLE_QUOTE // presumably: inside double quotes
};
/* Every type new_token() can give to a token. */
enum token_type
{
// Unclassified token: the value a zero-initialised token starts with
TOKEN_NULL = 0,
TOKEN_EOF, // EOF
TOKEN_WORD, // non-empty text not recognised as a special character or keyword
// Special characters
TOKEN_NEWLINE, // '\n'
// WARNING: quote and double quote should never be used inside a token.
TOKEN_QUOTE, // '\''
TOKEN_DOUBLE_QUOTE, // '"'
TOKEN_GRAVE, // '`'
TOKEN_SEMICOLON, // ';'
TOKEN_COMMENT, // '#'
TOKEN_PIPE, // '|'
TOKEN_AMPERSAND, // '&'
TOKEN_BACKSLASH, // '\\'
TOKEN_DOLLAR, // '$'
TOKEN_LEFT_PAREN, // '('
TOKEN_RIGHT_PAREN, // ')'
TOKEN_LEFT_BRACKET, // '{'
TOKEN_RIGHT_BRACKET, // '}'
TOKEN_LESS, // '<'
TOKEN_GREATER, // '>'
TOKEN_STAR, // '*'
// Keywords
TOKEN_IF, // "if"
TOKEN_THEN, // "then"
TOKEN_ELSE, // "else"
TOKEN_FI, // "fi"
TOKEN_ELIF // "elif"
};
/* One token: a type plus, for keywords and words, its text. */
struct token
{
enum token_type type;
// allocated, NUL-terminated copy of the text for keyword and word tokens;
// stays NULL for special-character tokens. free_token() releases it.
char *data;
};
/*
 * @brief: return a newly allocated token, with the corresponding type:
 * a special character (only when [size] is 1), a keyword, or
 * otherwise a word.
 * For keywords and words, the data contains [size] char, starting
 * from [begin]. When [size] is 0 the type stays TOKEN_NULL.
 *
 * @return: NULL if the token itself cannot be allocated, a token otherwise.
 */
struct token *new_token(char *begin, ssize_t size);
/* @brief: frees the token pointed to by [*tok], its data included, and
 * sets [*tok] to NULL. A NULL [tok] or [*tok] is ignored.
 */
void free_token(struct token **tok);
/*
 * @brief: gives the position from which the next token is read.
 * If [ctx] has no previous token, it calls stream_read() from
 * IO_backend, and sets [remaining_chars] from its return value.
 * If not, it starts from the end of the last token.
 * Also trims left blanks before returning, and subtracts the number
 * of trimmed characters from [remaining_chars].
 *
 * @return: char* stream from which we tokenise.
 */
char *stream_init(struct lexer_context *ctx);
#endif /* LEXER_UTILS_H */