From c81afc2c699c119a10ead35585f8742691b77593 Mon Sep 17 00:00:00 2001
From: Matteo Flebus
Date: Mon, 19 Jan 2026 17:32:45 +0100
Subject: [PATCH 1/2] feat(lexer): quote handling

---
Review notes (text here is ignored by git am):
 - update_lexing_mode() decides from the mode held before reading
   stream[i]. Closing a quote and then re-testing the same character
   against the "first quote" branch re-opened the quote at once, so the
   mode never went back to LEXER_NORMAL.
 - peek_token()/pop_token() let a special or blank character end a token
   only in LEXER_NORMAL, so 'a b' is lexed as a single word.
 - isblank() receives an unsigned char value.
 - NOTE(review): indentation and blank lines were rebuilt from a
   whitespace-collapsed copy of this patch; if context does not match,
   apply with --ignore-whitespace.
 - NOTE(review): the header name on the "#include" context line of
   src/lexer/lexer.h is missing from this copy; restore it before applying.
 - NOTE(review): the post-image blob id on the lexer.c index line predates
   the changes above.

 src/lexer/lexer.c | 203 +++++++++++++++++++++++++++++-----------------
 src/lexer/lexer.h |  14 +++-
 2 files changed, 140 insertions(+), 77 deletions(-)

diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c
index ef5a478..fd54c98 100644
--- a/src/lexer/lexer.c
+++ b/src/lexer/lexer.c
@@ -69,62 +69,62 @@ static void set_token_spechar(struct token *tok, char *begin, ssize_t size)
         return;
     switch (begin[0])
     {
-        case EOF:
-            tok->type = TOKEN_EOF;
-            break;
-        case ';':
-            tok->type = TOKEN_SEMICOLON;
-            break;
-        case '\n':
-            tok->type = TOKEN_NEWLINE;
-            break;
-        case '\'':
-            tok->type = TOKEN_QUOTE;
-            break;
-        case '"':
-            tok->type = TOKEN_DOUBLE_QUOTE;
-            break;
-        case '`':
-            tok->type = TOKEN_GRAVE;
-            break;
-        case '#':
-            tok->type = TOKEN_COMMENT;
-            break;
-        case '|':
-            tok->type = TOKEN_PIPE;
-            break;
-        case '&':
-            tok->type = TOKEN_AMPERSAND;
-            break;
-        case '\\':
-            tok->type = TOKEN_BACKSLASH;
-            break;
-        case '$':
-            tok->type = TOKEN_DOLLAR;
-            break;
-        case '(':
-            tok->type = TOKEN_LEFT_PAREN;
-            break;
-        case ')':
-            tok->type = TOKEN_RIGHT_PAREN;
-            break;
-        case '{':
-            tok->type = TOKEN_LEFT_BRACKET;
-            break;
-        case '}':
-            tok->type = TOKEN_RIGHT_BRACKET;
-            break;
-        case '<':
-            tok->type = TOKEN_LESS;
-            break;
-        case '>':
-            tok->type = TOKEN_GREATER;
-            break;
-        case '*':
-            tok->type = TOKEN_STAR;
-            break;
-        default:
-            break;
+    case EOF:
+        tok->type = TOKEN_EOF;
+        break;
+    case ';':
+        tok->type = TOKEN_SEMICOLON;
+        break;
+    case '\n':
+        tok->type = TOKEN_NEWLINE;
+        break;
+    case '\'':
+        tok->type = TOKEN_QUOTE;
+        break;
+    case '"':
+        tok->type = TOKEN_DOUBLE_QUOTE;
+        break;
+    case '`':
+        tok->type = TOKEN_GRAVE;
+        break;
+    case '#':
+        tok->type = TOKEN_COMMENT;
+        break;
+    case '|':
+        tok->type = TOKEN_PIPE;
+        break;
+    case '&':
+        tok->type = TOKEN_AMPERSAND;
+        break;
+    case '\\':
+        tok->type = TOKEN_BACKSLASH;
+        break;
+    case '$':
+        tok->type = TOKEN_DOLLAR;
+        break;
+    case '(':
+        tok->type = TOKEN_LEFT_PAREN;
+        break;
+    case ')':
+        tok->type = TOKEN_RIGHT_PAREN;
+        break;
+    case '{':
+        tok->type = TOKEN_LEFT_BRACKET;
+        break;
+    case '}':
+        tok->type = TOKEN_RIGHT_BRACKET;
+        break;
+    case '<':
+        tok->type = TOKEN_LESS;
+        break;
+    case '>':
+        tok->type = TOKEN_GREATER;
+        break;
+    case '*':
+        tok->type = TOKEN_STAR;
+        break;
+    default:
+        break;
     }
 }
 
@@ -224,6 +224,45 @@ char *stream_init(void)
     return stream;
 }
 
+/*
+ * @brief: Tracks whether the lexer is inside a quoted section.
+ * In LEXER_NORMAL, a ' or " at stream[i] opens the matching quote mode.
+ * In LEXER_QUOTE / LEXER_DOUBLE_QUOTE, only the matching quote character
+ * at stream[i] closes it (back to LEXER_NORMAL); the other quote character
+ * is an ordinary character there. Backslashes are not interpreted here.
+ *
+ * @param stream: buffer being lexed.
+ * @param i: index in stream of the character to examine.
+ * @param lexing_mode: current mode, updated in place.
+ *
+ * @return: true if an update was done. false otherwise.
+ */
+static bool update_lexing_mode(char *stream, ssize_t i,
+                               enum lexing_mode *lexing_mode)
+{
+    const char c = stream[i];
+    enum lexing_mode next = *lexing_mode;
+
+    // Decide from the mode we were in BEFORE reading c: a closing quote
+    // must never be re-read as a new opening quote.
+    if (*lexing_mode == LEXER_NORMAL)
+    {
+        if (c == '\'')
+            next = LEXER_QUOTE;
+        else if (c == '"')
+            next = LEXER_DOUBLE_QUOTE;
+    }
+    else if ((*lexing_mode == LEXER_QUOTE && c == '\'')
+             || (*lexing_mode == LEXER_DOUBLE_QUOTE && c == '"'))
+    {
+        next = LEXER_NORMAL;
+    }
+
+    const bool updated = next != *lexing_mode;
+    *lexing_mode = next;
+    return updated;
+}
+
 struct token *peek_token(void)
 {
     // we already created the upcoming token during the previous call to peek()
@@ -233,20 +272,28 @@ struct token *peek_token(void)
     }
 
     char *stream = stream_init();
-
     ssize_t i = 0;
 
+    // Useful to know if we are inside a quote or double quote
+    enum lexing_mode lexing_mode = LEXER_NORMAL;
+
     while (i < remaining_chars)
     {
-        if (is_special_char(stream[i]))
+        // A quote character is part of the word. Any other character can
+        // only end the token when we are outside of quotes.
+        if (!update_lexing_mode(stream, i, &lexing_mode)
+            && lexing_mode == LEXER_NORMAL)
         {
-            if (i == 0) // where we create spe_char token
-                i++;
-            break;
-        }
-        if (isblank(stream[i]))
-        {
-            break;
+            if (is_special_char(stream[i]))
+            {
+                if (i == 0) // where we create spe_char token
+                    i++;
+                break;
+            }
+            if (isblank((unsigned char)stream[i]))
+            {
+                break;
+            }
         }
         i++;
     }
@@ -266,20 +313,28 @@ struct token *pop_token(void)
         return NULL;
     }
     char *stream = stream_init();
-
     ssize_t i = 0;
 
+    // Useful to know if we are inside a quote or double quote
+    enum lexing_mode lexing_mode = LEXER_NORMAL;
+
     while (i < remaining_chars)
     {
-        if (is_special_char(stream[i]))
+        // A quote character is part of the word. Any other character can
+        // only end the token when we are outside of quotes.
+        if (!update_lexing_mode(stream, i, &lexing_mode)
+            && lexing_mode == LEXER_NORMAL)
         {
-            if (i == 0) // where we create spe_char token
-                i++;
-            break;
-        }
-        if (isblank(stream[i]))
-        {
-            break;
+            if (is_special_char(stream[i]))
+            {
+                if (i == 0) // where we create spe_char token
+                    i++;
+                break;
+            }
+            if (isblank((unsigned char)stream[i]))
+            {
+                break;
+            }
         }
         i++;
     }
diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h
index 6c561e0..f71907c 100644
--- a/src/lexer/lexer.h
+++ b/src/lexer/lexer.h
@@ -3,6 +3,13 @@
 
 #include
 
+enum lexing_mode
+{
+    LEXER_NORMAL,
+    LEXER_QUOTE,
+    LEXER_DOUBLE_QUOTE
+};
+
 enum token_type
 {
     // Special characters
@@ -10,8 +17,12 @@ enum token_type
     TOKEN_EOF,
     TOKEN_WORD,
     TOKEN_NEWLINE,
+
+    // WARNING: quote and double quote should never be used inside a token.
+    // Its only use is to know if we are inside a quote, and which type of quote
     TOKEN_QUOTE,
     TOKEN_DOUBLE_QUOTE,
+
     TOKEN_GRAVE,
     TOKEN_SEMICOLON,
     TOKEN_COMMENT,
@@ -43,8 +54,6 @@ struct token
 
 /*
  * @brief: returns the next (newly allocated) token without consuming it.
- * if end of input is reached, enters in EOF looping node,
- * returning only the same token of type TOKEN_EOF.
  * if end of input is reached, returns a token of type TOKEN_EOF.
  */
 struct token *peek_token(void);
@@ -57,7 +66,6 @@ struct token *peek_token(void);
  * @warning: if the last returned token was a token EOF, it frees it
  * and returns NULL. This means that after peeking a token EOF
  * in the parser, there must be EXACTLY ONE call to pop_token().
- *
  */
 struct token *pop_token(void);
 
From bf7b7f7f68033a1050d11526ff064dea70fd7be0 Mon Sep 17 00:00:00 2001
From: Matteo Flebus
Date: Mon, 19 Jan 2026 17:35:37 +0100
Subject: [PATCH 2/2] doc(lexer): update

---
 src/lexer/lexer.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lexer/lexer.h b/src/lexer/lexer.h
index f71907c..0da6f17 100644
--- a/src/lexer/lexer.h
+++ b/src/lexer/lexer.h
@@ -19,7 +19,6 @@ enum token_type
     TOKEN_NEWLINE,
 
     // WARNING: quote and double quote should never be used inside a token.
-    // Its only use is to know if we are inside a quote, and which type of quote
     TOKEN_QUOTE,
     TOKEN_DOUBLE_QUOTE,
 