From 52e74f7a4f3f9b7a49a5b030c3b579e80aab0484 Mon Sep 17 00:00:00 2001 From: Allen Webster Date: Thu, 10 Mar 2016 11:28:44 -0500 Subject: [PATCH] more work on the new lexer --- test/4cpp_new_lexer.h | 448 +++++++++++++++++++++++++++++++++++++++--- test/experiment.cpp | 10 +- 2 files changed, 419 insertions(+), 39 deletions(-) diff --git a/test/4cpp_new_lexer.h b/test/4cpp_new_lexer.h index 4c8ce82f..94a1b5d3 100644 --- a/test/4cpp_new_lexer.h +++ b/test/4cpp_new_lexer.h @@ -6,6 +6,9 @@ #include "../4cpp_lexer_types.h" +namespace new_lex{ +// + #define lexer_link static @@ -77,13 +80,29 @@ cpp_shift_token_starts(Cpp_Token_Stack *stack, int from_token_i, int shift_amoun enum Lex_State{ LS_default, + LS_identifier, + LS_char, + LS_string, + LS_number, LS_comment_pre, LS_comment, LS_comment_block, LS_comment_block_ending, LS_dot, + LS_ellipsis, LS_less, + LS_less_less, LS_more, + LS_more_more, + LS_minus, + LS_arrow, + LS_and, + LS_or, + LS_plus, + LS_colon, + LS_star, + LS_modulo, + LS_caret, }; struct Lex_Data{ @@ -109,29 +128,69 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * int emit_token = 0; - char c; + char c = 0; chunk -= file_absolute_pos; - for (; pos < end_pos && token_i < max_token_i; ++pos){ - for (; pos < end_pos;){ - c = chunk[pos++]; - if (!(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v')) break; + for (; pos < end_pos && token_i < max_token_i;){ + + c = chunk[pos]; + + if (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v'){ + for (; pos < end_pos;){ + c = chunk[pos++]; + if (!(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v')) break; + } + --pos; } - --pos; lex_data.token_start = pos; state = LS_default; emit_token = 0; - for (; emit_token == 0 && pos < end_pos;){ - c = chunk[pos++]; - + for (; emit_token == 0 && pos <= end_pos;){ + if (pos < end_pos){ + c = chunk[pos++]; + } + else{ + c = 0; + ++pos; + } + switch (state){ case LS_default: + 
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'){ + state = LS_identifier; + } + else if (c >= '1' && c <= '9'){ + state = LS_number; + } + else switch (c){ + case '\'': state = LS_char; break; + case '"': state = LS_string; break; + case '/': state = LS_comment_pre; break; - + + case '.': state = LS_dot; break; + + case '<': state = LS_less; break; + case '>': state = LS_more; break; + + case '-': state = LS_minus; break; + + case '&': state = LS_and; break; + case '|': state = LS_or; break; + + case '+': state = LS_plus; break; + + case ':': state = LS_colon; break; + + case '*': state = LS_star; break; + + case '%': state = LS_modulo; break; + case '^': state = LS_caret; break; + #define OperCase(op,type) case op: emit_token = 1; break; OperCase('{', CPP_TOKEN_BRACE_OPEN); OperCase('}', CPP_TOKEN_BRACE_CLOSE); @@ -144,30 +203,83 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * OperCase('~', CPP_TOKEN_TILDE); OperCase(',', CPP_TOKEN_COMMA); + OperCase(';', CPP_TOKEN_SEMICOLON); OperCase('?', CPP_TOKEN_TERNARY_QMARK); #undef OperCase - -#if 0 - case '.': state = LS_dot; break; - case '<': state = LS_less; break; - case '>': state = LS_more; break; -#endif } break; - + + case LS_identifier: + if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')){ + emit_token = 1; + } + break; + + case LS_char: + // TODO + break; + + case LS_string: + // TODO + break; + + case LS_number: + if (c >= '0' && c <= '9'){ + state = LS_number; + } + else if (c == '.'){ + state = LS_float; + } + break; + case LS_dot: + switch (c){ + case '.': state = LS_ellipsis; break; + case '*': emit_token = 1; break; + default: emit_token = 1; break; + } break; - + + case LS_ellipsis: + emit_token = 1; + break; + case LS_less: + switch (c){ + case '<': state = LS_less_less; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } break; - + + case LS_less_less: + switch (c){ + case '=': 
emit_token = 1; break; + default: emit_token = 1; break; + } + break; + case LS_more: + switch (c){ + case '>': state = LS_more_more; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } break; - + + case LS_more_more: + switch (c){ + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + case LS_comment_pre: switch (c){ case '/': state = LS_comment; break; case '*': state = LS_comment_block; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; } break; @@ -190,16 +302,83 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * default: state = LS_comment_block; break; } break; + + case LS_minus: + switch (c){ + case '>': state = LS_arrow; break; + case '-': emit_token = 1; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_arrow: + switch (c){ + case '*': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_and: + switch (c){ + case '&': emit_token = 1; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_or: + switch (c){ + case '|': emit_token = 1; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_plus: + switch (c){ + case '+': emit_token = 1; break; + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_colon: + switch (c){ + case ':': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_star: + switch (c){ + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_modulo: + switch (c){ + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; + + case LS_caret: + switch (c){ + case '=': emit_token = 1; break; + default: emit_token = 1; break; + } + break; } } if (emit_token){ - --pos; lex_data.token_end = pos; switch (state){ case LS_default: - switch 
(chunk[pos]){ + switch (c){ #define OperCase(op,t) case op: token.type = t; break; OperCase('{', CPP_TOKEN_BRACE_OPEN); OperCase('}', CPP_TOKEN_BRACE_CLOSE); @@ -212,30 +391,235 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * OperCase('~', CPP_TOKEN_TILDE); OperCase(',', CPP_TOKEN_COMMA); + OperCase(';', CPP_TOKEN_SEMICOLON); OperCase('?', CPP_TOKEN_TERNARY_QMARK); #undef OperCase } token.flags = CPP_TFLAG_IS_OPERATOR; break; - + + case LS_identifier: + token.type = CPP_TOKEN_IDENTIFIER; + token.flags = 0; + --lex_data.token_end; + --pos; + break; + + case LS_comment_pre: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_DIVEQ; break; + default: + token.type = CPP_TOKEN_DIV; + --lex_data.token_end; + --pos; + break; + } + break; + case LS_comment: case LS_comment_block_ending: token.type = CPP_TOKEN_COMMENT; token.flags = 0; - c = chunk[pos]; + c = chunk[--lex_data.token_end]; while (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == '\f'){ - --pos; - c = chunk[pos]; + --lex_data.token_end; + c = chunk[lex_data.token_end]; + } + ++lex_data.token_end; + break; + + case LS_dot: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '*': token.type = CPP_TOKEN_PTRDOT; break; + default: + token.type = CPP_TOKEN_DOT; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_ellipsis: + switch (c){ + case '.': + token.flags = CPP_TFLAG_IS_OPERATOR; + token.type = CPP_TOKEN_ELLIPSIS; + break; + + default: + token.type = CPP_TOKEN_JUNK; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_less: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_LESSEQ; break; + default: + token.type = CPP_TOKEN_LESS; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_less_less: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_LSHIFTEQ; break; + default: + token.type = 
CPP_TOKEN_LSHIFT; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_more: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_GRTREQ; break; + default: + token.type = CPP_TOKEN_GRTR; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_more_more: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_RSHIFTEQ; break; + default: + token.type = CPP_TOKEN_RSHIFT; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_minus: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '-': token.type = CPP_TOKEN_DECREMENT; break; + case '=': token.type = CPP_TOKEN_SUBEQ; break; + default: + token.type = CPP_TOKEN_MINUS; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_arrow: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '*': token.type = CPP_TOKEN_PTRARROW; break; + default: + token.type = CPP_TOKEN_ARROW; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_and: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '&': token.type = CPP_TOKEN_AND; break; + case '=': token.type = CPP_TOKEN_ANDEQ; break; + default: + token.type = CPP_TOKEN_AMPERSAND; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_or: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '|': token.type = CPP_TOKEN_OR; break; + case '=': token.type = CPP_TOKEN_OREQ; break; + default: + token.type = CPP_TOKEN_BIT_OR; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_plus: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '+': token.type = CPP_TOKEN_INCREMENT; break; + case '=': token.type = CPP_TOKEN_ADDEQ; break; + default: + token.type = CPP_TOKEN_PLUS; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_colon: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case ':': token.type = CPP_TOKEN_SCOPE; break; + default: + token.type = CPP_TOKEN_COLON; + --lex_data.token_end; 
+ --pos; + break; + } + break; + + case LS_star: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_MULEQ; break; + default: + token.type = CPP_TOKEN_STAR; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_modulo: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_MODEQ; break; + default: + token.type = CPP_TOKEN_MOD; + --lex_data.token_end; + --pos; + break; + } + break; + + case LS_caret: + token.flags = CPP_TFLAG_IS_OPERATOR; + switch (c){ + case '=': token.type = CPP_TOKEN_XOREQ; break; /* '^=' is xor-assign; any other char falls to default, which backs up one char and emits '^' alone */ + default: + token.type = CPP_TOKEN_BIT_XOR; + --lex_data.token_end; + --pos; + break; } - ++pos; break; } - + token.start = lex_data.token_start; - token.size = pos - lex_data.token_start; + token.size = lex_data.token_end - lex_data.token_start; token.state_flags = pp_state; out_tokens[token_i++] = token; - - pos = lex_data.token_end; } } @@ -245,6 +629,8 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * return(lex_data); } +} + #endif // BOTTOM diff --git a/test/experiment.cpp b/test/experiment.cpp index af1bb1ce..7a635e54 100644 --- a/test/experiment.cpp +++ b/test/experiment.cpp @@ -10,20 +10,14 @@ // TOP #include "../4ed_meta.h" - #define FCPP_STRING_IMPLEMENTATION #include "../4coder_string.h" #include "../4cpp_types.h" - #include "../4cpp_lexer_types.h" - #define FCPP_LEXER_IMPLEMENTATION #include "../4cpp_lexer.h" - -namespace new_lex{ #include "4cpp_new_lexer.h" -} #include @@ -238,8 +232,8 @@ run_experiment(Experiment *exp, char *filename){ if (correct->start != testing->start || correct->size != testing->size){ pass = 0; printf("token range mismatch at token %d\n" - "\t%d:%d original %d:%d testing\n" - "\t%.*s original %.*s testing\n", + " %d:%d original %d:%d testing\n" + " %.*s original %.*s testing\n", j, correct->start, correct->size, testing->start, testing->size, correct->size, file_cpp.data + correct->start,