From 0f160be414a2587575cda54e569db6e08a28a52f Mon Sep 17 00:00:00 2001 From: Allen Webster Date: Thu, 17 Mar 2016 12:48:13 -0400 Subject: [PATCH] new lexer 20/20 --- test/4cpp_new_lexer.h | 241 +++++++++++++++++++++++++++++++++--------- test/experiment.cpp | 96 +++++++++++++---- 2 files changed, 266 insertions(+), 71 deletions(-) diff --git a/test/4cpp_new_lexer.h b/test/4cpp_new_lexer.h index fcb8816c..0ac4c5da 100644 --- a/test/4cpp_new_lexer.h +++ b/test/4cpp_new_lexer.h @@ -344,6 +344,8 @@ enum Lex_State{ LS_number, LS_number0, LS_float, + LS_crazy_float0, + LS_crazy_float1, LS_hex, LS_comment_pre, LS_comment, @@ -372,6 +374,17 @@ enum Lex_State{ LS_count }; +enum Lex_Int_State{ + LSINT_default, + LSINT_u, + LSINT_l, + LSINT_L, + LSINT_ul, + LSINT_uL, + LSINT_ll, + LSINT_extra +}; + enum Lex_INC_State{ LSINC_default, LSINC_quotes, @@ -394,20 +407,13 @@ enum Lex_PP_State{ }; struct Lex_FSM{ - unsigned short state; - char pp_state; + char state; + char int_state; char emit_token; char multi_line; char completed; }; -struct Lex_Data{ - Lex_FSM fsm; - char pp_state; - char completed; - int token_start; -}; - lexer_link Lex_PP_State cpp_pp_directive_to_state(Cpp_Token_Type type){ Lex_PP_State result = LSPP_default; @@ -490,8 +496,15 @@ cpp_push_token_nonalloc(Cpp_Token *out_tokens, int *token_i, Cpp_Token token){ } } +struct Lex_Data{ + Lex_FSM fsm; + char pp_state; + char completed; + int token_start; +}; + lexer_link Lex_Data -cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *token_stack_out){ +cpp_lex_nonalloc(Lex_Data lex_data, char *chunk, int file_absolute_pos, int size, int last_chunk, Cpp_Token_Stack *token_stack_out){ Cpp_Token *out_tokens = token_stack_out->tokens; int token_i = token_stack_out->count; int max_token_i = token_stack_out->max_count; @@ -500,12 +513,14 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * int pos = file_absolute_pos; int end_pos = size + file_absolute_pos; - - Lex_Data lex_data = {0}; - Lex_FSM fsm = {0}; - + int stream_end_pos = 0x7FFFFFFF; char c = 0; + if (last_chunk){ + stream_end_pos = end_pos; + ++end_pos; + } + chunk -= file_absolute_pos; for (; pos < end_pos && token_i < max_token_i;){ @@ -525,10 +540,9 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * lex_data.token_start = pos; - fsm = {0}; - fsm.pp_state = lex_data.pp_state; - for (; fsm.emit_token == 0 && pos <= end_pos;){ - if (pos < end_pos){ + lex_data.fsm = {0}; + for (; lex_data.fsm.emit_token == 0 && pos < end_pos;){ + if (pos < stream_end_pos){ c = chunk[pos++]; } else{ @@ -537,10 +551,11 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * } { - unsigned short state = fsm.state; - char pp_state = fsm.pp_state; - char emit_token = fsm.emit_token; - char multi_line = fsm.multi_line; + char pp_state = lex_data.pp_state; + + char state = lex_data.fsm.state; + char emit_token = lex_data.fsm.emit_token; + char multi_line = lex_data.fsm.multi_line; switch (pp_state){ case LSPP_error: @@ -671,13 +686,14 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * case LS_char: switch(c){ case '\'': emit_token = 1; break; - case '\\': state = LS_char_slashed; multi_line |= 1; break; + case '\\': state = LS_char_slashed; break; } break; case LS_char_slashed: switch (c){ case '\r': case '\f': case '\v': break; + case '\n': state = LS_string; multi_line |= 1; break; default: state = LS_char; break; } break; @@ -685,13 +701,14 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * case LS_string: switch(c){ case '\"': emit_token = 1; break; - case '\\': state = LS_string_slashed; multi_line |= 1; break; + case '\\': state = LS_string_slashed; break; } break; case LS_string_slashed: switch (c){ case '\r': case '\f': case '\v': break; + case '\n': state = LS_string; multi_line |= 1; break; default: state = LS_string; break; } break; @@ -726,12 +743,31 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * case LS_float: if (!(c >= '0' && c <= '9')){ switch (c){ - case 'f': emit_token = 1; break; + case 'e': state = LS_crazy_float0; break; default: emit_token = 1; break; } } break; + case LS_crazy_float0: + { + if ((c >= '0' && c <= '9') || c == '-'){ + state = LS_crazy_float1; + } + else{ + emit_token = 1; + } + } + break; + + case LS_crazy_float1: + { + if (!(c >= '0' && c <= '9')){ + emit_token = 1; + } + } + break; + case LS_hex: if (!(c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F')){ emit_token = 1; @@ -904,17 +940,17 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * break; } - - fsm.state = state; - fsm.pp_state = pp_state; - fsm.emit_token = emit_token; - fsm.multi_line = multi_line; + lex_data.pp_state = pp_state; + + lex_data.fsm.state = state; + lex_data.fsm.emit_token = emit_token; + lex_data.fsm.multi_line = multi_line; } } - if (fsm.emit_token){ + if (lex_data.fsm.emit_token){ if (lex_data.pp_state == LSPP_include){ - switch (fsm.state){ + switch (lex_data.fsm.state){ case LSINC_default:break; case LSINC_quotes: @@ -929,7 +965,7 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * break; } } - else switch (fsm.state){ + else switch (lex_data.fsm.state){ case LS_default: switch (c){ #define OperCase(op,t) case op: token.type = t; break; @@ -949,8 +985,27 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * OperCase('@', CPP_TOKEN_JUNK); OperCase('$', CPP_TOKEN_JUNK); - OperCase('\\', CPP_TOKEN_JUNK); #undef OperCase + + case '\\': + if (lex_data.pp_state == LSPP_default){ + token.type = CPP_TOKEN_JUNK; + } + else{ + int restore_point = pos; + c = chunk[pos]; + while (c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f'){ + c = chunk[pos++]; + } + if (c == '\n'){ + lex_data.fsm.emit_token = 0; + } + else{ + pos = restore_point; + token.type = CPP_TOKEN_JUNK; + } + } + break; } if (c != '@' && c != '$' && c != '\\'){ token.flags = CPP_TFLAG_IS_OPERATOR; @@ -996,11 +1051,11 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * }break; case LS_pound: - token.flags = CPP_TFLAG_IS_OPERATOR; + token.flags = 0; switch (c){ - case '=': token.type = CPP_TOKEN_LESSEQ; break; + case '#': token.type = CPP_PP_CONCAT; break; default: - token.type = CPP_TOKEN_LESS; + token.type = CPP_PP_STRINGIFY; --pos; break; } @@ -1036,19 +1091,99 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * case LS_number: case LS_number0: case LS_hex: + lex_data.fsm.int_state = LSINT_default; + + { + int done = 0; + --pos; + for (; done == 0 && pos <= end_pos;){ + if (pos < end_pos){ + c = chunk[pos++]; + } + else{ + c = 0; + ++pos; + } + + switch (lex_data.fsm.int_state){ + case LSINT_default: + switch (c){ + case 'u': case 'U': lex_data.fsm.int_state = LSINT_u; break; + case 'l': lex_data.fsm.int_state = LSINT_l; break; + case 'L': lex_data.fsm.int_state = LSINT_L; break; + default: done = 1; break; + } + break; + + case LSINT_u: + switch (c){ + case 'l': lex_data.fsm.int_state = LSINT_ul; break; + case 'L': lex_data.fsm.int_state = LSINT_uL; break; + default: done = 1; break; + } + break; + + case LSINT_l: + switch (c){ + case 'l': lex_data.fsm.int_state = LSINT_ll; break; + case 'U': case 'u': lex_data.fsm.int_state = LSINT_extra; break; + default: done = 1; break; + } + break; + + case LSINT_L: + switch (c){ + case 'L': lex_data.fsm.int_state = LSINT_ll; break; + case 'U': case 'u': lex_data.fsm.int_state = LSINT_extra; break; + default: done = 1; break; + } + break; + + case LSINT_ul: + switch (c){ + case 'l': lex_data.fsm.int_state = LSINT_extra; break; + default: done = 1; break; + } + break; + + case LSINT_uL: + switch (c){ + case 'L': lex_data.fsm.int_state = LSINT_extra; break; + default: done = 1; break; + } + break; + + case LSINT_ll: + switch (c){ + case 'u': case 'U': lex_data.fsm.int_state = LSINT_extra; break; + default: done = 1; break; + } + break; + + case LSINT_extra: + done = 1; + break; + } + } + --pos; + } + token.type = CPP_TOKEN_INTEGER_CONSTANT; token.flags = 0; - --pos; break; case LS_float: + case LS_crazy_float0: + case LS_crazy_float1: token.type = CPP_TOKEN_FLOATING_CONSTANT; token.flags = 0; - if (c != 'f'){ - --pos; - } + switch (c){ + case 'f': case 'F': + case 'l': case 'L':break; + default: --pos; break; + } break; - + case LS_char: token.type = CPP_TOKEN_CHARACTER_CONSTANT; token.flags = 0; @@ -1280,7 +1415,7 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * switch (c){ case '=': token.type = CPP_TOKEN_NOTEQ; break; default: - token.type = CPP_TOKEN_BIT_NOT; + token.type = CPP_TOKEN_NOT; --pos; break; } @@ -1297,7 +1432,7 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * break; case LSPP_macro_identifier: - if (fsm.state != LS_identifier){ + if (lex_data.fsm.state != LS_identifier){ token.type = CPP_TOKEN_JUNK; lex_data.pp_state = LSPP_junk; } @@ -1307,7 +1442,7 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * break; case LSPP_identifier: - if (fsm.state != LS_identifier){ + if (lex_data.fsm.state != LS_identifier){ token.type = CPP_TOKEN_JUNK; } lex_data.pp_state = LSPP_junk; @@ -1328,14 +1463,16 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack * break; } } + + if (lex_data.fsm.emit_token){ + token.start = lex_data.token_start; + token.size = pos - lex_data.token_start; + token.flags |= (lex_data.fsm.multi_line)?(CPP_TFLAG_MULTILINE):(0); + token.flags |= (lex_data.pp_state != LSPP_default)?(CPP_TFLAG_PP_BODY):(0); + token.state_flags = lex_data.pp_state; - token.start = lex_data.token_start; - token.size = pos - lex_data.token_start; - token.flags |= (fsm.multi_line)?(CPP_TFLAG_MULTILINE):(0); - token.flags |= (fsm.pp_state != LSPP_default)?(CPP_TFLAG_PP_BODY):(0); - token.state_flags = fsm.pp_state; - - cpp_push_token_nonalloc(out_tokens, &token_i, token); + cpp_push_token_nonalloc(out_tokens, &token_i, token); + } } } diff --git a/test/experiment.cpp b/test/experiment.cpp index fa3e10ab..832d2ec5 100644 --- a/test/experiment.cpp +++ b/test/experiment.cpp @@ -20,6 +20,7 @@ #include "4cpp_new_lexer.h" #include +#include #include #include @@ -180,12 +181,17 @@ struct Experiment{ int passed_total, test_total; }; +i64 handcoded_lexer_time = 0; +i64 fsm_lexer_time = 0; + static void -run_experiment(Experiment *exp, char *filename, int verbose){ +run_experiment(Experiment *exp, char *filename, int verbose, int chunks){ String extension = {}; Data file_data; Cpp_File file_cpp; + new_lex::Lex_Data ld = {0}; int pass; + int k, chunk_size, is_last; extension = file_extension(make_string_slowly(filename)); @@ -193,7 +199,7 @@ run_experiment(Experiment *exp, char *filename, int verbose){ file_data = dump_file(filename); if (file_data.size < (100 << 10)){ pass = 1; - printf("testing on file: %s\n", filename); + if (verbose >= 0) printf("testing on file: %s\n", filename); exp->test_total++; exp->correct_stack.count = 0; @@ -205,13 +211,38 @@ run_experiment(Experiment *exp, char *filename, int verbose){ file_cpp.data = (char*)file_data.data; file_cpp.size = file_data.size; - cpp_lex_file_nonalloc(file_cpp, &exp->correct_stack, lex_data); - new_lex::cpp_lex_nonalloc((char*)file_data.data, 0, file_data.size, &exp->testing_stack); - + { + i64 start; + + start = __rdtsc(); + cpp_lex_file_nonalloc(file_cpp, &exp->correct_stack, lex_data); + handcoded_lexer_time += (__rdtsc() - start); + + start = __rdtsc(); + if (chunks){ + is_last = 0; + for (k = 0; k < file_data.size; k += chunks){ + chunk_size = chunks; + if (chunk_size + k >= file_data.size){ + chunk_size = file_data.size - k; + is_last = 1; + } + + ld = new_lex::cpp_lex_nonalloc(ld, (char*)file_data.data + k, k, is_last, chunk_size, &exp->testing_stack); + } + } + else{ + new_lex::cpp_lex_nonalloc(ld, (char*)file_data.data, 0, file_data.size, 1, &exp->testing_stack); + } + fsm_lexer_time += (__rdtsc() - start); + } + if (exp->correct_stack.count != exp->testing_stack.count){ pass = 0; - printf("error: stack size mismatch %d original and %d testing\n", - exp->correct_stack.count, exp->testing_stack.count); + if (verbose >= 0){ + printf("error: stack size mismatch %d original and %d testing\n", + exp->correct_stack.count, exp->testing_stack.count); + } } int min_count = exp->correct_stack.count; @@ -224,12 +255,12 @@ run_experiment(Experiment *exp, char *filename, int verbose){ if (correct->type != testing->type){ pass = 0; - if (verbose) printf("type mismatch at token %d\n", j); + if (verbose >= 1) printf("type mismatch at token %d\n", j); } if (correct->start != testing->start || correct->size != testing->size){ pass = 0; - if (verbose){ + if (verbose >= 1){ printf("token range mismatch at token %d\n" " %d:%d original %d:%d testing\n" " %.*s original %.*s testing\n", @@ -239,19 +270,19 @@ run_experiment(Experiment *exp, char *filename, int verbose){ testing->size, file_cpp.data + testing->start); } } - + if (correct->flags != testing->flags){ pass = 0; - if (verbose) printf("token flag mismatch at token %d\n", j); + if (verbose >= 1) printf("token flag mismatch at token %d\n", j); } } if (pass){ exp->passed_total++; - printf("test passed!\n\n"); + if (verbose >= 0) printf("test passed!\n\n"); } else{ - printf("test failed, you failed, fix it now!\n\n"); + if (verbose >= 0) printf("test failed, you failed, fix it now!\n\n"); } } @@ -262,6 +293,9 @@ run_experiment(Experiment *exp, char *filename, int verbose){ #define BASE_DIR "w:/4ed/data/test/" int main(){ + int repeats = 100; + int verbose_level = -1; + int chunks = 0; char test_directory[] = BASE_DIR; File_List all_files = {}; Experiment exp = {}; @@ -271,20 +305,44 @@ int main(){ AllowLocal(test_directory); AllowLocal(all_files); - + #if 0 - run_experiment(&exp, BASE_DIR "lexer_test.cpp", 1); + (void)(repeats); + (void)(verbose_level); + + run_experiment(&exp, BASE_DIR "crazywords.cpp", 1, chunks); #else system_set_file_list(&all_files, make_lit_string(test_directory)); - - for (int i = 0; i < all_files.count; ++i){ - if (all_files.infos[i].folder == 0){ - run_experiment(&exp, all_files.infos[i].filename.str, 0); + + for (int j = 0; j < repeats; ++j){ + for (int i = 0; i < all_files.count; ++i){ + if (all_files.infos[i].folder == 0){ + run_experiment(&exp, all_files.infos[i].filename.str, verbose_level, chunks); + } } } #endif printf("you passed %d / %d tests\n", exp.passed_total, exp.test_total); + +#define OUTLINE(type) "%-30s "type"\n" +#define OUTLINE_VAR(t, var) #var, (t)var + + if (exp.passed_total == exp.test_total && exp.passed_total > 1){ + f32 speed_up = ((f32)handcoded_lexer_time) / fsm_lexer_time; + + printf( + "\nTime information for %d repeates\n" + OUTLINE("%d") + OUTLINE("%d") + OUTLINE("%f"), + + repeats, + OUTLINE_VAR(i32, handcoded_lexer_time), + OUTLINE_VAR(i32, fsm_lexer_time), + OUTLINE_VAR(f32, speed_up) + ); + } return(0); }