/*
4coder_lex_gen_main.cpp - A generator for language lexers.
*/

// TOP

// Both LANG_NAME_LOWER and LANG_NAME_CAMEL must be defined before this file
// is included; otherwise compilation stops here.
#if !defined(LANG_NAME_LOWER) || !defined(LANG_NAME_CAMEL)
#error 4coder_lex_get_main.cpp not correctly included.
#endif

#include "4coder_base_types.h"
#include "4coder_table.h"
#include "4coder_token.h"
#include "pcg_basic.h"

#include "4coder_base_types.cpp"
#include "4coder_stringf.cpp"
#include "4coder_malloc_allocator.cpp"
#include "4coder_hash_functions.cpp"
#include "4coder_table.cpp"
#include "pcg_basic.c"

#define LANG_NAME_LOWER_STR stringify(LANG_NAME_LOWER)
#define LANG_NAME_CAMEL_STR stringify(LANG_NAME_CAMEL)

////////////////////////////////

// NOTE(allen): PRIMARY MODEL

// One declared token kind. Nodes are created by smi_try_add_token and linked
// into a Token_Kind_Set.
struct Token_Kind_Node{
    Token_Kind_Node *next;
    b32 optimized_in;
    String_Const_u8 name;
    Token_Base_Kind base_kind;
};

// Queue of every token kind. name_to_ptr maps the bytes of a kind's name to
// the node pointer (stored as a u64); smi_try_add_token uses it to reject
// duplicate names.
struct Token_Kind_Set{
    Token_Kind_Node *first;
    Token_Kind_Node *last;
    i32 count;
    Table_Data_u64 name_to_ptr;
};

// A keyword: the token kind name it emits and the text it matches.
struct Keyword{
    Keyword *next;
    String_Const_u8 name;
    String_Const_u8 lexeme;
};

// A named group of keywords. Both lookup tables map to the Keyword pointer
// (see smi_key); the optional fallback token kind is recorded by
// smi_key_fallback.
struct Keyword_Set{
    Keyword_Set *next;
    Keyword *first;
    Keyword *last;
    i32 count;
    b32 has_fallback_token_kind;
    String_Const_u8 fallback_name;
    Table_Data_u64 name_to_ptr;
    Table_Data_u64 lexeme_to_ptr;
    String_Const_u8 pretty_name;
};

// Queue of every keyword set (appended to by sm_begin_key_set).
struct Keyword_Set_List{
    Keyword_Set *first;
    Keyword_Set *last;
    i32 count;
};

// NOTE(review): not referenced in the visible part of this file. The field
// names suggest the result of searching for a keyword hash table layout -
// confirm against the code that fills it in.
struct Keyword_Layout{
    u64 seed;
    u64 error_score;
    u64 max_single_error_score;
    f32 iterations_per_lookup;
    u64 *hashes;
    u64 *contributed_error;
    Keyword **slots;
    i32 slot_count;
};

typedef i32 Flag_Reset_Rule;
enum{
    FlagResetRule_AutoZero,
    FlagResetRule_KeepState,
    FlagResetRule_COUNT,
};

// A boolean flag of the lexer model. smi_add_flag only fills in reset_rule;
// the remaining fields are written elsewhere.
struct Flag{
    Flag *next;
    Flag_Reset_Rule reset_rule;
    Token_Base_Flag emit_flags;
    u16 emit_sub_flags;
    
    b32 optimized_in;
    String_Const_u8 base_name;
    i32 number;
    i32 index;
    u32 value;
};

struct Flag_Set{
    Flag *first;
    Flag *last;
    i32 count;
};

typedef i32 Emit_Handler_Kind;
enum{
    EmitHandlerKind_Direct,
    EmitHandlerKind_Keywords,
    EmitHandlerKind_KeywordsDelim,
};

// One way of producing a token from an Emit_Rule. 'kind' selects the live
// union member: token_name for EmitHandlerKind_Direct, keywords for
// EmitHandlerKind_Keywords and EmitHandlerKind_KeywordsDelim (see the
// smi_emit_handler constructors).
struct Emit_Handler{
    Emit_Handler *next;
    Emit_Handler_Kind kind;
    Flag *flag_check;
    union{
        String_Const_u8 token_name;
        Keyword_Set *keywords;
    };
};

struct Emit_Check{
    Emit_Check *next;
    String_Const_u8 emit_check;
    Flag *flag;
    b32 value;
};

struct Emit_Check_List{
    Emit_Check *first;
    Emit_Check *last;
    i32 count;
};

// A queue of emit handlers plus a list of emit checks.
struct Emit_Rule{
    Emit_Check_List emit_checks;
    Emit_Handler *first;
    Emit_Handler *last;
    i32 count;
};

typedef i32 Action_Kind;
enum{
    ActionKind_SetFlag,
    ActionKind_ZeroFlags,
    ActionKind_DelimMarkFirst,
    ActionKind_DelimMarkOnePastLast,
    ActionKind_Consume,
    ActionKind_Emit,
};

// One action in a doubly linked Action_List. set_flag is filled in for
// ActionKind_SetFlag and emit_rule for ActionKind_Emit; the other kinds
// carry no payload (see the smi_append_* functions).
struct Action{
    Action *next;
    Action *prev;
    Action_Kind kind;
    union{
        struct{
            Flag *flag;
            b32 value;
        } set_flag;
        Emit_Rule *emit_rule;
    };
};

struct Action_List{
    Action *first;
    Action *last;
    i32 count;
};

typedef i32 Action_Context;
enum{
    ActionContext_Normal,
    ActionContext_EndOfFile,
};

typedef i32 Transition_Consume_Rule;
enum{
    Transition_Consume,
    Transition_NoConsume,
};

// Input value that stands for end of file. Inputs are stored as u16, so this
// sits just past the 0-255 byte range (see smi_input_set_construct_eof and
// smi_input_set_construct_fallback).
global u16 smi_eof = 256;

struct Field_Pin{
    Field_Pin *next;
    
    // This represents the set of flag states with the particular /flag/ set
    // to /value/: exactly half of all flag state possibilities.
    Flag *flag;
    b32 value;
};

struct Field_Pin_List{
    Field_Pin_List *next;
    
    // This set is the intersection of the set represented by each pin.
    // A list with nothing in it is _always_ the "full set".
    Field_Pin *first;
    Field_Pin *last;
    i32 count;
};

struct Field_Set{
    // This set is the union of the set represented by each list.
    Field_Pin_List *first;
    Field_Pin_List *last;
    i32 count;
};

// A set of input values held as a plain array (byte values and/or smi_eof).
struct Input_Set{
    u16 *inputs;
    i32 count;
};

// One term of a condition: a set of flag states paired with a set of inputs.
struct Condition_Node{
    Condition_Node *next;
    Field_Set fields;
    Input_Set inputs;
};

struct Condition_Set{
    Condition_Node *first;
    Condition_Node *last;
    i32 count;
};

typedef i32 Transition_Case_Kind;
enum{
    TransitionCaseKind_NONE,
    
    // intermediates only
    TransitionCaseKind_CharaterArray,
    TransitionCaseKind_EOF,
    TransitionCaseKind_Fallback,
    
    // actually stored in Transition_Case "kind" field
    TransitionCaseKind_DelimMatch,
    TransitionCaseKind_DelimMatchFail,
    TransitionCaseKind_ConditionSet,
};

// The condition guarding a transition. smi_case fills in condition_set
// whenever it stores TransitionCaseKind_ConditionSet in 'kind'.
struct Transition_Case{
    Transition_Case_Kind kind;
    union{
        Condition_Set condition_set;
    };
};

// An edge of the state machine: leaves parent_state when 'condition' holds,
// runs activation_actions and arrives at dst_state.
struct Transition{
    Transition *next;
    Transition *prev;
    struct State *parent_state;
    Transition_Case condition;
    Action_List activation_actions;
    struct State *dst_state;
};

struct Transition_List{
    Transition *first;
    Transition *last;
    i32 count;
};

struct Transition_Ptr_Node{
    Transition_Ptr_Node *next;
    Transition *ptr;
};

struct Transition_Ptr_Set{
    Transition_Ptr_Node *first;
    Transition_Ptr_Node *last;
    i32 count;
};

// A node of the state machine. smi_add_state only sets pretty_name; the
// other fields start zeroed.
struct State{
    State *next;
    Transition_List transitions;
    String_Const_u8 pretty_name;
    
    b32 optimized_in;
    i32 number;
    Transition_Ptr_Set back_references;
    
    Action_List on_entry_actions;
};

struct State_Set{
    State *first;
    State *last;
    i32 count;
};

struct Lexer_Model{
    State *root;
    Flag_Set flags;
    State_Set states;
};

// Everything the generator knows about one lexer: the allocator and arena
// all model data is pushed onto, the token kinds, the keyword sets and the
// state machine model.
struct Lexer_Primary_Context{
    Base_Allocator *allocator;
    Arena arena;
    Token_Kind_Set tokens;
    Keyword_Set_List keywords;
    Lexer_Model model;
};

////////////////////////////////

struct Flag_Ptr_Node{
    Flag_Ptr_Node *next;
    Flag *flag;
};

struct Flag_Bucket{
    String_Const_u8 pretty_name;
    Flag_Ptr_Node *first;
    Flag_Ptr_Node *last;
    i32 max_bits;
    i32 count;
    i32 number_of_variables;
};

typedef i32 Flag_Bind_Property;
enum{
    FlagBindProperty_Free,
    FlagBindProperty_Bound,
    FlagBindProperty_COUNT,
};

// One bucket for every (bind property, reset rule) combination.
struct Flag_Bucket_Set{
    Flag_Bucket buckets[FlagBindProperty_COUNT][FlagResetRule_COUNT];
};

struct Partial_Transition{
    Partial_Transition *next;
    Field_Set fields;
    Action_List actions;
    State *dst_state;
};

struct Partial_Transition_List{
    Partial_Transition *first;
    Partial_Transition *last;
    i32 count;
};

struct Grouped_Input_Handler{
    Grouped_Input_Handler *next;
    
    u8 inputs[256];
    i32 input_count;
    b8 inputs_used[256];
    
    Partial_Transition_List partial_transitions;
};

struct Grouped_Input_Handler_List{
    Grouped_Input_Handler *first;
    Grouped_Input_Handler *last;
    i32 count;
    
    Grouped_Input_Handler *group_with_biggest_input_set;
};

////////////////////////////////

// NOTE(allen): MODELING SYNTAX HELPERS

// An operator: the token kind name it emits and its text.
struct Operator{
    Operator *next;
    String_Const_u8 name;
    String_Const_u8 op;
};

// Queue of operators; lexeme_to_ptr maps operator text to the Operator
// pointer and is used by sm_op to reject duplicate lexemes.
struct Operator_Set{
    Operator *first;
    Operator *last;
    i32 count;
    Table_Data_u64 lexeme_to_ptr;
};

// State behind the sm_* helper API: the primary context, a table of names
// for single characters (see sm_char_name), and the "currently selected"
// objects that the sm_select_* functions set.
struct Lexer_Helper_Context{
    Lexer_Primary_Context primary_ctx;
    Table_u64_Data char_to_name;
    Token_Base_Kind selected_base_kind;
    State *selected_state;
    Operator_Set *selected_op_set;
    Keyword_Set *selected_key_set;
    Emit_Rule *selected_emit_rule;
    Transition *selected_transition;
    
    // convenience pointer to primary's arena.
Arena *arena; }; struct Character_Set{ Table_u64_u64 table; }; #include "4coder_lex_gen_hand_written.h" #include "4coder_lex_gen_hand_written.cpp" //////////////////////////////// //////////////////////////////// //////////////////////////////// //////////////////////////////// // NOTE(allen): INTERNAL CONSTRUCTORS internal void smi_primary_init(Base_Allocator *allocator, Lexer_Primary_Context *ctx){ ctx->allocator = allocator; ctx->arena = make_arena(allocator); ctx->model.root = 0; ctx->tokens.name_to_ptr = make_table_Data_u64(allocator, 400); } internal b32 smi_try_add_token(Lexer_Primary_Context *ctx, String_Const_u8 name, Token_Base_Kind base_kind){ b32 result = false; Token_Kind_Set *set = &ctx->tokens; Table_Lookup lookup = table_lookup(&set->name_to_ptr, make_data(name.str, name.size)); if (!lookup.found_match){ Token_Kind_Node *node = push_array_zero(&ctx->arena, Token_Kind_Node, 1); node->name = push_string_copy(&ctx->arena, name); node->base_kind = base_kind; table_insert(&set->name_to_ptr, make_data(node->name.str, node->name.size), (u64)PtrAsInt(node)); sll_queue_push(set->first, set->last, node); set->count += 1; result = true; } return(result); } internal b32 smi_key(Lexer_Primary_Context *ctx, Keyword_Set *set, String_Const_u8 name, String_Const_u8 lexeme, Token_Base_Kind base_kind){ b32 result = false; Table_Lookup lookup = table_lookup(&set->name_to_ptr, make_data(name.str, name.size)); if (!lookup.found_match){ lookup = table_lookup(&set->lexeme_to_ptr, make_data(lexeme.str, lexeme.size)); if (!lookup.found_match){ if (smi_try_add_token(ctx, name, base_kind)){ Keyword *key = push_array_zero(&ctx->arena, Keyword, 1); key->name = push_string_copy(&ctx->arena, name); key->lexeme = push_string_copy(&ctx->arena, lexeme); table_insert(&set->name_to_ptr, make_data(key->name.str, key->name.size), (u64)PtrAsInt(key)); table_insert(&set->lexeme_to_ptr, make_data(key->lexeme.str, key->lexeme.size), (u64)PtrAsInt(key)); sll_queue_push(set->first, set->last, 
key); set->count += 1; result = true; } } } return(result); } internal b32 smi_key_fallback(Lexer_Primary_Context *ctx, Keyword_Set *set, String_Const_u8 name, Token_Base_Kind base_kind){ b32 result = false; if (!set->has_fallback_token_kind){ if (smi_try_add_token(ctx, name, base_kind)){ set->has_fallback_token_kind = true; set->fallback_name = push_string_copy(&ctx->arena, name); result = true; } } return(result); } internal State* smi_add_state(Lexer_Primary_Context *ctx, String_Const_u8 pretty_name){ State_Set *set = &ctx->model.states; State *state = push_array_zero(&ctx->arena, State, 1); sll_queue_push(set->first, set->last, state); set->count += 1; state->pretty_name = push_string_copy(&ctx->arena, pretty_name); return(state); } internal Flag* smi_add_flag(Lexer_Primary_Context *ctx, Flag_Reset_Rule rule){ Flag_Set *set = &ctx->model.flags; Flag *flag = push_array_zero(&ctx->arena, Flag, 1); flag->reset_rule = rule; sll_queue_push(set->first, set->last, flag); set->count += 1; return(flag); } internal Emit_Rule* smi_emit_rule(Arena *arena){ return(push_array_zero(arena, Emit_Rule, 1)); } internal Emit_Handler* smi_emit_handler__inner(Arena *arena, Emit_Rule *rule, Emit_Handler_Kind kind, Flag *flag_check){ Emit_Handler *handler = push_array_zero(arena, Emit_Handler, 1); handler->kind = kind; handler->flag_check = flag_check; if (rule != 0){ sll_queue_push(rule->first, rule->last, handler); rule->count += 1; } return(handler); } internal Emit_Handler* smi_emit_handler(Arena *arena, Emit_Rule *rule, String_Const_u8 name, Flag *flag_check){ Emit_Handler *handler = smi_emit_handler__inner(arena, rule, EmitHandlerKind_Direct, flag_check); handler->token_name = name; return(handler); } internal Emit_Handler* smi_emit_handler(Arena *arena, Emit_Rule *rule, Keyword_Set *set, Flag *flag_check){ Emit_Handler *handler = smi_emit_handler__inner(arena, rule, EmitHandlerKind_Keywords, flag_check); handler->keywords = set; return(handler); } internal Emit_Handler* 
smi_emit_handler_delim(Arena *arena, Emit_Rule *rule, Keyword_Set *set, Flag *flag_check){
    // Same as the keyword handler, but tagged EmitHandlerKind_KeywordsDelim.
    Emit_Handler *delim_handler = smi_emit_handler__inner(arena, rule, EmitHandlerKind_KeywordsDelim, flag_check);
    delim_handler->keywords = set;
    return(delim_handler);
}

// Appends a "set 'flag' to 'value'" action to the end of 'list'.
internal void
smi_append_set_flag(Arena *arena, Action_List *list, Flag *flag, b32 value){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_SetFlag;
    node->set_flag.flag = flag;
    node->set_flag.value = value;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

// Appends an ActionKind_ZeroFlags action to the end of 'list'.
internal void
smi_append_zero_flags(Arena *arena, Action_List *list){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_ZeroFlags;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

// Appends an ActionKind_DelimMarkFirst action to the end of 'list'.
internal void
smi_append_delim_mark_first(Arena *arena, Action_List *list){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_DelimMarkFirst;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

// Appends an ActionKind_DelimMarkOnePastLast action to the end of 'list'.
internal void
smi_append_delim_mark_one_past_last(Arena *arena, Action_List *list){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_DelimMarkOnePastLast;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

// Appends an ActionKind_Consume action to the end of 'list'.
internal void
smi_append_consume(Arena *arena, Action_List *list){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_Consume;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

// Appends an ActionKind_Emit action that refers to 'emit'.
internal void
smi_append_emit(Arena *arena, Action_List *list, Emit_Rule *emit){
    Action *node = push_array_zero(arena, Action, 1);
    node->kind = ActionKind_Emit;
    node->emit_rule = emit;
    zdll_push_back(list->first, list->last, node);
    list->count += 1;
}

////////////////////////////////

// Debug check that a pin list's count matches its actual length. Compiled
// out by default; flip the #if to enable it.
#if 0
internal void
CHECK_PIN_LIST(Field_Pin_List *list){
    i32 counter = 0;
    for (Field_Pin *pin = list->first; pin != 0; pin = pin->next){
        counter += 1;
    }
    Assert(counter == list->count);
}
#else
#define CHECK_PIN_LIST(x)
#endif

// Copies one pin (flag and value); the copy is not linked to anything.
internal Field_Pin*
smi_field_pin_copy(Arena *arena, Field_Pin *pin){
    Field_Pin *copy = push_array_zero(arena, Field_Pin, 1);
    copy->value = pin->value;
    copy->flag = pin->flag;
    return(copy);
}

// Deep copies a pin list: a new list header plus a copy of every pin, in
// the same order.
internal Field_Pin_List*
smi_field_pin_list_copy(Arena *arena, Field_Pin_List list){
    CHECK_PIN_LIST(&list);
    Field_Pin_List *copy = push_array_zero(arena, Field_Pin_List, 1);
    copy->count = list.count;
    Field_Pin *src = list.first;
    while (src != 0){
        Field_Pin *dst = smi_field_pin_copy(arena, src);
        sll_queue_push(copy->first, copy->last, dst);
        src = src->next;
    }
    CHECK_PIN_LIST(copy);
    return(copy);
}

// Deep copies a field set: every pin list is copied, in the same order.
internal Field_Set
smi_field_set_copy(Arena *arena, Field_Set set){
    Field_Set copy = {};
    copy.count = set.count;
    Field_Pin_List *src_list = set.first;
    while (src_list != 0){
        Field_Pin_List *dst_list = smi_field_pin_list_copy(arena, *src_list);
        sll_queue_push(copy.first, copy.last, dst_list);
        src_list = src_list->next;
    }
    return(copy);
}

// Worker for smi_field_pin_sub: appends to 'result' the pin lists that make
// up (a minus the union of the lists starting at 'list').
//
// Each recursion level picks one pin from the current list; once every list
// has contributed a pin (list == 0), growing_list holds that selection. The
// candidate output is a copy of 'a' with the negation of every selected pin
// added. A selected pin that equals a pin of 'a' (same flag, same value)
// makes the candidate contradictory, so it is dropped; a selected pin with
// the same flag but the opposite value is already implied by 'a' and is
// skipped.
//
// growing_list is passed by value, but its nodes are stack locals of the
// enclosing frames, so it is only valid for the duration of the call.
internal void
smi_field_pin_sub__recursive(Arena *arena, Field_Pin_List a, Field_Pin_List *list, Field_Pin_List growing_list, Field_Set *result){
    if (list != 0){
        growing_list.count += 1;
        Field_Pin_List *next_list = list->next;
        for (Field_Pin *pin = list->first; pin != 0; pin = pin->next){
            // Link a stack copy of the pin onto the selection for the
            // duration of the recursive call.
            Field_Pin local_pin = *pin;
            local_pin.next = 0;
            sll_queue_push(growing_list.first, growing_list.last, &local_pin);
            smi_field_pin_sub__recursive(arena, a, next_list, growing_list, result);
        }
    }
    else{
        b32 has_conflicts = false;
        // Lets the candidate be thrown away if it turns out contradictory.
        Temp_Memory restore_point = begin_temp(arena);
        Field_Pin_List *new_list = smi_field_pin_list_copy(arena, a);
        for (Field_Pin *pin = growing_list.first; pin != 0; pin = pin->next){
            b32 is_duplicate = false;
            for (Field_Pin *a_pin = new_list->first; a_pin != 0; a_pin = a_pin->next){
                if (pin->flag == a_pin->flag){
                    if (pin->value == a_pin->value){
                        end_temp(restore_point);
                        has_conflicts = true;
                        goto double_break;
                    }
                    is_duplicate = true;
                    break;
                }
            }
            if (!is_duplicate){
                Field_Pin
*new_pin = smi_field_pin_copy(arena, pin); new_pin->value = !new_pin->value; sll_queue_push(new_list->first, new_list->last, new_pin); new_list->count += 1; } } double_break:; if (!has_conflicts){ CHECK_PIN_LIST(new_list); sll_queue_push(result->first, result->last, new_list); result->count += 1; } } } internal Field_Set smi_field_pin_sub(Arena *arena, Field_Pin_List a, Field_Set b){ Field_Set result = {}; Field_Pin_List *list = b.first; Field_Pin_List growing_list = {}; smi_field_pin_sub__recursive(arena, a, list, growing_list, &result); return(result); } internal Field_Set smi_field_set_subtract(Arena *arena, Field_Set a, Field_Set b){ Field_Set result = {}; for (Field_Pin_List *list = a.first; list != 0; list = list->next){ Field_Set partial = smi_field_pin_sub(arena, *list, b); if (result.first == 0){ result = partial; } else{ if (partial.first != 0){ result.last->next = partial.first; result.last = partial.last; result.count += partial.count; } } } return(result); } internal Field_Set smi_field_set_intersect(Arena *arena, Field_Set a, Field_Set b){ Field_Set result = {}; for (Field_Pin_List *a_list = a.first; a_list != 0; a_list = a_list->next){ for (Field_Pin_List *b_list = b.first; b_list != 0; b_list = b_list->next){ b32 has_conflicts = false; Temp_Memory restore_point = begin_temp(arena); Field_Pin_List *new_list = smi_field_pin_list_copy(arena, *a_list); for (Field_Pin *b_pin = b_list->first; b_pin != 0; b_pin = b_pin->next){ b32 is_duplicate = false; for (Field_Pin *pin = new_list->first; pin != 0; pin = pin->next){ if (pin->flag == pin->flag){ if (pin->value != pin->value){ end_temp(restore_point); has_conflicts = true; goto double_break; } is_duplicate = true; break; } } if (!is_duplicate){ Field_Pin *new_pin = smi_field_pin_copy(arena, b_pin); sll_queue_push(new_list->first, new_list->last, new_pin); new_list->count += 1; } } double_break:; if (!has_conflicts){ sll_queue_push(result.first, result.last, new_list); result.count += 1; } } } 
return(result); } internal b32 smi_field_set_match(Arena *scratch, Field_Set a, Field_Set b){ Temp_Memory temp = begin_temp(scratch); b32 result = false; Field_Set sub = smi_field_set_subtract(scratch, a, b); if (sub.count == 0){ sub = smi_field_set_subtract(scratch, b, a); if (sub.count == 0){ result = true; } } end_temp(temp); return(result); } internal Field_Set smi_field_set_union(Arena *arena, Field_Set a, Field_Set b){ Field_Set result = {}; if (a.first != 0){ if (b.first != 0){ a = smi_field_set_copy(arena, a); // TODO(allen): simplify these lists by seeing if they union // cleanly with the lists in a! b = smi_field_set_copy(arena, b); result.first = a.first; a.last->next = b.first; result.last = b.last; result.count = a.count + b.count; } else{ result = smi_field_set_copy(arena, a); } } else{ if (b.first != 0){ result = smi_field_set_copy(arena, b); } } return(result); } internal Field_Set smi_field_set_construct(Arena *arena){ Field_Set result = {}; Field_Pin_List *list = push_array_zero(arena, Field_Pin_List, 1); sll_queue_push(result.first, result.last, list); result.count += 1; return(result); } internal Field_Set smi_field_set_construct(Arena *arena, Flag *flag, b32 value){ Field_Set result = {}; if (flag != 0){ Field_Pin_List *list = push_array_zero(arena, Field_Pin_List, 1); sll_queue_push(result.first, result.last, list); result.count += 1; Field_Pin *pin = push_array_zero(arena, Field_Pin, 1); sll_queue_push(list->first, list->last, pin); list->count += 1; pin->flag = flag; pin->value = value; } else{ result = smi_field_set_construct(arena); } return(result); } internal Input_Set smi_input_set_copy(Arena *arena, Input_Set set){ Input_Set result = {}; result.inputs = push_array_write(arena, u16, set.count, set.inputs); result.count = set.count; return(result); } internal Input_Set smi_input_set_subtract(Arena *arena, Input_Set a, Input_Set b){ Input_Set result = {}; if (a.count > 0){ Temp_Memory restore_point = begin_temp(arena); result = 
smi_input_set_copy(arena, a); for (i32 i = 0; i < result.count; i += 1){ b32 is_subtracted = false; for (i32 j = 0; j < b.count; j += 1){ if (result.inputs[i] == b.inputs[j]){ is_subtracted = true; break; } } if (is_subtracted){ result.count -= 1; result.inputs[i] = result.inputs[result.count]; i -= 1; } } if (a.count == 0){ end_temp(restore_point); block_zero_struct(&result); } } return(result); } internal Input_Set smi_input_set_intersect(Arena *arena, Input_Set a, Input_Set b){ Input_Set result = {}; if (a.count > 0 && b.count > 0){ Temp_Memory restore_point = begin_temp(arena); result = smi_input_set_copy(arena, a); for (i32 i = 0; i < result.count; i += 1){ b32 is_shared = false; for (i32 j = 0; j < b.count; j += 1){ if (result.inputs[i] == b.inputs[j]){ is_shared = true; break; } } if (!is_shared){ result.count -= 1; result.inputs[i] = result.inputs[result.count]; i -= 1; } } if (result.count == 0){ end_temp(restore_point); block_zero_struct(&result); } } return(result); } internal Input_Set smi_input_set_union(Arena *arena, Input_Set a, Input_Set b){ Input_Set result = {}; if (a.count > 0 || b.count > 0){ result.inputs = push_array_zero(arena, u16, a.count + b.count); block_copy_dynamic_array(result.inputs, a.inputs, a.count); result.count = a.count; for (i32 i = 0; i < b.count; i += 1){ b32 is_duplicate = false; for (i32 j = 0; j < result.count; j += 1){ if (result.inputs[j] == b.inputs[i]){ is_duplicate = true; break; } } if (!is_duplicate){ result.inputs[result.count] = b.inputs[i]; result.count += 1; } } } return(result); } internal Input_Set smi_input_set_construct(Arena *arena, String_Const_u8 characters){ Input_Set result = {}; result.count = (i32)characters.size; result.inputs = push_array_zero(arena, u16, result.count); for (u64 i = 0; i < characters.size; i += 1){ result.inputs[i] = (u16)characters.str[i]; } return(result); } internal Input_Set smi_input_set_construct_eof(Arena *arena){ Input_Set result = {}; result.count = 1; result.inputs = 
push_array_zero(arena, u16, result.count);
    result.inputs[0] = smi_eof;
    return(result);
}

// Builds the input set holding every value from 0 to 256 inclusive, i.e.
// all byte values plus smi_eof.
internal Input_Set
smi_input_set_construct_fallback(Arena *arena){
    Input_Set result = {};
    result.count = 257;
    result.inputs = push_array_zero(arena, u16, result.count);
    for (u16 i = 0; i < 257; i += 1){
        result.inputs[i] = i;
    }
    return(result);
}

// Deep copies one condition node (fields and inputs). The copy's 'next' is
// left null.
internal Condition_Node*
smi_condition_node_copy(Arena *arena, Condition_Node *node){
    Condition_Node *result = push_array_zero(arena, Condition_Node, 1);
    result->fields = smi_field_set_copy(arena, node->fields);
    result->inputs = smi_input_set_copy(arena, node->inputs);
    return(result);
}

// Deep copies a condition set, preserving node order.
internal Condition_Set
smi_condition_set_copy(Arena *arena, Condition_Set set){
    Condition_Set result = {};
    for (Condition_Node *node = set.first; node != 0; node = node->next){
        Condition_Node *new_node = smi_condition_node_copy(arena, node);
        sll_queue_push(result.first, result.last, new_node);
        result.count += 1;
    }
    return(result);
}

// Returns (a minus b) for two single condition nodes, as a set of zero, one
// or two nodes. A node stands for (inputs x fields), so the cases are:
//  - every input of a is in b: only (a.inputs x (a.fields - b.fields))
//    remains, if that field difference is non-empty;
//  - no input of a is in b: a is untouched;
//  - partial overlap: the inputs outside b keep all of a.fields, and the
//    inputs shared with b keep (a.fields - b.fields) if it is non-empty.
internal Condition_Set
smi_condition_node_sub(Arena *arena, Condition_Node a, Condition_Node b){
    Condition_Set result = {};
    Input_Set a_minus_b_input = smi_input_set_subtract(arena, a.inputs, b.inputs);
    if (a_minus_b_input.count == 0){
        // b covers all of a's inputs.
        Field_Set a_minus_b_fields = smi_field_set_subtract(arena, a.fields, b.fields);
        if (a_minus_b_fields.count > 0){
            Condition_Node *new_node = push_array_zero(arena, Condition_Node, 1);
            new_node->inputs = smi_input_set_copy(arena, a.inputs);
            new_node->fields = a_minus_b_fields;
            sll_queue_push(result.first, result.last, new_node);
            result.count += 1;
        }
    }
    else{
        if (a_minus_b_input.count == a.inputs.count){
            // Nothing was removed: the inputs do not overlap at all.
            Condition_Node *new_node = push_array_zero(arena, Condition_Node, 1);
            new_node->inputs = a_minus_b_input;
            new_node->fields = smi_field_set_copy(arena, a.fields);
            sll_queue_push(result.first, result.last, new_node);
            result.count += 1;
        }
        else{
            // Partial overlap of the inputs.
            Field_Set a_minus_b_fields = smi_field_set_subtract(arena, a.fields, b.fields);
            if (a_minus_b_fields.count == 0){
                // The shared inputs are removed entirely.
                Condition_Node *new_node = push_array_zero(arena, Condition_Node, 1);
                new_node->inputs = a_minus_b_input;
                new_node->fields = smi_field_set_copy(arena, a.fields);
                sll_queue_push(result.first, result.last, new_node);
                result.count += 1;
            }
            else{
                Input_Set a_int_b_input = smi_input_set_intersect(arena, a.inputs, b.inputs);
                
                // Shared inputs keep only the fields b does not cover.
                Condition_Node *node_1 = push_array_zero(arena, Condition_Node, 1);
                node_1->inputs = a_int_b_input;
                node_1->fields = a_minus_b_fields;
                sll_queue_push(result.first, result.last, node_1);
                
                // Inputs outside b keep all of a's fields.
                Condition_Node *node_2 = push_array_zero(arena, Condition_Node, 1);
                node_2->inputs = a_minus_b_input;
                node_2->fields = smi_field_set_copy(arena, a.fields);
                sll_queue_push(result.first, result.last, node_2);
                
                result.count += 2;
            }
        }
    }
    return(result);
}

// Returns a node holding the intersection of the inputs and of the fields
// of 'a' and 'b'. The node is always allocated; either part may be empty
// (count == 0), which callers must check.
internal Condition_Node*
smi_condition_node_int(Arena *arena, Condition_Node a, Condition_Node b){
    Condition_Node *result = push_array_zero(arena, Condition_Node, 1);
    result->inputs = smi_input_set_intersect(arena, a.inputs, b.inputs);
    result->fields = smi_field_set_intersect(arena, a.fields, b.fields);
    return(result);
}

// Returns (a minus b) where 'b' is a single node: 'b' is subtracted from
// every node of 'a' and the partial results are chained together.
internal Condition_Set
smi_condition_set_subtract_node(Arena *arena, Condition_Set a, Condition_Node *b){
    Condition_Set result = {};
    for (Condition_Node *node = a.first; node != 0; node = node->next){
        Condition_Set partial = smi_condition_node_sub(arena, *node, *b);
        if (result.first == 0){
            result = partial;
        }
        else{
            if (partial.first != 0){
                // Splice the partial result onto the end of the output.
                result.last->next = partial.first;
                result.last = partial.last;
                result.count += partial.count;
            }
        }
    }
    return(result);
}

// Returns (a minus b) by subtracting the nodes of 'b' one at a time.
// The result starts out as 'a' itself, so with an empty 'b' the returned
// set shares its nodes with 'a' (no copy is made).
internal Condition_Set
smi_condition_set_subtract(Arena *arena, Condition_Set a, Condition_Set b){
    Condition_Set result = a;
    for (Condition_Node *node = b.first; node != 0; node = node->next){
        result = smi_condition_set_subtract_node(arena, result, node);
    }
    return(result);
}

// Returns the intersection of 'a' and 'b': the pairwise node intersections
// that are non-empty in both their inputs and their fields.
internal Condition_Set
smi_condition_set_intersect(Arena *arena, Condition_Set a, Condition_Set b){
    Condition_Set result = {};
    for (Condition_Node *a_node = a.first; a_node != 0; a_node = a_node->next){
        for (Condition_Node *b_node = b.first; b_node != 0; b_node = b_node->next){
            Condition_Node *node = smi_condition_node_int(arena, *a_node, *b_node);
            if (node->inputs.count > 0 && node->fields.count > 0){
                sll_queue_push(result.first, result.last, node);
                result.count += 1;
            }
        }
    }
    return(result);
}

// Returns the union of 'a' and 'b' as fresh copies: the nodes of 'a'
// followed by the nodes of 'b'. No simplification is attempted.
internal Condition_Set
smi_condition_set_union(Arena *arena, Condition_Set a, Condition_Set b){
    Condition_Set result = {};
    if (a.count != 0){
        if (b.count != 0){
            a = smi_condition_set_copy(arena, a);
            // TODO(allen): simplify these down!
            b = smi_condition_set_copy(arena, b);
            result.first = a.first;
            a.last->next = b.first;
            result.last = b.last;
            result.count = a.count + b.count;
        }
        else{
            result = smi_condition_set_copy(arena, a);
        }
    }
    else{
        if (b.count != 0){
            result = smi_condition_set_copy(arena, b);
        }
    }
    return(result);
}

// Allocates a condition node that takes over 'inputs' and 'fields' as given
// (they are not copied).
internal Condition_Node*
smi_condition_node(Arena *arena, Input_Set inputs, Field_Set fields){
    Condition_Node *node = push_array_zero(arena, Condition_Node, 1);
    node->fields = fields;
    node->inputs = inputs;
    return(node);
}

// Builds a condition set made of a single node (inputs x fields).
internal Condition_Set
smi_condition(Arena *arena, Input_Set inputs, Field_Set fields){
    Condition_Set result = {};
    Condition_Node *node = smi_condition_node(arena, inputs, fields);
    sll_queue_push(result.first, result.last, node);
    result.count += 1;
    return(result);
}

////////////////////////////////

// Adds a transition from 'state' to 'dst'.
//
// The "intermediate" kinds (character array, EOF, fallback) are turned into
// a TransitionCaseKind_ConditionSet whose single node pairs the matching
// input set with the field set built from flag_check / flag_check_value;
// any other kind is stored unchanged with no condition set. A consume
// action is appended when consume_rule is Transition_Consume, followed by
// an emit action when 'emit' is not null. The transition is appended to the
// end of the state's transition list and returned.
internal Transition*
smi_case(Lexer_Primary_Context *ctx, State *state, Transition_Case_Kind kind, String_Const_u8 characters, Flag *flag_check, b32 flag_check_value, State *dst, Transition_Consume_Rule consume_rule, Emit_Rule *emit){
    Transition *transition = push_array_zero(&ctx->arena, Transition, 1);
    transition->parent_state = state;
    
    switch (kind){
        default:
        {
            transition->condition.kind = kind;
        }break;
        
        case TransitionCaseKind_CharaterArray:
        {
            transition->condition.kind = TransitionCaseKind_ConditionSet;
            Input_Set inputs = smi_input_set_construct(&ctx->arena, characters);
            Field_Set fields = smi_field_set_construct(&ctx->arena, flag_check,
flag_check_value); transition->condition.condition_set = smi_condition(&ctx->arena, inputs, fields); }break; case TransitionCaseKind_EOF: { transition->condition.kind = TransitionCaseKind_ConditionSet; Input_Set inputs = smi_input_set_construct_eof(&ctx->arena); Field_Set fields = smi_field_set_construct(&ctx->arena, flag_check, flag_check_value); transition->condition.condition_set = smi_condition(&ctx->arena, inputs, fields); }break; case TransitionCaseKind_Fallback: { transition->condition.kind = TransitionCaseKind_ConditionSet; Input_Set inputs = smi_input_set_construct_fallback(&ctx->arena); Field_Set fields = smi_field_set_construct(&ctx->arena, flag_check, flag_check_value); transition->condition.condition_set = smi_condition(&ctx->arena, inputs, fields); }break; } transition->dst_state = dst; if (consume_rule == Transition_Consume){ smi_append_consume(&ctx->arena, &transition->activation_actions); } if (emit != 0){ smi_append_emit(&ctx->arena, &transition->activation_actions, emit); } zdll_push_back(state->transitions.first, state->transitions.last, transition); state->transitions.count += 1; return(transition); } //////////////////////////////// // NOTE(allen): CONSTRUCTORS global Lexer_Helper_Context helper_ctx = {}; internal void sm_helper_init(Base_Allocator *allocator){ smi_primary_init(allocator, &helper_ctx.primary_ctx); helper_ctx.char_to_name = make_table_u64_Data(allocator, 100); helper_ctx.arena = &helper_ctx.primary_ctx.arena; } internal void sm_char_name(u8 c, char *str){ Table_Lookup lookup = table_lookup(&helper_ctx.char_to_name, c); if (lookup.found_match){ table_erase(&helper_ctx.char_to_name, lookup); } String_Const_u8 string = push_string_copy(helper_ctx.arena, SCu8(str)); table_insert(&helper_ctx.char_to_name, c, make_data(string.str, string.size)); } internal void sm_select_base_kind(Token_Base_Kind kind){ helper_ctx.selected_base_kind = kind; } internal void sm_select_state(State *state){ helper_ctx.selected_state = state; } internal 
void sm_select_op_set(Operator_Set *set){
    // Sets the operator set that sm_op adds to.
    helper_ctx.selected_op_set = set;
}

// Sets the keyword set that sm_key / sm_key_fallback add to.
internal void
sm_select_key_set(Keyword_Set *set){
    helper_ctx.selected_key_set = set;
}

// Sets the emit rule that sm_emit_handler_* / sm_emit_check_set_flag extend.
internal void
sm_select_emit(Emit_Rule *emit){
    helper_ctx.selected_emit_rule = emit;
}

// Sets the transition that sm_on_transition_set_flag extends.
internal void
sm_select_transition(Transition *transition){
    helper_ctx.selected_transition = transition;
}

// Tries to add a token kind named /str/ with the currently selected base kind.
// Returns the result of smi_try_add_token.
internal b32
sm_direct_token_kind(char *str){
    return(smi_try_add_token(&helper_ctx.primary_ctx, SCu8(str), helper_ctx.selected_base_kind));
}

// Allocates an empty operator set on the helper arena, selects it, and
// returns it.
internal Operator_Set*
sm_begin_op_set(void){
    Operator_Set *set = push_array_zero(helper_ctx.arena, Operator_Set, 1);
    set->lexeme_to_ptr = make_table_Data_u64(helper_ctx.primary_ctx.allocator, 100);
    helper_ctx.selected_op_set = set;
    return(set);
}

// Adds an operator (lexeme, token name) to the selected operator set.
// Returns false, and adds nothing, when the lexeme is already in the set or
// when smi_try_add_token rejects /name/. Both strings are copied into the
// helper arena; the lexeme table keys point at the copy.
internal b32
sm_op(String_Const_u8 lexeme, String_Const_u8 name){
    b32 result = false;
    Operator_Set *set = helper_ctx.selected_op_set;
    Table_Lookup lookup = table_lookup(&set->lexeme_to_ptr, make_data(lexeme.str, lexeme.size));
    if (!lookup.found_match){
        if (smi_try_add_token(&helper_ctx.primary_ctx, name, helper_ctx.selected_base_kind)){
            Operator *op = push_array_zero(helper_ctx.arena, Operator, 1);
            op->name = push_string_copy(helper_ctx.arena, name);
            op->op = push_string_copy(helper_ctx.arena, lexeme);
            table_insert(&set->lexeme_to_ptr, make_data(op->op.str, op->op.size), (u64)PtrAsInt(op));
            sll_queue_push(set->first, set->last, op);
            set->count += 1;
            result = true;
        }
    }
    return(result);
}

// C-string convenience overload of sm_op(lexeme, name).
internal b32
sm_op(char *lexeme, char *name){
    return(sm_op(SCu8(lexeme), SCu8(name)));
}

// Adds an operator whose token name is derived from its lexeme: the names
// registered with sm_char_name for each character are concatenated in order.
// Every character of the lexeme must have been named beforehand.
internal b32
sm_op(char *lexeme){
    String_Const_u8 l = SCu8(lexeme);
    List_String_Const_u8 name_list = {};
    for (u64 i = 0; i < l.size; i += 1){
        Table_Lookup lookup = table_lookup(&helper_ctx.char_to_name, l.str[i]);
        // If this fails first check that all the characters in the lexeme are named!
        Assert(lookup.found_match);
        String_Const_u8 name_data = {};
        table_read(&helper_ctx.char_to_name, lookup, &name_data);
        string_list_push(helper_ctx.arena, &name_list, SCu8(name_data.str, name_data.size));
    }
    String_Const_u8 name = string_list_flatten(helper_ctx.arena, name_list);
    return(sm_op(l, name));
}

// Allocates an empty keyword set, appends it to the primary context's keyword
// set list, selects it, and returns it. /pretty_name/ is copied into the
// helper arena.
internal Keyword_Set*
sm_begin_key_set(String_Const_u8 pretty_name){
    Keyword_Set *set = push_array_zero(helper_ctx.arena, Keyword_Set, 1);
    set->name_to_ptr = make_table_Data_u64(helper_ctx.primary_ctx.allocator, 100);
    set->lexeme_to_ptr = make_table_Data_u64(helper_ctx.primary_ctx.allocator, 100);
    set->pretty_name = push_string_copy(helper_ctx.arena, pretty_name);
    sll_queue_push(helper_ctx.primary_ctx.keywords.first,
                   helper_ctx.primary_ctx.keywords.last,
                   set);
    helper_ctx.primary_ctx.keywords.count += 1;
    helper_ctx.selected_key_set = set;
    return(set);
}

// C-string convenience overload of sm_begin_key_set.
internal Keyword_Set*
sm_begin_key_set(char *pretty_name){
    return(sm_begin_key_set(SCu8(pretty_name)));
}

// Adds a keyword (token name, lexeme) to the selected keyword set using the
// selected base kind. Returns the result of smi_key.
internal b32
sm_key(String_Const_u8 name, String_Const_u8 lexeme){
    return(smi_key(&helper_ctx.primary_ctx, helper_ctx.selected_key_set, name, lexeme, helper_ctx.selected_base_kind));
}

// C-string convenience overload of sm_key(name, lexeme).
internal b32
sm_key(char *str, char *lexeme){
    return(sm_key(SCu8(str), SCu8(lexeme)));
}

// Adds a keyword whose lexeme is derived from its name: the name is copied
// into the helper arena and the copy is passed through string_mod_lower.
internal b32
sm_key(char *str){
    String_Const_u8 name = SCu8(str);
    String_Const_u8 lexeme = push_string_copy(helper_ctx.arena, name);
    lexeme = string_mod_lower(lexeme);
    return(sm_key(name, lexeme));
}

// Registers /name/ as the fallback token of the selected keyword set using
// the selected base kind. Returns the result of smi_key_fallback.
internal b32
sm_key_fallback(String_Const_u8 name){
    return(smi_key_fallback(&helper_ctx.primary_ctx, helper_ctx.selected_key_set, name, helper_ctx.selected_base_kind));
}

// C-string convenience overload of sm_key_fallback.
internal b32
sm_key_fallback(char *str){
    return(sm_key_fallback(SCu8(str)));
}

// Adds a state named /pretty_name/ to the primary context and returns it.
internal State*
sm_add_state(String_Const_u8 pretty_name){
    return(smi_add_state(&helper_ctx.primary_ctx, pretty_name));
}

// C-string convenience overload of sm_add_state.
internal State*
sm_add_state(char *pretty_name){
    return(smi_add_state(&helper_ctx.primary_ctx, SCu8(pretty_name)));
}

// Adds the state named "root" and installs it as the model root. The model
// must not already have a root.
internal State*
sm_begin_state_machine(void){
    State *state = sm_add_state("root");
    // If this fails first check sm_begin_state_machine is only called once
    Assert(helper_ctx.primary_ctx.model.root == 0);
    helper_ctx.primary_ctx.model.root = state;
    return(state);
}

// Adds a flag with the given reset rule to the primary context.
internal Flag*
sm_add_flag(Flag_Reset_Rule rule){
    return(smi_add_flag(&helper_ctx.primary_ctx, rule));
}

// Stores the token base flags bound to /flag/.
// The parameter is typed Token_Base_Flag to match Flag::emit_flags; it was
// previously declared as Token_Base_Kind, which is a different typedef than
// the field it is stored into.
internal void
sm_flag_bind(Flag *flag, Token_Base_Flag emit_flags){
    flag->emit_flags = emit_flags;
}

// Stores the token sub flags bound to /flag/.
internal void
sm_sub_flag_bind(Flag *flag, u16 emit_sub_flags){
    flag->emit_sub_flags = emit_sub_flags;
}

// Creates a new emit rule on the helper arena, selects it, and returns it.
internal Emit_Rule*
sm_emit_rule(void){
    Emit_Rule *rule = smi_emit_rule(helper_ctx.arena);
    helper_ctx.selected_emit_rule = rule;
    return(rule);
}

// Adds a direct handler for token /name/ to the selected emit rule;
// /flag_check/ is forwarded to smi_emit_handler (0 means no flag).
internal void
sm_emit_handler_direct(Flag *flag_check, String_Const_u8 name){
    Emit_Rule *rule = helper_ctx.selected_emit_rule;
    smi_emit_handler(helper_ctx.arena, rule, name, flag_check);
}

// Direct handler with no flag check.
internal void
sm_emit_handler_direct(char *name){
    sm_emit_handler_direct(0, SCu8(name));
}

// C-string convenience overload of sm_emit_handler_direct(flag_check, name).
internal void
sm_emit_handler_direct(Flag *flag_check, char *name){
    sm_emit_handler_direct(flag_check, SCu8(name));
}

// Adds a keyword-set handler to the selected emit rule.
internal void
sm_emit_handler_keys(Flag *flag_check, Keyword_Set *set){
    Emit_Rule *rule = helper_ctx.selected_emit_rule;
    smi_emit_handler(helper_ctx.arena, rule, set, flag_check);
}

// Keyword-set handler with no flag check.
internal void
sm_emit_handler_keys(Keyword_Set *set){
    sm_emit_handler_keys(0, set);
}

// Adds a delimited keyword-set handler (smi_emit_handler_delim) to the
// selected emit rule.
internal void
sm_emit_handler_keys_delim(Flag *flag_check, Keyword_Set *set){
    Emit_Rule *rule = helper_ctx.selected_emit_rule;
    smi_emit_handler_delim(helper_ctx.arena, rule, set, flag_check);
}

// Delimited keyword-set handler with no flag check.
internal void
sm_emit_handler_keys_delim(Keyword_Set *set){
    sm_emit_handler_keys_delim(0, set);
}

// Adds a character-array transition to the selected state and selects the
// new transition. See smi_case for the meaning of the arguments.
internal Transition*
sm_case(String_Const_u8 str, Flag *flag_check, b32 flag_check_value, State *dst, Transition_Consume_Rule consume_rule, Emit_Rule *emit){
    Transition *transition = smi_case(&helper_ctx.primary_ctx, helper_ctx.selected_state,
                                      TransitionCaseKind_CharaterArray, str,
                                      flag_check, flag_check_value,
                                      dst, consume_rule, emit);
    helper_ctx.selected_transition = transition;
    return(transition);
}

// Adds a transition of any kind other than CharaterArray to the selected
// state (an empty character string is passed along) and selects it.
internal Transition*
sm_case(Transition_Case_Kind kind, Flag *flag_check, b32 flag_check_value, State *dst, Transition_Consume_Rule consume_rule, Emit_Rule *emit){
    Assert(kind != TransitionCaseKind_CharaterArray);
    String_Const_u8 str = {};
    Transition *transition = smi_case(&helper_ctx.primary_ctx, helper_ctx.selected_state,
                                      kind, str,
                                      flag_check, flag_check_value,
                                      dst, consume_rule, emit);
    helper_ctx.selected_transition = transition;
    return(transition);
}

// Character-array wrappers that go to an explicit destination state.
//   *_peek    -> Transition_NoConsume (otherwise Transition_Consume)
//   *_flagged -> forwards (flag_check, flag_check_value); otherwise (0, 0)
// No emit rule is attached.

internal Transition*
sm_case(char *str, State *dst){
    return(sm_case(SCu8(str), 0, 0, dst, Transition_Consume, 0));
}

internal Transition*
sm_case(u8 *str, State *dst){
    return(sm_case(SCu8(str), 0, 0, dst, Transition_Consume, 0));
}

internal Transition*
sm_case_peek(char *str, State *dst){
    return(sm_case(SCu8(str), 0, 0, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_case_peek(u8 *str, State *dst){
    return(sm_case(SCu8(str), 0, 0, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_case_flagged(Flag *flag_check, b32 flag_check_value, char *str, State *dst){
    return(sm_case(SCu8(str), flag_check, flag_check_value, dst, Transition_Consume, 0));
}

internal Transition*
sm_case_flagged(Flag *flag_check, b32 flag_check_value, u8 *str, State *dst){
    return(sm_case(SCu8(str), flag_check, flag_check_value, dst, Transition_Consume, 0));
}

internal Transition*
sm_case_peek_flagged(Flag *flag_check, b32 flag_check_value, char *str, State *dst){
    return(sm_case(SCu8(str), flag_check, flag_check_value, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_case_peek_flagged(Flag *flag_check, b32 flag_check_value, u8 *str, State *dst){
    return(sm_case(SCu8(str), flag_check, flag_check_value, dst, Transition_NoConsume, 0));
}

// Character-array wrappers that emit: the destination is always the model
// root and /emit/ is attached to the transition.

internal Transition*
sm_case(char *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), 0, 0, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case(u8 *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), 0, 0, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case_peek(char *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), 0, 0, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

internal Transition*
sm_case_peek(u8 *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), 0, 0, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

internal Transition*
sm_case_flagged(Flag *flag_check, b32 flag_check_value, char *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case_flagged(Flag *flag_check, b32 flag_check_value, u8 *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case_peek_flagged(Flag *flag_check, b32 flag_check_value, char *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

internal Transition*
sm_case_peek_flagged(Flag *flag_check, b32 flag_check_value, u8 *str, Emit_Rule *emit){
    return(sm_case(SCu8(str), flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

// End-of-file wrappers; same naming scheme as the character-array wrappers.

internal Transition*
sm_case_eof(State *dst){
    return(sm_case(TransitionCaseKind_EOF, 0, 0, dst, Transition_Consume, 0));
}

internal Transition*
sm_case_eof_peek(State *dst){
    return(sm_case(TransitionCaseKind_EOF, 0, 0, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_case_eof_flagged(Flag *flag_check, b32 flag_check_value, State *dst){
    return(sm_case(TransitionCaseKind_EOF, flag_check, flag_check_value, dst, Transition_Consume, 0));
}

internal Transition*
sm_case_eof_peek_flagged(Flag *flag_check, b32 flag_check_value, State *dst){
    return(sm_case(TransitionCaseKind_EOF, flag_check, flag_check_value, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_case_eof(Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_EOF, 0, 0, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case_eof_peek(Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_EOF, 0, 0, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

internal Transition*
sm_case_eof_flagged(Flag *flag_check, b32 flag_check_value, Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_EOF, flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_case_eof_peek_flagged(Flag *flag_check, b32 flag_check_value, Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_EOF, flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

// Fallback wrappers; same naming scheme as the character-array wrappers.

internal Transition*
sm_fallback(State *dst){
    return(sm_case(TransitionCaseKind_Fallback, 0, 0, dst, Transition_Consume, 0));
}

internal Transition*
sm_fallback_peek(State *dst){
    return(sm_case(TransitionCaseKind_Fallback, 0, 0, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_fallback_flagged(Flag *flag_check, b32 flag_check_value, State *dst){
    return(sm_case(TransitionCaseKind_Fallback, flag_check, flag_check_value, dst, Transition_Consume, 0));
}

internal Transition*
sm_fallback_peek_flagged(Flag *flag_check, b32 flag_check_value, State *dst){
    return(sm_case(TransitionCaseKind_Fallback, flag_check, flag_check_value, dst, Transition_NoConsume, 0));
}

internal Transition*
sm_fallback(Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_Fallback, 0, 0, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_fallback_peek(Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_Fallback, 0, 0, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

internal Transition*
sm_fallback_flagged(Flag *flag_check, b32 flag_check_value, Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_Fallback, flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_Consume, emit));
}

internal Transition*
sm_fallback_peek_flagged(Flag *flag_check, b32 flag_check_value, Emit_Rule *emit){
    return(sm_case(TransitionCaseKind_Fallback, flag_check, flag_check_value, helper_ctx.primary_ctx.model.root, Transition_NoConsume, emit));
}

// Adds the pair of non-consuming delimiter transitions to the selected state:
// DelimMatch goes to /dst/, DelimMatchFail goes to /fail_dst/.
internal void
sm_match_delim(State *dst, State *fail_dst){
    sm_case(TransitionCaseKind_DelimMatch, 0, 0, dst, Transition_NoConsume, 0);
    sm_case(TransitionCaseKind_DelimMatchFail, 0, 0, fail_dst, Transition_NoConsume, 0);
}

// Appends a set-flag action to the selected transition's activation actions.
internal void
sm_on_transition_set_flag(Flag *flag, b32 value){
    Transition *transition = helper_ctx.selected_transition;
    smi_append_set_flag(helper_ctx.arena, &transition->activation_actions, flag, value);
}

// Appends an emit check (emit_check string, flag, value) to the selected emit
// rule. The string is copied into the helper arena.
internal void
sm_emit_check_set_flag(String_Const_u8 emit_check, Flag *flag, b32 value){
    Emit_Rule *rule = helper_ctx.selected_emit_rule;
    Emit_Check *new_check = push_array_zero(helper_ctx.arena, Emit_Check, 1);
    sll_queue_push(rule->emit_checks.first, rule->emit_checks.last, new_check);
    rule->emit_checks.count += 1;
    new_check->emit_check = push_string_copy(helper_ctx.arena, emit_check);
    new_check->flag = flag;
    new_check->value = value;
}

// C-string convenience overload of sm_emit_check_set_flag.
internal void
sm_emit_check_set_flag(char *emit_check, Flag *flag, b32 value){
    sm_emit_check_set_flag(SCu8(emit_check), flag, value);
}

// Appends a set-flag action to the selected state's on-entry actions.
internal void
sm_set_flag(Flag *flag, b32 value){
    State *state = helper_ctx.selected_state;
    smi_append_set_flag(helper_ctx.arena, &state->on_entry_actions, flag, value);
}

// Appends a delim-mark-first action to the selected state's on-entry actions.
internal void
sm_delim_mark_first(void){
    State *state = helper_ctx.selected_state;
    smi_append_delim_mark_first(helper_ctx.arena, &state->on_entry_actions);
}

// Appends a delim-mark-one-past-last action to the selected state's on-entry
// actions.
internal void
sm_delim_mark_one_past_last(void){
    State *state = helper_ctx.selected_state;
    smi_append_delim_mark_one_past_last(helper_ctx.arena, &state->on_entry_actions);
}

////////////////////////////////

// NOTE(allen): OPERATORS FOR COMPOSING MODEL COMPONENTS AS EXPRESSIONS

// Returns a new operator set with one new node per operator of /set/, in the
// same order. The name and lexeme strings are shared with the source nodes
// (only the String_Const_u8 values are copied, not the bytes).
internal Operator_Set*
smo_copy_op_set(Operator_Set *set){
    Operator_Set *new_set = push_array_zero(helper_ctx.arena, Operator_Set, 1);
    new_set->lexeme_to_ptr = make_table_Data_u64(helper_ctx.primary_ctx.allocator, set->count*2);
    for (Operator *node = set->first; node != 0; node = node->next){
        Operator *new_node = push_array_zero(helper_ctx.arena, Operator, 1);
        sll_queue_push(new_set->first, new_set->last, new_node);
        new_set->count += 1;
        new_node->name = node->name;
        new_node->op = node->op;
        table_insert(&new_set->lexeme_to_ptr, make_data(new_node->op.str, new_node->op.size), (u64)PtrAsInt(new_node));
    }
    return(new_set);
}

// Removes, in place, every operator whose lexeme starts with /prefix/. The
// surviving nodes keep their relative order.
internal void
smo_remove_ops_with_prefix(Operator_Set *set, String_Const_u8 prefix){
    Operator *first = 0;
    Operator *last = 0;
    i32 count = 0;
    
    // /next/ is captured before relinking because sll_queue_push rewrites it.
    for (Operator *node = set->first, *next = 0; node != 0; node = next){
        next = node->next;
        if (string_match(prefix, string_prefix(node->op, prefix.size))){
            table_erase(&set->lexeme_to_ptr, make_data(node->op.str, node->op.size));
        }
        else{
            sll_queue_push(first, last, node);
            count += 1;
        }
    }
    
    set->first = first;
    set->last = last;
    set->count = count;
}

// C-string convenience overload of smo_remove_ops_with_prefix.
internal void
smo_remove_ops_with_prefix(Operator_Set *set, char *prefix){
    smo_remove_ops_with_prefix(set, SCu8(prefix));
}

// Removes, in place, every operator whose lexeme does NOT start with
// /prefix/. The surviving nodes keep their relative order.
internal void
smo_remove_ops_without_prefix(Operator_Set *set, String_Const_u8 prefix){
    Operator *first = 0;
    Operator *last = 0;
    i32 count = 0;
    
    for (Operator *node = set->first, *next = 0; node != 0; node = next){
        next = node->next;
        if (!string_match(prefix, string_prefix(node->op, prefix.size))){
            table_erase(&set->lexeme_to_ptr, make_data(node->op.str, node->op.size));
        }
        else{
            sll_queue_push(first, last, node);
            count += 1;
        }
    }
    
    set->first = first;
    set->last = last;
    set->count = count;
}

// C-string convenience overload of smo_remove_ops_without_prefix.
internal void
smo_remove_ops_without_prefix(Operator_Set *set, char *prefix){
    smo_remove_ops_without_prefix(set, SCu8(prefix));
}

// Drops the first /size/ bytes of every lexeme in /set/, in place. Operators
// whose lexeme is not longer than /size/ are removed, and so are operators
// whose shortened lexeme duplicates one already kept (table_insert fails).
// The old lexeme table is freed and replaced.
internal void
smo_ops_string_skip(Operator_Set *set, u64 size){
    Operator_Set new_set = {};
    new_set.lexeme_to_ptr = make_table_Data_u64(helper_ctx.primary_ctx.allocator, set->count*2);
    
    for (Operator *node = set->first, *next = 0; node != 0; node = next){
        next = node->next;
        if (node->op.size > size){
            String_Const_u8 new_op = string_skip(node->op, size);
            if (table_insert(&new_set.lexeme_to_ptr, make_data(new_op.str, new_op.size), (u64)PtrAsInt(node))){
                node->op = new_op;
                sll_queue_push(new_set.first, new_set.last, node);
                new_set.count += 1;
            }
        }
    }
    
    table_free(&set->lexeme_to_ptr);
    *set = new_set;
}

// Allocates an empty character set on the helper arena.
internal Character_Set*
smo_new_char_set(void){
    Character_Set *set = push_array_zero(helper_ctx.arena, Character_Set, 1);
    set->table = make_table_u64_u64(helper_ctx.primary_ctx.allocator, 100);
    return(set);
}

// Adds the first byte of every operator lexeme in /ops/ to /chars/.
// NOTE(review): reads lexeme.str[0] unconditionally, so every lexeme is
// expected to be non-empty.
internal void
smo_char_set_union_ops_firsts(Character_Set *chars, Operator_Set *ops){
    for (Operator *node = ops->first; node != 0; node = node->next){
        String_Const_u8 lexeme = node->op;
        u64 c = lexeme.str[0];
        table_insert(&chars->table, c, c);
    }
}

// Removes every character of the null terminated string /str/ from /set/.
internal void
smo_char_set_remove(Character_Set *set, char *str){
    for (char *ptr = str; *ptr != 0; ptr += 1){
        // Keys are inserted from u8 data (see smo_char_set_union_ops_firsts),
        // so the byte has to go through u8 first. Casting a plain char
        // straight to u64 sign-extends bytes >= 0x80 where char is signed,
        // and the resulting key would never match.
        table_erase(&set->table, (u64)(u8)(*ptr));
    }
}

// Returns the members of /set/ as a null terminated char array allocated on
// the helper arena. Characters appear in table slot order.
internal char*
smo_char_set_get_array(Character_Set *set){
    u32 count = set->table.used_count;
    char *result = push_array_zero(helper_ctx.arena, char, count + 1);
    u32 index = 0;
    u32 slot_count = set->table.slot_count;
    for (u32 i = 0; i < slot_count; i += 1){
        u64 c = set->table.keys[i];
        // Skip slots that never held a key or whose key was erased.
        if (c != table_empty_key && c != table_erased_key){
            result[index] = (u8)(c);
            index += 1;
        }
    }
    result[count] = 0;
    return(result);
}

// Builds a sub state machine that lexes the operators of /set/ and returns
// its entry state ("op root"). Every emitting transition leads back to
// /machine_root/.
//
// Passes, in order (transition order within a state is the order of
// creation):
//   1. One "op stage" state per distinct proper prefix of any lexeme, linked
//      from its parent prefix (or from the op root) by a consuming
//      single-character transition.
//   2. For every lexeme that is not itself a stage: a consuming transition on
//      its last character that emits the operator's token.
//   3. For every lexeme that is also a stage: a non-consuming fallback on
//      that stage that emits the operator's token.
//   4. A non-consuming fallback on the op root that emits
//      /fallback_token_name/.
//   5. A non-consuming fallback on every stage that emits
//      /fallback_token_name/ (added once per lexeme that passes through it).
internal State*
smo_op_set_lexer_root(Operator_Set *set, State *machine_root, String_Const_u8 fallback_token_name){
    Base_Allocator *allocator = helper_ctx.primary_ctx.allocator;
    // Maps a prefix string to the State* that represents it.
    Table_Data_u64 string_to_state = make_table_Data_u64(allocator, set->count*8);
    
    State *root = sm_add_state("op root");
    
    // Pass 1: stage states for proper prefixes.
    for (Operator *node = set->first; node != 0; node = node->next){
        String_Const_u8 lexeme = node->op;
        for (u64 i = 1; i < lexeme.size; i += 1){
            String_Const_u8 prefix = string_prefix(lexeme, i);
            Table_Lookup lookup = table_lookup(&string_to_state, make_data(prefix.str, prefix.size));
            if (!lookup.found_match){
                State *state = sm_add_state("op stage");
                State *parent = 0;
                if (prefix.size == 1){
                    parent = root;
                }
                else{
                    // Shorter prefixes are visited first, so the parent exists.
                    lookup = table_lookup(&string_to_state, make_data(prefix.str, prefix.size - 1));
                    Assert(lookup.found_match);
                    u64 val = 0;
                    table_read(&string_to_state, lookup, &val);
                    parent = (State*)IntAsPtr(val);
                }
                u8 space[1];
                space[0] = prefix.str[prefix.size - 1];
                String_Const_u8 string = {space, 1};
                smi_case(&helper_ctx.primary_ctx, parent, TransitionCaseKind_CharaterArray, string, 0, 0, state, Transition_Consume, 0);
                table_insert(&string_to_state, make_data(prefix.str, prefix.size), (u64)PtrAsInt(state));
            }
        }
    }
    
    // Pass 2: lexemes that are not a prefix of a longer lexeme.
    for (Operator *node = set->first; node != 0; node = node->next){
        String_Const_u8 lexeme = node->op;
        Table_Lookup lookup = table_lookup(&string_to_state, make_data(lexeme.str, lexeme.size));
        if (!lookup.found_match){
            State *parent = 0;
            if (lexeme.size == 1){
                parent = root;
            }
            else{
                lookup = table_lookup(&string_to_state, make_data(lexeme.str, lexeme.size - 1));
                Assert(lookup.found_match);
                u64 val = 0;
                table_read(&string_to_state, lookup, &val);
                parent = (State*)IntAsPtr(val);
            }
            u8 space[1];
            space[0] = lexeme.str[lexeme.size - 1];
            String_Const_u8 string = {space, 1};
            Emit_Rule *emit = smi_emit_rule(helper_ctx.arena);
            smi_emit_handler(helper_ctx.arena, emit, node->name, 0);
            smi_case(&helper_ctx.primary_ctx, parent, TransitionCaseKind_CharaterArray, string, 0, 0, machine_root, Transition_Consume, emit);
        }
    }
    
    // Pass 3: lexemes that are also a stage emit from that stage's fallback.
    for (Operator *node = set->first; node != 0; node = node->next){
        String_Const_u8 lexeme = node->op;
        Table_Lookup lookup = table_lookup(&string_to_state, make_data(lexeme.str, lexeme.size));
        if (lookup.found_match){
            u64 val = 0;
            table_read(&string_to_state, lookup, &val);
            State *state = (State*)IntAsPtr(val);
            String_Const_u8 string = {};
            Emit_Rule *emit = smi_emit_rule(helper_ctx.arena);
            smi_emit_handler(helper_ctx.arena, emit, node->name, 0);
            smi_case(&helper_ctx.primary_ctx, state, TransitionCaseKind_Fallback, string, 0, 0, machine_root, Transition_NoConsume, emit);
        }
    }
    
    // Pass 4: fallback for the op root itself.
    {
        String_Const_u8 zero_string = {};
        Emit_Rule *emit = smi_emit_rule(helper_ctx.arena);
        smi_emit_handler(helper_ctx.arena, emit, fallback_token_name, 0);
        smi_case(&helper_ctx.primary_ctx, root, TransitionCaseKind_Fallback, zero_string, 0, 0, machine_root, Transition_NoConsume, emit);
    }
    
    // Pass 5: fallback-token transition on every stage.
    for (Operator *node = set->first; node != 0; node = node->next){
        String_Const_u8 lexeme = node->op;
        for (u64 i = 1; i < lexeme.size; i += 1){
            String_Const_u8 prefix = string_prefix(lexeme, i);
            Table_Lookup lookup = table_lookup(&string_to_state, make_data(prefix.str, prefix.size));
            Assert(lookup.found_match);
            u64 val = 0;
            table_read(&string_to_state, lookup, &val);
            State *state = (State*)IntAsPtr(val);
            String_Const_u8 string = {};
            Emit_Rule *emit = smi_emit_rule(helper_ctx.arena);
            smi_emit_handler(helper_ctx.arena, emit, fallback_token_name, 0);
            smi_case(&helper_ctx.primary_ctx, state, TransitionCaseKind_Fallback, string, 0, 0, machine_root, Transition_NoConsume, emit);
        }
    }
    
    table_free(&string_to_state);
    
    return(root);
}

// C-string convenience overload of smo_op_set_lexer_root.
internal State*
smo_op_set_lexer_root(Operator_Set *set, State *machine_root, char *fallback_token_name){
    return(smo_op_set_lexer_root(set, machine_root, SCu8(fallback_token_name)));
}

////////////////////////////////

// NOTE(allen): HELPERS

// NOTE(allen): utf8 should be an u8 array with 129 slots.
// This will fill it out to represent all characters above the ASCII range.
internal void
smh_utf8_fill(u8 *utf8){
    // Slots 0..127 receive the byte values 128..255 in ascending order.
    u8 *dst = utf8;
    for (u16 value = 128; value < 256; value += 1){
        *dst = (u8)value;
        dst += 1;
    }
    // Slot 128 terminates the array.
    *dst = 0;
}

// Registers the default name for each punctuation character. The entries are
// registered in table order via sm_char_name.
internal void
smh_set_base_character_names(void){
    struct Char_Name_Pair{
        u8 character;
        const char *name;
    };
    
    Char_Name_Pair pairs[] = {
        {'{', "BraceOp"},
        {'}', "BraceCl"},
        {'(', "ParenOp"},
        {')', "ParenCl"},
        {'[', "BrackOp"},
        {']', "BrackCl"},
        {'-', "Minus"},
        {'+', "Plus"},
        {'.', "Dot"},
        {'!', "Bang"},
        {'*', "Star"},
        {',', "Comma"},
        {':', "Colon"},
        {';', "Semicolon"},
        {'@', "At"},
        {'#', "Pound"},
        {'$', "Dollar"},
        {'%', "Percent"},
        {'^', "Carrot"},
        {'&', "Amp"},
        {'=', "Eq"},
        {'<', "Less"},
        {'>', "Grtr"},
        {'~', "Tilde"},
        {'/', "Slash"},
        {'?', "Question"},
        {'|', "Pipe"},
    };
    
    i32 pair_count = (i32)(sizeof(pairs)/sizeof(pairs[0]));
    for (i32 pair_index = 0; pair_index < pair_count; pair_index += 1){
        Char_Name_Pair *pair = &pairs[pair_index];
        sm_char_name(pair->character, (char*)pair->name);
    }
}

// Adds the three token kinds named "EOF", "Whitespace" and "LexError", each
// under its matching base kind. The selected base kind is left at
// TokenBaseKind_LexError afterwards.
internal void
smh_typical_tokens(void){
    struct Kind_Name_Pair{
        Token_Base_Kind kind;
        const char *name;
    };
    
    Kind_Name_Pair typical[] = {
        {TokenBaseKind_EOF,        "EOF"},
        {TokenBaseKind_Whitespace, "Whitespace"},
        {TokenBaseKind_LexError,   "LexError"},
    };
    
    i32 typical_count = (i32)(sizeof(typical)/sizeof(typical[0]));
    for (i32 typical_index = 0; typical_index < typical_count; typical_index += 1){
        sm_select_base_kind(typical[typical_index].kind);
        sm_direct_token_kind((char*)typical[typical_index].name);
    }
}

////////////////////////////////
////////////////////////////////
////////////////////////////////
////////////////////////////////
////////////////////////////////

// NOTE(allen): OPTIMIZER

// Removes from /a/ every byte that occurs in /b/ and returns the shortened
// string. The removal happens in place in a.str by moving the last byte into
// the vacated slot, so the order of the remaining bytes is not preserved.
internal String_Const_u8
string_char_subtract(String_Const_u8 a, String_Const_u8 b){
    for (u64 remove_index = 0; remove_index < b.size; remove_index += 1){
        u8 target = b.str[remove_index];
        u64 scan = 0;
        while (scan < a.size){
            if (a.str[scan] != target){
                scan += 1;
                continue;
            }
            // Swap-remove: do not advance, the byte moved into this slot
            // still has to be tested.
            a.size -= 1;
            a.str[scan] = a.str[a.size];
        }
    }
    return(a);
}

// Returns a new action list on /arena/ holding a by-value copy of every
// action in /actions/, in the same order.
internal Action_List
opt_copy_action_list(Arena *arena, Action_List actions){
    Action_List copy = {};
    Action *src = actions.first;
    while (src != 0){
        Action *dup = push_array_write(arena, Action, 1, src);
        zdll_push_back(copy.first, copy.last, dup);
        copy.count += 1;
        src = src->next;
    }
    return(copy);
}

internal Flag*
opt_flag_fixup(Flag *old_flag, Table_u64_u64 old_to_new){
    // Translates a flag pointer of the source model into the matching flag of
    // the copied model. Null maps to null; any other pointer must be present
    // in /old_to_new/.
    Flag *result = 0;
    if (old_flag != 0){
        Table_Lookup lookup = table_lookup(&old_to_new, (u64)PtrAsInt(old_flag));
        Assert(lookup.found_match);
        u64 val = 0;
        table_read(&old_to_new, lookup, &val);
        result = (Flag*)IntAsPtr(val);
    }
    return(result);
}

// Copies a transition condition. For the ConditionSet kind the set is copied
// with smi_condition_set_copy and every pin's flag pointer is remapped
// through /old_to_new/; other kinds are returned by value unchanged.
internal Transition_Case
opt_copy_condition(Arena *arena, Transition_Case condition, Table_u64_u64 old_to_new){
    Transition_Case result = condition;
    if (result.kind == TransitionCaseKind_ConditionSet){
        result.condition_set = smi_condition_set_copy(arena, condition.condition_set);
        for (Condition_Node *node = result.condition_set.first; node != 0; node = node->next){
            Field_Set fields = node->fields;
            for (Field_Pin_List *pin_list = fields.first; pin_list != 0; pin_list = pin_list->next){
                for (Field_Pin *pin = pin_list->first; pin != 0; pin = pin->next){
                    pin->flag = opt_flag_fixup(pin->flag, old_to_new);
                }
            }
        }
    }
    return(result);
}

// Copies an emit rule together with its emit-check list and its handler list.
// Flag pointers in both lists are remapped through /old_to_new/. The handler
// count is carried over by the initial by-value copy of the rule.
internal Emit_Rule*
opt_copy_emit_rule(Arena *arena, Emit_Rule *emit, Table_u64_u64 old_to_new){
    Emit_Rule *new_emit = push_array_write(arena, Emit_Rule, 1, emit);
    
    // Rebuild the emit-check list from fresh nodes.
    block_zero_struct(&new_emit->emit_checks);
    for (Emit_Check *emit_check = emit->emit_checks.first; emit_check != 0; emit_check = emit_check->next){
        Emit_Check *new_emit_check = push_array_write(arena, Emit_Check, 1, emit_check);
        sll_queue_push(new_emit->emit_checks.first, new_emit->emit_checks.last, new_emit_check);
        new_emit->emit_checks.count += 1;
        new_emit_check->flag = opt_flag_fixup(new_emit_check->flag, old_to_new);
    }
    
    // Rebuild the handler list from fresh nodes.
    new_emit->first = 0;
    new_emit->last = 0;
    for (Emit_Handler *handler = emit->first; handler != 0; handler = handler->next){
        Emit_Handler *new_handler = push_array_write(arena, Emit_Handler, 1, handler);
        sll_queue_push(new_emit->first, new_emit->last, new_handler);
        new_handler->flag_check = opt_flag_fixup(handler->flag_check, old_to_new);
    }
    
    return(new_emit);
}

// Returns a copy of /model/ allocated on /arena/: flags, states, transitions,
// conditions, activation actions and emit rules. All flag and state pointers
// inside the copy refer to the copied objects. The two pointer tables are
// temporary and are freed before returning.
// NOTE(review): a state's on_entry_actions are not copied here.
internal Lexer_Model
opt_copy_model(Arena *arena, Lexer_Model model){
    Lexer_Model result = {};
    
    i32 pointer_count = model.states.count + model.flags.count;
    Table_u64_u64 old_to_new = make_table_u64_u64(arena->base_allocator, pointer_count*2);
    Table_u64_u64 new_to_old = make_table_u64_u64(arena->base_allocator, pointer_count*2);
    
    // Flags: only the reset rule and the emit flag bindings are carried over.
    for (Flag *flag = model.flags.first; flag != 0; flag = flag->next){
        Flag *new_flag = push_array_zero(arena, Flag, 1);
        sll_queue_push(result.flags.first, result.flags.last, new_flag);
        result.flags.count += 1;
        new_flag->reset_rule = flag->reset_rule;
        new_flag->emit_flags = flag->emit_flags;
        new_flag->emit_sub_flags = flag->emit_sub_flags;
        table_insert(&old_to_new, (u64)PtrAsInt(flag), (u64)PtrAsInt(new_flag));
        table_insert(&new_to_old, (u64)PtrAsInt(new_flag), (u64)PtrAsInt(flag));
    }
    
    // States: created first so transitions can resolve any destination.
    for (State *state = model.states.first; state != 0; state = state->next){
        State *new_state = push_array_zero(arena, State, 1);
        sll_queue_push(result.states.first, result.states.last, new_state);
        result.states.count += 1;
        table_insert(&old_to_new, (u64)PtrAsInt(state), (u64)PtrAsInt(new_state));
        table_insert(&new_to_old, (u64)PtrAsInt(new_state), (u64)PtrAsInt(state));
        new_state->pretty_name = push_string_copy(arena, state->pretty_name);
    }
    
    // Transitions of every state.
    for (State *new_state = result.states.first; new_state != 0; new_state = new_state->next){
        Table_Lookup lookup = table_lookup(&new_to_old, (u64)PtrAsInt(new_state));
        Assert(lookup.found_match);
        State *state = 0;
        u64 val = 0;
        table_read(&new_to_old, lookup, &val);
        state = (State*)(IntAsPtr(val));
        
        for (Transition *trans = state->transitions.first; trans != 0; trans = trans->next){
            Transition *new_trans = push_array_zero(arena, Transition, 1);
            zdll_push_back(new_state->transitions.first, new_state->transitions.last, new_trans);
            new_state->transitions.count += 1;
            new_trans->parent_state = new_state;
            new_trans->condition = opt_copy_condition(arena, trans->condition, old_to_new);
            new_trans->activation_actions = opt_copy_action_list(arena, trans->activation_actions);
            
            // The copied actions still point at the old flags and emit rules.
            for (Action *action = new_trans->activation_actions.first; action != 0; action = action->next){
                switch (action->kind){
                    case ActionKind_SetFlag:
                    {
                        action->set_flag.flag = opt_flag_fixup(action->set_flag.flag, old_to_new);
                    }break;
                    
                    case ActionKind_Emit:
                    {
                        action->emit_rule = opt_copy_emit_rule(arena, action->emit_rule, old_to_new);
                    }break;
                }
            }
            
            lookup = table_lookup(&old_to_new, (u64)PtrAsInt(trans->dst_state));
            Assert(lookup.found_match);
            State *new_dst_state = 0;
            table_read(&old_to_new, lookup, &val);
            new_dst_state = (State*)(IntAsPtr(val));
            new_trans->dst_state = new_dst_state;
        }
    }
    
    table_free(&old_to_new);
    table_free(&new_to_old);
    
    // The state lists are parallel, so the root is found by walking both.
    for (State *state = model.states.first, *new_state = result.states.first;
         state != 0 && new_state != 0;
         state = state->next, new_state = new_state->next){
        if (model.root == state){
            result.root = new_state;
            break;
        }
    }
    Assert(result.root);
    
    return(result);
}

// Removes shadowed input from the transitions of every state. Within a state,
// each transition's condition set has the condition sets of all earlier kept
// transitions subtracted from it; a transition whose set becomes empty is
// dropped. States whose first transition is a DelimMatch are left untouched.
// A state without any transition is handled as an empty list.
internal void
opt_simplify_transitions(Lexer_Primary_Context *ctx){
    for (State *state = ctx->model.states.first; state != 0; state = state->next){
        Transition_List *transitions = &state->transitions;
        
        // Guard against states with no transitions before reading the first
        // transition's kind.
        b32 is_delim_match = false;
        if (transitions->first != 0 &&
            transitions->first->condition.kind == TransitionCaseKind_DelimMatch){
            is_delim_match = true;
        }
        
        if (!is_delim_match){
            Transition *first = 0;
            Transition *last = 0;
            i32 count = 0;
            
            // /next/ is captured before relinking because zdll_push_back
            // rewrites trans->next.
            for (Transition *trans = transitions->first, *next = 0; trans != 0; trans = next){
                next = trans->next;
                Transition_Case condition = trans->condition;
                Assert(condition.kind == TransitionCaseKind_ConditionSet);
                Condition_Set condition_set = condition.condition_set;
                for (Transition *prev_trans = first; prev_trans != 0; prev_trans = prev_trans->next){
                    Transition_Case prev_condition = prev_trans->condition;
                    condition_set = smi_condition_set_subtract(&ctx->arena, condition_set, prev_condition.condition_set);
                    if (condition_set.count == 0){
                        break;
                    }
                }
                if (condition_set.count != 0){
                    trans->condition.condition_set = condition_set;
                    zdll_push_back(first, last, trans);
                    count += 1;
                }
            }
            
            transitions->first = first;
            transitions->last = last;
            transitions->count = count;
        }
    }
}

internal
// Clears the optimized_in mark on every state of the model.
void
opt_mark_all_states_excluded(Lexer_Primary_Context *ctx){
    State *node = ctx->model.states.first;
    while (node != 0){
        node->optimized_in = false;
        node = node->next;
    }
}

// Sets the optimized_in mark on every state of the model.
internal void
opt_mark_all_states_included(Lexer_Primary_Context *ctx){
    State *node = ctx->model.states.first;
    while (node != 0){
        node->optimized_in = true;
        node = node->next;
    }
}

// Rebuilds the model's state list out of only the marked states (in their
// existing order), clearing each kept state's mark along the way.
internal void
opt_discard_all_excluded_states(Lexer_Primary_Context *ctx){
    State *kept_first = 0;
    State *kept_last = 0;
    i32 kept_count = 0;
    
    State *node = ctx->model.states.first;
    while (node != 0){
        // Remember the successor before the node is queued onto the new list.
        State *following = node->next;
        if (node->optimized_in){
            node->optimized_in = false;
            sll_queue_push(kept_first, kept_last, node);
            kept_count += 1;
        }
        node = following;
    }
    
    ctx->model.states.first = kept_first;
    ctx->model.states.last = kept_last;
    ctx->model.states.count = kept_count;
}

// Marks the given state and, recursively, every state reachable through its
// transitions. Already-marked states stop the recursion.
internal void
opt_include_reachable_states(State *state){
    if (state->optimized_in){
        return;
    }
    state->optimized_in = true;
    for (Transition *edge = state->transitions.first;
         edge != 0;
         edge = edge->next){
        opt_include_reachable_states(edge->dst_state);
    }
}

// Recomputes back_references for every state: first zeroes each list, then
// appends one Transition_Ptr_Node (allocated on ctx->arena) to the destination
// state of every transition in the model.
internal void
opt_update_state_back_references(Lexer_Primary_Context *ctx){
    State *all_states = ctx->model.states.first;
    
    for (State *node = all_states; node != 0; node = node->next){
        block_zero_struct(&node->back_references);
    }
    
    for (State *src = all_states; src != 0; src = src->next){
        Transition *edge = src->transitions.first;
        while (edge != 0){
            State *target = edge->dst_state;
            Transition_Ptr_Node *ref = push_array_zero(&ctx->arena, Transition_Ptr_Node, 1);
            ref->ptr = edge;
            sll_queue_push(target->back_references.first, target->back_references.last, ref);
            target->back_references.count += 1;
            edge = edge->next;
        }
    }
}

// Appends the zero-flags actions to the root state's on-entry action list.
internal void
opt_set_auto_zero_flags_on_root(Lexer_Primary_Context *ctx){
    smi_append_zero_flags(&ctx->arena, &ctx->model.root->on_entry_actions);
}

internal void
opt_transfer_state_actions_to_transitions(Lexer_Primary_Context *ctx){
    opt_update_state_back_references(ctx);
    
    for (State *state =
ctx->model.states.first; state != 0; state = state->next){ Action_List actions = state->on_entry_actions; if (actions.count > 0){ for (Transition_Ptr_Node *node = state->back_references.first; node != 0; node = node->next){ Transition *trans = node->ptr; Action_List actions_copy = opt_copy_action_list(&ctx->arena, actions); if (trans->activation_actions.first == 0){ trans->activation_actions = actions_copy; } else{ trans->activation_actions.last->next = actions_copy.first; actions_copy.first->prev = trans->activation_actions.last; trans->activation_actions.last = actions_copy.last; trans->activation_actions.count += actions_copy.count; } } block_zero_struct(&state->on_entry_actions); } } } internal void opt_flags_set_numbers(Lexer_Model model){ i32 number = 0; for (Flag *flag = model.flags.first; flag != 0; flag = flag->next){ flag->number = number; number += 1; } } internal void opt_states_set_numbers(Lexer_Model model){ i32 number = 1; for (State *state = model.states.first; state != 0; state = state->next){ state->number = number; number += 1; } } internal void opt_transition_pull_actions_backward(Lexer_Primary_Context *ctx, Transition *a, Transition *b){ if (b->activation_actions.count > 0){ Action_List b_actions = opt_copy_action_list(&ctx->arena, b->activation_actions); if (a->activation_actions.first == 0){ a->activation_actions = b_actions; } else{ if (b_actions.first != 0){ a->activation_actions.last->next = b_actions.first; a->activation_actions.last = b_actions.last; a->activation_actions.count += b_actions.count; } } } a->dst_state = b->dst_state; } internal void opt_transition_push_actions_forward(Lexer_Primary_Context *ctx, Transition *a, Transition *b){ if (b->activation_actions.count > 0){ Action_List a_actions = opt_copy_action_list(&ctx->arena, a->activation_actions); if (b->activation_actions.first == 0){ b->activation_actions = a_actions; } else{ if (a_actions.first != 0){ a_actions.last->next = b->activation_actions.first; 
b->activation_actions.first = a_actions.first; b->activation_actions.count += a_actions.count; } } } } internal b32 opt_action_list_contains_consume(Action_List list){ b32 result = false; for (Action *act = list.first; act != 0; act = act->next){ if (act->kind == ActionKind_Consume){ result = true; break; } } return(result); } internal void opt_skip_past_thunk_states(Lexer_Primary_Context *ctx){ opt_mark_all_states_included(ctx); for (State *state = ctx->model.states.first; state != 0; state = state->next){ // TODO(allen): A more complete thunk state test would check if all transitions // have the same effect. If they do, then it is a thunk state. Only having // one transition is just a special case of this more general rule. if (state->transitions.count == 1){ Transition *trans = state->transitions.first; // TODO(allen): Consumes could be pulled forward into the transition actions // for these types of "thunk states" as well, but only if we add a new concept // for representing "action blocks" separately from actions contained in a // transition handler, so that a handler can have multiple blocks. Then we would // need to be able to identify thunk cycles, and add an entire extra concept to // the state machine generated code, that it can sometimes get into a "stateless" // thunk loop that can never be exited, but continues to consume one input at // a time doing each action block. 
// (Continues opt_skip_past_thunk_states, which begins above this span.)
            b32 contains_consume = opt_action_list_contains_consume(trans->activation_actions);
            if (!contains_consume){
                // A single-transition state whose transition never consumes is
                // treated as a thunk state and marked excluded.
                state->optimized_in = false;
            }
        }
    }
    
    // Re-target every transition of a still-included state past any excluded
    // (thunk) destination, folding the thunk's actions into the transition.
    for (State *state = ctx->model.states.first;
         state != 0;
         state = state->next){
        if (state->optimized_in){
            Transition_List *transitions = &state->transitions;
            for (Transition *trans = transitions->first;
                 trans != 0;
                 trans = trans->next){
                // NOTE(review): there is no step limit here; a cycle consisting only
                // of excluded states would presumably keep this loop running.
                for (;!trans->dst_state->optimized_in;){
                    Transition *dst_trans = trans->dst_state->transitions.first;
                    opt_transition_pull_actions_backward(ctx, trans, dst_trans);
                }
            }
        }
    }
}

// Returns true when two emit rules are structurally equal: the same number of
// emit checks with pairwise-equal flag pointer, check name and value, and the
// same number of handlers with pairwise-equal kind, flag_check pointer and
// payload (token name for Direct handlers, keyword-set pointer otherwise).
internal b32
opt_emit_rule_match(Emit_Rule *rule_a, Emit_Rule *rule_b){
    b32 result = true;
    
    if (rule_a->emit_checks.count != rule_b->emit_checks.count){
        result = false;
        goto end;
    }
    
    for (Emit_Check *check_a = rule_a->emit_checks.first, *check_b = rule_b->emit_checks.first;
         check_a != 0 && check_b != 0;
         check_a = check_a->next, check_b = check_b->next){
        if (check_a->flag != check_b->flag ||
            !string_match(check_a->emit_check, check_b->emit_check) ||
            check_a->value != check_b->value){
            result = false;
            goto end;
        }
    }
    
    if (rule_a->count != rule_b->count){
        result = false;
        goto end;
    }
    
    for (Emit_Handler *handler_a = rule_a->first, *handler_b = rule_b->first;
         handler_a != 0 && handler_b != 0;
         handler_a = handler_a->next, handler_b = handler_b->next){
        if (handler_a->kind != handler_b->kind ||
            handler_a->flag_check != handler_b->flag_check){
            result = false;
            goto end;
        }
        // The kinds are equal here, so switching on handler_a's kind covers both.
        switch (handler_a->kind){
            case EmitHandlerKind_Direct:
            {
                if (!string_match(handler_a->token_name, handler_b->token_name)){
                    result = false;
                    goto end;
                }
            }break;
            case EmitHandlerKind_Keywords:
            case EmitHandlerKind_KeywordsDelim:
            {
                // Keyword sets are compared by pointer identity, not by content.
                if (handler_a->keywords != handler_b->keywords){
                    result = false;
                    goto end;
                }
            }break;
        }
    }
    
    end:;
    return(result);
}

// Returns true when two action lists have the same length and pairwise-equal
// actions (see the per-kind comparisons in the loop body).
internal b32
opt_action_lists_match(Action_List a, Action_List b){
    b32 result = false;
    if (a.count == b.count){
        result = true;
        for (Action *node_a = a.first, *node_b = b.first;
             node_a != 0 && node_b != 0;
             node_a = node_a->next, node_b =
// (Continues the lockstep loop of opt_action_lists_match, which begins above this span.)
             node_b->next){
            if (node_a->kind != node_b->kind){
                result = false;
                goto double_break;
            }
            
            // Only SetFlag and Emit carry a payload that is compared; the other
            // kinds match on kind alone.
            switch (node_a->kind){
                case ActionKind_SetFlag:
                {
                    if (node_a->set_flag.flag != node_b->set_flag.flag ||
                        node_a->set_flag.value != node_b->set_flag.value){
                        result = false;
                        goto double_break;
                    }
                }break;
                
                case ActionKind_Emit:
                {
                    if (!opt_emit_rule_match(node_a->emit_rule, node_b->emit_rule)){
                        result = false;
                        goto double_break;
                    }
                }break;
            }
        }
    }
    
    double_break:;
    return(result);
}

// Within each state, folds a transition into a LATER transition that has the
// same destination state and a matching action list: the later transition's
// condition set becomes the union of the two and the earlier one is dropped.
// Transitions with no such later twin are kept in their original order.
internal void
opt_merge_redundant_transitions_in_each_state(Lexer_Primary_Context *ctx){
    for (State *state = ctx->model.states.first;
         state != 0;
         state = state->next){
        Transition_List *transitions = &state->transitions;
        
        // The surviving transitions are collected into a fresh list.
        Transition *first = 0;
        Transition *last = 0;
        i32 count = 0;
        
        for (Transition *trans = transitions->first, *next = 0;
             trans != 0;
             trans = next){
            // Saved up front because trans may be re-linked into the new list below.
            next = trans->next;
            
            Transition *merge_trans = 0;
            for (Transition *comp_trans = trans->next;
                 comp_trans != 0;
                 comp_trans = comp_trans->next){
                if (opt_action_lists_match(trans->activation_actions, comp_trans->activation_actions) &&
                    trans->dst_state == comp_trans->dst_state){
                    merge_trans = comp_trans;
                    break;
                }
            }
            
            if (merge_trans != 0){
                Assert(trans->condition.kind == TransitionCaseKind_ConditionSet);
                Assert(merge_trans->condition.kind == TransitionCaseKind_ConditionSet);
                merge_trans->condition.condition_set =
                    smi_condition_set_union(&ctx->arena,
                                            trans->condition.condition_set,
                                            merge_trans->condition.condition_set);
            }
            else{
                zdll_push_back(first, last, trans);
                count += 1;
            }
        }
        
        transitions->first = first;
        transitions->last = last;
        transitions->count = count;
    }
}

// Returns true when subtracting `super` from `sub` leaves an empty condition
// set. The subtraction result is allocated on `scratch` between begin_temp
// and end_temp.
internal b32
opt_condition_set_is_subset(Arena *scratch, Condition_Set sub, Condition_Set super){
    Temp_Memory temp = begin_temp(scratch);
    Condition_Set left_over = smi_condition_set_subtract(scratch, sub, super);
    b32 result = (left_over.count == 0);
    end_temp(temp);
    return(result);
}

internal void
opt_remove_peeks_without_creating_transition_splits(Lexer_Primary_Context *ctx){
    for (State *state =
ctx->model.states.first; state != 0; state = state->next){ Transition_List *transitions = &state->transitions; if (transitions->first->condition.kind != TransitionCaseKind_ConditionSet){ continue; } for (Transition *trans = transitions->first; trans != 0; trans = trans->next){ i32 step_counter = 0; for (;!opt_action_list_contains_consume(trans->activation_actions); step_counter += 1){ // NOTE(allen): Hitting this (most likely) indicates a peek cycle // that wasn't caught by type checking. Assert(step_counter < ctx->model.states.count); b32 found_action_extension = false; State *dst_state = trans->dst_state; Transition_List *dst_transitions = &dst_state->transitions; if (dst_transitions->first->condition.kind != TransitionCaseKind_ConditionSet){ break; } for (Transition *dst_trans = dst_transitions->first; dst_trans != 0; dst_trans = dst_trans->next){ if (opt_condition_set_is_subset(&ctx->arena, trans->condition.condition_set, dst_trans->condition.condition_set)){ opt_transition_pull_actions_backward(ctx, trans, dst_trans); found_action_extension = true; break; } } if (!found_action_extension){ break; } } } } } internal void opt_remove_peeks_into_single_entry_point_states(Lexer_Primary_Context *ctx){ opt_update_state_back_references(ctx); opt_mark_all_states_included(ctx); for (State *state = ctx->model.states.first; state != 0; state = state->next){ if (state->transitions.first->condition.kind != TransitionCaseKind_ConditionSet){ continue; } if (state->back_references.count == 1){ Transition *src_trans = state->back_references.first->ptr; if (src_trans->condition.kind != TransitionCaseKind_ConditionSet){ continue; } if (!opt_action_list_contains_consume(src_trans->activation_actions)){ State *src_state = src_trans->parent_state; state->optimized_in = false; Transition *first = 0; Transition *last = 0; i32 count = 0; for (Transition *trans = state->transitions.first, *next = 0; trans != 0; trans = next){ next = trans->next; trans->condition.condition_set = 
smi_condition_set_intersect(&ctx->arena, trans->condition.condition_set, src_trans->condition.condition_set); if (trans->condition.condition_set.count > 0){ trans->parent_state = src_state; opt_transition_push_actions_forward(ctx, src_trans, trans); zdll_push_back(first, last, trans); count += 1; } } Assert(count != 0); if (src_trans->prev != 0){ src_trans->prev->next = first; } if (src_trans->next != 0){ src_trans->next->prev = last; } first->prev = src_trans->prev; last->next = src_trans->next; src_state->transitions.count += count; } } } } internal b32 opt_condition_is_eof_only(Transition_Case condition){ b32 result = false; if (condition.kind == TransitionCaseKind_ConditionSet){ result = true; for (Condition_Node *node = condition.condition_set.first; node != 0; node = node->next){ Input_Set inputs = node->inputs; if (inputs.count > 1 || inputs.inputs[0] != smi_eof){ result = false; break; } } } return(result); } internal Keyword_Layout opt_key_layout(Arena *arena, Keyword_Set keywords, i32 slot_count, u64 seed){ Keyword_Layout layout = {}; slot_count = clamp_bot(keywords.count + 1, slot_count); layout.seed = seed; layout.hashes = push_array_zero(arena, u64, slot_count); layout.contributed_error = push_array_zero(arena, u64, slot_count); layout.slots = push_array_zero(arena, Keyword*, slot_count); layout.slot_count = slot_count; for (Keyword *keyword = keywords.first; keyword != 0; keyword = keyword->next){ u64 hash = lexeme_hash(seed, keyword->lexeme.str, keyword->lexeme.size); i32 first_index = (hash%slot_count); i32 index = first_index; Keyword *keyword_insert = keyword; u64 contributed_error = 0; for (;;){ if (layout.slots[index] == 0){ layout.hashes[index] = hash; layout.contributed_error[index] = contributed_error; layout.slots[index] = keyword_insert; break; } else{ if (contributed_error > layout.contributed_error[index]){ Swap(u64, hash, layout.hashes[index]); Swap(Keyword*, keyword_insert, layout.slots[index]); Swap(u64, contributed_error, 
// (Continues the probe loop of the 4-argument opt_key_layout, which begins above
// this span: this is the last argument of the third Swap in the displacement step.)
                         layout.contributed_error[index]);
                }
            }
            // Step to the next slot, wrapping around the end of the table.
            index += 1;
            contributed_error += 1;
            if (index >= slot_count){
                index = 0;
            }
            // Coming all the way back around means no empty slot was found.
            if (index == first_index){
                InvalidPath;
            }
        }
    }
    
    // Score the layout: every occupied slot adds the length of the run of
    // occupied slots ending at it; the longest run is tracked separately.
    i32 max_run_length = 0;
    i32 run_length = 0;
    for (i32 i = 0; i < slot_count; i += 1){
        if (layout.slots[i] == 0){
            run_length = 0;
        }
        else{
            run_length += 1;
            layout.error_score += run_length;
            max_run_length = Max(max_run_length, run_length);
        }
    }
    // Second pass over the leading run of occupied slots, seeded with the run
    // that was still open at the end of the table (wrap-around).
    // NOTE(review): error_score is advanced by run_length here while only
    // total_run_length grows -- confirm that is the intended weighting.
    i32 total_run_length = run_length;
    for (i32 i = 0; i < slot_count; i += 1){
        if (layout.slots[i] == 0){
            break;
        }
        else{
            layout.error_score += run_length;
            total_run_length += 1;
            max_run_length = Max(max_run_length, total_run_length);
        }
    }
    layout.max_single_error_score = max_run_length;
    layout.iterations_per_lookup = (f32)layout.error_score/(f32)layout.slot_count;
    
    return(layout);
}

// Combines two pcg32_random() results into one 64-bit value: the second call
// supplies the high 32 bits, the first call the low bits.
internal u64
random_u64_dirty(void){
    u64 a = pcg32_random();
    u64 b = pcg32_random();
    return((b << 32) | a);
}

#if 0
internal Keyword_Layout
opt_key_layout(Arena *arena, Keyword_Set keywords){
    i32 slot_count = keywords.count*2;
    u64 seed = random_u64_dirty();
    return(opt_key_layout(arena, keywords, slot_count, seed));
}
#endif

// Searches for a good keyword table layout by repeatedly building layouts with
// random seeds, growing the slot count whenever the error accumulated at the
// current size passes a threshold.
//  Phase 1 keeps the layout with the lowest iterations_per_lookup and moves on
//          once one is at or below acceptable_error_threshold.
//  Phase 2 (skipped if the best layout's max_single_error_score is already
//          acceptable) looks for a layout with a smaller max_single_error_score
//          that does not raise iterations_per_lookup.
// Layouts are built on `arena`; a rejected attempt is rolled back with
// end_temp, an accepted one is left allocated. Returns the best layout found.
// NOTE(review): neither outer loop has an exit other than the gotos, so
// termination relies on a good enough layout eventually being generated.
internal Keyword_Layout
opt_key_layout(Arena *arena, Keyword_Set keywords){
    i32 init_slot_count = keywords.count + 1;
    if (keywords.count == 1){
        init_slot_count = 1;
    }
    
#if 0
    // heavy optimization effort
    f32 acceptable_error_threshold = 2.f;
    f32 accumulated_error_threshold = 8000.f;
    i32 acceptable_max_single_error = 4;
    i32 accumulated_max_single_error_threshold = Thousand(800);
#else
    // light optimization effort
    f32 acceptable_error_threshold = 1.1f;
    f32 accumulated_error_threshold = 200.f;
    i32 acceptable_max_single_error = 5;
    i32 accumulated_max_single_error_threshold = Thousand(40);
#endif
    
    Keyword_Layout best_layout = {};
    best_layout.iterations_per_lookup = max_f32;
    i32 slot_count = init_slot_count;
    for (;; slot_count += 1){
        f32 accumulated_error = 0;
        for (;;){
            u64 seed = random_u64_dirty();
            Temp_Memory restore_point = begin_temp(arena);
            Keyword_Layout layout = opt_key_layout(arena, keywords, slot_count, seed);
            accumulated_error += layout.iterations_per_lookup;
            
            if (layout.iterations_per_lookup < best_layout.iterations_per_lookup){
                // New best: its arrays stay allocated on the arena.
                best_layout = layout;
                if (layout.iterations_per_lookup <= acceptable_error_threshold){
                    goto optimize_max_single_error;
                }
            }
            else{
                end_temp(restore_point);
            }
            // Enough error spent at this table size; try a larger one.
            if (accumulated_error >= accumulated_error_threshold){
                break;
            }
        }
    }
    
    optimize_max_single_error:
    if (best_layout.max_single_error_score <= acceptable_max_single_error){
        goto finished;
    }
    for (;; slot_count += 1){
        u64 accumulated_error = 0;
        for (;;){
            u64 seed = random_u64_dirty();
            Temp_Memory restore_point = begin_temp(arena);
            Keyword_Layout layout = opt_key_layout(arena, keywords, slot_count, seed);
            
            // Cost of this attempt: ceil(max_single_error / acceptable), squared,
            // then scaled by the acceptable value.
            u64 adjusted_error_score = (layout.max_single_error_score + acceptable_max_single_error - 1)/acceptable_max_single_error;
            adjusted_error_score *= adjusted_error_score;
            adjusted_error_score *= acceptable_max_single_error;
            
            accumulated_error += adjusted_error_score;
            
            if (layout.max_single_error_score < best_layout.max_single_error_score &&
                layout.iterations_per_lookup <= best_layout.iterations_per_lookup){
                best_layout = layout;
                if (layout.max_single_error_score <= acceptable_max_single_error){
                    goto finished;
                }
            }
            else{
                end_temp(restore_point);
            }
            if (accumulated_error >= accumulated_max_single_error_threshold){
                break;
            }
        }
    }
    
    finished:;
    return(best_layout);
}

////////////////////////////////

// Returns true when x is one of the first set.count entries of set.inputs.
internal b32
opt__input_set_contains(Input_Set set, u16 x){
    b32 result = false;
    for (i32 i = 0; i < set.count; i += 1){
        if (set.inputs[i] == x){
            result = true;
            break;
        }
    }
    return(result);
}

// Returns true when two partial transitions have matching field sets (per
// smi_field_set_match), matching action lists and the same destination state.
internal b32
opt__partial_transition_match(Arena *scratch, Partial_Transition *a, Partial_Transition *b){
    b32 result = false;
    if (smi_field_set_match(scratch, a->fields, b->fields)){
        if (opt_action_lists_match(a->actions, b->actions)){
            if (a->dst_state == b->dst_state){
                result = true;
            }
        }
    }
    return(result);
}

internal void
opt__push_partial_transition(Arena *arena,
// (Continues the parameter list of opt__push_partial_transition, whose header
// begins above this span. The function appends a partial transition built from
// `fields` and the transition's actions and destination to `list`, unless an
// equivalent entry is already present.)
                             Partial_Transition_List *list, Field_Set fields, Transition *trans){
    Partial_Transition partial = {};
    partial.fields = fields;
    partial.actions = trans->activation_actions;
    partial.dst_state = trans->dst_state;
    
    b32 is_duplicate = false;
    for (Partial_Transition *node = list->first;
         node != 0;
         node = node->next){
        if (opt__partial_transition_match(arena, node, &partial)){
            is_duplicate = true;
            break;
        }
    }
    
    if (!is_duplicate){
        Partial_Transition *result = push_array_write(arena, Partial_Transition, 1, &partial);
        sll_queue_push(list->first, list->last, result);
        list->count += 1;
    }
}

// Returns true when the two lists have equal counts and every entry of `a` has
// a matching entry somewhere in `b` (order is not significant).
internal b32
opt__partial_transition_list_match(Arena *scratch, Partial_Transition_List *a, Partial_Transition_List *b){
    b32 result = false;
    if (a->count == b->count){
        result = true;
        for (Partial_Transition *node_a = a->first;
             node_a != 0;
             node_a = node_a->next){
            b32 has_match = false;
            for (Partial_Transition *node_b = b->first;
                 node_b != 0;
                 node_b = node_b->next){
                if (opt__partial_transition_match(scratch, node_a, node_b)){
                    has_match = true;
                    break;
                }
            }
            // The outer loop keeps scanning after a miss; the result stays false.
            if (!has_match){
                result = false;
            }
        }
    }
    return(result);
}

// Records input byte x in the group once: sets its inputs_used entry and
// appends it to the group's inputs array. A byte already recorded is ignored.
internal void
opt__insert_input_into_group(Grouped_Input_Handler *group, u8 x){
    if (!group->inputs_used[x]){
        group->inputs_used[x] = true;
        group->inputs[group->input_count] = x;
        group->input_count += 1;
    }
}

// Partitions the byte values 0..255 into groups that share the same list of
// partial transitions, given the first transition of a state's (ConditionSet)
// transition list. Also records which group holds the most inputs.
internal Grouped_Input_Handler_List
opt_grouped_input_handlers(Arena *arena, Transition *first_trans){
    Grouped_Input_Handler_List result = {};
    
    Assert(first_trans->condition.kind == TransitionCaseKind_ConditionSet);
    
    Grouped_Input_Handler *biggest_group = 0;
    i32 size_of_biggest = 0;
    
    // u16 counter so that `i <= 255` can terminate.
    for (u16 i = 0; i <= 255; i += 1){
        Temp_Memory restore_point = begin_temp(arena);
        Partial_Transition_List list = {};
        for (Transition *trans = first_trans;
             trans != 0;
             trans = trans->next){
            Assert(trans->condition.kind == TransitionCaseKind_ConditionSet);
            Condition_Set condition_set = trans->condition.condition_set;
            for (Condition_Node *node = condition_set.first;
                 node != 0;
                 node = node->next){
                if
(opt__input_set_contains(node->inputs, i)){ opt__push_partial_transition(arena, &list, node->fields, trans); } } } Grouped_Input_Handler *matching_group = 0; for (Grouped_Input_Handler *group = result.first; group != 0; group = group->next){ if (opt__partial_transition_list_match(arena, &group->partial_transitions, &list)){ matching_group = group; break; } } if (matching_group != 0){ end_temp(restore_point); } else{ matching_group = push_array_zero(arena, Grouped_Input_Handler, 1); sll_queue_push(result.first, result.last, matching_group); result.count += 1; matching_group->partial_transitions = list; } opt__insert_input_into_group(matching_group, (u8)i); if (matching_group->input_count > size_of_biggest){ size_of_biggest = matching_group->input_count; biggest_group = matching_group; } } result.group_with_biggest_input_set = biggest_group; return(result); } //////////////////////////////// internal void debug_print_states(Lexer_Primary_Context *ctx){ printf("Number of States: %d\n", ctx->model.states.count); i32 transition_count = 0; for (State *state = ctx->model.states.first; state != 0; state = state->next){ Transition_List *transitions = &state->transitions; transition_count += transitions->count; } printf("Number of Transitions: %d\n", transition_count); for (State *state = ctx->model.states.first; state != 0; state = state->next){ printf("State: %.*s\n", string_expand(state->pretty_name)); } } internal void debug_print_transitions(Arena *scratch, Lexer_Model model){ Temp_Memory temp = begin_temp(scratch); i32 field_bit_width = model.flags.count; char *field_memory = push_array(scratch, char, field_bit_width); printf("Number of States: %d\n", model.states.count); i32 transition_count = 0; for (State *state = model.states.first; state != 0; state = state->next){ Transition_List *transitions = &state->transitions; transition_count += transitions->count; } printf("Number of Transitions: %d\n", transition_count); for (State *state = model.states.first; state != 0; 
state = state->next){ printf("State: %.*s\n", string_expand(state->pretty_name)); Transition_List *transitions = &state->transitions; for (Transition *trans = transitions->first; trans != 0; trans = trans->next){ #define transition_on "Transition on " if (trans->condition.kind == TransitionCaseKind_DelimMatch){ printf("\t" transition_on "\n"); } else{ printf("\t" transition_on ""); for (Condition_Node *node = trans->condition.condition_set.first; node != 0; node = node->next){ printf("([%3d]", node->inputs.count); if (node->inputs.count < 10){ b32 all_printable = true; char ascii[30]; i32 j = 0; for (i32 i = 0; i < node->inputs.count; i += 1){ b32 is_ascii = character_is_basic_ascii(node->inputs.inputs[i]); b32 is_eof = (node->inputs.inputs[i] == smi_eof); if (!(is_ascii || is_eof)){ all_printable = false; break; } if (is_ascii){ ascii[j] = (char)(node->inputs.inputs[i]); j += 1; } else if (is_eof){ ascii[j] = 'E'; j += 1; ascii[j] = 'O'; j += 1; ascii[j] = 'F'; j += 1; } } if (all_printable){ printf(" = {%.*s}", j, ascii); } } printf(" x "); printf("("); for (Field_Pin_List *pins = node->fields.first; pins != 0; pins = pins->next){ block_fill_u8(field_memory, field_bit_width, '*'); for (Field_Pin *pin = pins->first; pin != 0; pin = pin->next){ i32 flag_number = pin->flag->number; field_memory[flag_number] = pin->value?'1':'0'; } printf("%.*s", field_bit_width, field_memory); if (pins->next != 0){ printf(", "); } } printf("))"); if (node->next != 0){ printf(" union\n\t%.*s", (i32)(sizeof(transition_on) - 1), " "); } } printf(":\n"); } for (Action *act = trans->activation_actions.first; act != 0; act = act->next){ switch (act->kind){ case ActionKind_SetFlag: { printf("\t\tSet Flag\n"); }break; case ActionKind_ZeroFlags: { printf("\t\tZero Flags\n"); }break; case ActionKind_DelimMarkFirst: { printf("\t\tDelim Mark First\n"); }break; case ActionKind_DelimMarkOnePastLast: { printf("\t\tDelim Mark One Past Last\n"); }break; case ActionKind_Consume: { 
printf("\t\tConsume\n"); }break; case ActionKind_Emit: { printf("\t\tEmit\n"); }break; } } printf("\t\tGo to %.*s;\n", string_expand(trans->dst_state->pretty_name)); } } end_temp(temp); } internal void debug_print_transitions(Lexer_Primary_Context *ctx){ debug_print_transitions(&ctx->arena, ctx->model); } internal void debug_print_keyword_table_metrics(Keyword_Layout key_layout, i32 keyword_count){ printf("used count: %d\n", keyword_count); printf("slot count: %d\n", key_layout.slot_count); printf("table load factor: %f\n", (f32)keyword_count/(f32)key_layout.slot_count); printf("error score: %llu\n", key_layout.error_score); printf("error per lookup: %f\n", key_layout.iterations_per_lookup); printf("max single error score: %llu\n", key_layout.max_single_error_score); for (i32 i = 0; i < key_layout.slot_count; i += 1){ Keyword *keyword = key_layout.slots[i]; if (keyword == 0){ printf("[%d] -> \n", i); } else{ printf("[%d] -> \"%.*s\"\n", i, string_expand(keyword->lexeme)); } } } //////////////////////////////// internal char* gen_token_full_name(Arena *arena, String_Const_u8 base_name){ String_Const_u8 string = push_u8_stringf(arena, "Token" LANG_NAME_CAMEL_STR "Kind_%.*s", string_expand(base_name)); return((char*)(string.str)); } internal void gen_tokens(Arena *scratch, Token_Kind_Set tokens, FILE *out){ Temp_Memory temp = begin_temp(scratch); i32 counter = 0; fprintf(out, "typedef u16 Token_" LANG_NAME_CAMEL_STR "_Kind;\n"); fprintf(out, "enum{\n"); for (Token_Kind_Node *node = tokens.first; node != 0; node = node->next){ char *full_name = gen_token_full_name(scratch, node->name); fprintf(out, "%s = %d,\n", full_name, counter); counter += 1; } char *full_name = gen_token_full_name(scratch, SCu8("COUNT")); fprintf(out, "%s = %d,\n", full_name, counter); fprintf(out, "};\n"); fprintf(out, "char *token_" LANG_NAME_LOWER_STR "_kind_names[] = {\n"); for (Token_Kind_Node *node = tokens.first; node != 0; node = node->next){ fprintf(out, "\"%.*s\",\n", 
// (Continues gen_tokens, which begins above this span: these are the remaining
// arguments of the fprintf that writes one token kind name.)
                string_expand(node->name));
    }
    fprintf(out, "};\n");
    
    end_temp(temp);
}

// Computes a hash table layout for `keywords` and writes it to `out` as C
// source: a hash array, one byte array per occupied slot holding that
// keyword's lexeme, a String_Const_u8 key array, a Lexeme_Table_Value array,
// and finally the slot count and the seed. Emitted identifiers are prefixed
// with LANG_NAME_LOWER_STR and the keyword set's pretty name.
//  scratch: arena for the layout and temporary strings.
//  tokens:  token kind set used to resolve each keyword's token by name.
internal void
gen_keyword_table(Arena *scratch, Token_Kind_Set tokens, Keyword_Set keywords, FILE *out){
    Temp_Memory temp = begin_temp(scratch);
    Keyword_Layout key_layout = opt_key_layout(scratch, keywords);
    
    // Hash array: empty slots are written as 0, occupied slots as (hash | 1),
    // which is never 0.
    fprintf(out, "u64 " LANG_NAME_LOWER_STR "_%.*s_hash_array[%d] = {\n",
            string_expand(keywords.pretty_name), key_layout.slot_count);
    for (i32 i = 0; i < key_layout.slot_count; i += 1){
        if (key_layout.slots[i] == 0){
            fprintf(out, "0x%016x,", 0);
        }
        else{
            fprintf(out, "0x%016llx,", (u64)((u64)(key_layout.hashes[i]) | 1));
        }
        // Four entries per output line, plus a line break after the last entry.
        if (i % 4 == 3 || i + 1 == key_layout.slot_count){
            fprintf(out, "\n");
        }
    }
    fprintf(out, "};\n");
    
    // One byte array per occupied slot holding the keyword's lexeme.
    for (i32 i = 0; i < key_layout.slot_count; i += 1){
        if (key_layout.slots[i] != 0){
            fprintf(out, "u8 " LANG_NAME_LOWER_STR "_%.*s_key_array_%d[] = {",
                    string_expand(keywords.pretty_name), i);
            String_Const_u8 lexeme = key_layout.slots[i]->lexeme;
            for (u64 j = 0; j < lexeme.size; j += 1){
                fprintf(out, "0x%02x,", lexeme.str[j]);
            }
            fprintf(out, "};\n");
        }
    }
    
    // Key array: {pointer, size} per slot, {0, 0} for empty slots.
    fprintf(out, "String_Const_u8 " LANG_NAME_LOWER_STR "_%.*s_key_array[%d] = {\n",
            string_expand(keywords.pretty_name), key_layout.slot_count);
    for (i32 i = 0; i < key_layout.slot_count; i += 1){
        if (key_layout.slots[i] == 0){
            fprintf(out, "{0, 0},\n");
        }
        else{
            fprintf(out, "{" LANG_NAME_LOWER_STR "_%.*s_key_array_%d, %llu},\n",
                    string_expand(keywords.pretty_name), i, key_layout.slots[i]->lexeme.size);
        }
    }
    fprintf(out, "};\n");
    
    // Value array: {base kind, full token kind name} per occupied slot.
    fprintf(out, "Lexeme_Table_Value " LANG_NAME_LOWER_STR "_%.*s_value_array[%d] = {\n",
            string_expand(keywords.pretty_name), key_layout.slot_count);
    for (i32 i = 0; i < key_layout.slot_count; i += 1){
        if (key_layout.slots[i] == 0){
            fprintf(out, "{0, 0},\n");
        }
        else{
            Temp_Memory temp2 = begin_temp(scratch);
            Keyword *keyword = key_layout.slots[i];
            String_Const_u8 name = keyword->name;
            
            char *full_token_name = gen_token_full_name(scratch, name);
            Table_Lookup lookup = table_lookup(&tokens.name_to_ptr,
make_data(name.str, name.size)); Assert(lookup.found_match); u64 val = 0; table_read(&tokens.name_to_ptr, lookup, &val); Token_Kind_Node *token_node = (Token_Kind_Node*)IntAsPtr(val); fprintf(out, "{%u, %s},\n", token_node->base_kind, full_token_name); end_temp(temp2); } } fprintf(out, "};\n"); fprintf(out, "i32 " LANG_NAME_LOWER_STR "_%.*s_slot_count = %d;\n", string_expand(keywords.pretty_name), key_layout.slot_count); fprintf(out, "u64 " LANG_NAME_LOWER_STR "_%.*s_seed = 0x%016llx;\n", string_expand(keywords.pretty_name), key_layout.seed); end_temp(temp); } internal void gen_flag_check__cont_flow(Flag *flag, b32 value, FILE *out){ if (value == 0){ fprintf(out, "!"); } fprintf(out, "HasFlag(state.%.*s%d, 0x%x)", string_expand(flag->base_name), flag->index, flag->value); } internal void gen_SLOW_field_set_check__cont_flow(Field_Set fields, FILE *out){ for (Field_Pin_List *pin_list = fields.first; pin_list != 0; pin_list = pin_list->next){ fprintf(out, "("); if (pin_list->count > 0){ for (Field_Pin *pin = pin_list->first; pin != 0; pin = pin->next){ gen_flag_check__cont_flow(pin->flag, pin->value, out); if (pin->next != 0){ fprintf(out, " && "); } } } else{ fprintf(out, "true"); } fprintf(out, ")"); if (pin_list->next != 0){ fprintf(out, " || "); } } } internal void gen_goto_state__cont_flow(State *state, Action_Context context, FILE *out){ switch (context){ case ActionContext_Normal: { fprintf(out, "goto state_label_%d; // %.*s\n", state->number, string_expand(state->pretty_name)); }break; case ActionContext_EndOfFile: { fprintf(out, "result = true;\n"); fprintf(out, "goto end;\n"); }break; } } internal void gen_goto_dst_state__cont_flow(Transition *trans, Action_Context context, FILE *out){ gen_goto_state__cont_flow(trans->dst_state, context, out); } internal void gen_action__set_flag(Flag *flag, b32 value, FILE *out){ if (flag != 0){ if (value == 0){ fprintf(out, "state.%.*s%d &= ~(0x%x);\n", string_expand(flag->base_name), flag->index, flag->value); } else{ 
// (Continues the else-branch of gen_action__set_flag, which begins above this
// span: a non-zero value is emitted as an OR of the flag's bit.)
            fprintf(out, "state.%.*s%d |= 0x%x;\n",
                    string_expand(flag->base_name), flag->index, flag->value);
        }
    }
}

// Writes the code that fills in the flags of the token being emitted.
//  flags:      every flag of the model; each one with non-zero emit_sub_flags
//              produces a guarded "token.sub_flags |= ..." statement.
//  bucket_set: decides which bound flag variables feed token.flags. With both
//              the auto-zero and the keep-state bound buckets in use the two
//              variables are OR-ed together; with only one in use that one is
//              assigned; with neither in use no assignment is written.
//  out:        destination stream for the generated source.
internal void
gen_emit__fill_token_flags(Flag_Set flags, Flag_Bucket_Set bucket_set, FILE *out){
    if (bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_AutoZero].count > 0){
        if (bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_KeepState].count > 0){
            // The variable name must match the one used in the single-bucket
            // branch below ("flags_ZB0"); the earlier "flag_ZB0" spelling named
            // a member that the sibling branches never reference.
            fprintf(out, "token.flags = state.flags_ZB0 | state.flags_KB0;\n");
        }
        else{
            fprintf(out, "token.flags = state.flags_ZB0;\n");
        }
    }
    else{
        if (bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_KeepState].count > 0){
            fprintf(out, "token.flags = state.flags_KB0;\n");
        }
    }
    for (Flag *flag = flags.first;
         flag != 0;
         flag = flag->next){
        if (flag->emit_sub_flags != 0){
            fprintf(out, "if (");
            gen_flag_check__cont_flow(flag, true, out);
            fprintf(out, "){\n");
            fprintf(out, "token.sub_flags |= 0x%x;\n", flag->emit_sub_flags);
            fprintf(out, "}\n");
        }
    }
}

// Looks up the token kind node registered under `name` and writes the
// assignment of its base kind to token.kind.
internal void
gen_emit__fill_token_base_kind(Token_Kind_Set tokens, String_Const_u8 name, FILE *out){
    Table_Lookup lookup = table_lookup(&tokens.name_to_ptr, make_data(name.str, name.size));
    Assert(lookup.found_match);
    u64 val = 0;
    table_read(&tokens.name_to_ptr, lookup, &val);
    Token_Kind_Node *node = (Token_Kind_Node*)IntAsPtr(val);
    Token_Base_Kind base_kind = node->base_kind;
    // TODO(allen): pretty names for token base kinds?
// (Final statement of gen_emit__fill_token_base_kind, which begins above this span.)
    fprintf(out, "token.kind = %u;\n", base_kind);
}

// Writes the assignments for a directly named token: token.sub_kind gets the
// full token kind name, token.kind the token's base kind.
internal void
gen_emit__direct(Arena *scratch, Token_Kind_Set tokens, String_Const_u8 base_name, FILE *out){
    Temp_Memory temp = begin_temp(scratch);
    char *token_full_name = gen_token_full_name(scratch, base_name);
    fprintf(out, "token.sub_kind = %s;\n", token_full_name);
    gen_emit__fill_token_base_kind(tokens, base_name, out);
    end_temp(temp);
}

// Writes the generated source for every action in `action_list`, in order.
//  scratch:    arena for temporary strings.
//  tokens:     token kind set used to resolve emitted token names.
//  flags:      all model flags (forwarded to the token flag fill).
//  bucket_set: flag buckets; the auto-zero buckets drive ZeroFlags.
//  context:    ActionContext_EndOfFile suppresses the pointer advance of a
//              Consume and the "emit_counter == max" early exit after an Emit.
// Returns ActionContext_EndOfFile if a Consume was encountered while in the
// end-of-file context, otherwise ActionContext_Normal.
internal Action_Context
gen_SLOW_action_list__cont_flow(Arena *scratch, Token_Kind_Set tokens, Flag_Set flags,
                                Flag_Bucket_Set bucket_set,
                                Action_List action_list, Action_Context context, FILE *out){
    Action_Context result_context = ActionContext_Normal;
    for (Action *action = action_list.first;
         action != 0;
         action = action->next){
        switch (action->kind){
            case ActionKind_SetFlag:
            {
                gen_action__set_flag(action->set_flag.flag, action->set_flag.value, out);
            }break;
            
            case ActionKind_ZeroFlags:
            {
                // Zero every variable of the auto-zero bucket of each bind property.
                for (i32 i = 0; i < FlagBindProperty_COUNT; i += 1){
                    Flag_Bucket *bucket = &bucket_set.buckets[i][FlagResetRule_AutoZero];
                    for (i32 j = 0; j < bucket->number_of_variables; j += 1){
                        fprintf(out, "state.%.*s%d = 0;\n", string_expand(bucket->pretty_name), j);
                    }
                }
            }break;
            
            case ActionKind_DelimMarkFirst:
            {
                fprintf(out, "state.delim_first = state.ptr;\n");
            }break;
            
            case ActionKind_DelimMarkOnePastLast:
            {
                fprintf(out, "state.delim_one_past_last = state.ptr;\n");
            }break;
            
            case ActionKind_Consume:
            {
                if (context != ActionContext_EndOfFile){
                    fprintf(out, "state.ptr += 1;\n");
                }
                else{
                    // Nothing is written at end of file; the caller is told instead.
                    result_context = ActionContext_EndOfFile;
                }
            }break;
            
            case ActionKind_Emit:
            {
                Emit_Rule *emit = action->emit_rule;
                
                fprintf(out, "{\n");
                fprintf(out, "Token token = {};\n");
                
                fprintf(out, "token.pos = (i64)(state.emit_ptr - state.base);\n");
                fprintf(out, "token.size = (i64)(state.ptr - state.emit_ptr);\n");
                
                gen_emit__fill_token_flags(flags, bucket_set, out);
                
                // The handlers are written inside do{ ... }while(0) so that a
                // generated "break;" skips the remaining handlers.
                fprintf(out, "do{\n");
                b32 keep_looping = true;
                for (Emit_Handler *handler = emit->first;
                     handler != 0 && keep_looping;
                     handler = handler->next){
                    if (handler->flag_check != 0){
                        fprintf(out, "if (");
                        gen_flag_check__cont_flow(handler->flag_check, true, out);
                        fprintf(out, "){\n");
                    }
                    
                    switch (handler->kind){
                        case EmitHandlerKind_Direct:
                        {
                            gen_emit__direct(scratch, tokens, handler->token_name, out);
                            if (handler->flag_check != 0){
                                fprintf(out, "break;\n");
                            }
                            // An unconditional direct handler always assigns the
                            // token, so later handlers would be unreachable.
                            keep_looping = false;
                        }break;
                        
                        case EmitHandlerKind_Keywords:
                        {
                            // Keyword lookup over the whole token text.
                            Keyword_Set *keywords = handler->keywords;
                            fprintf(out, "Lexeme_Table_Lookup lookup = "
                                    "lexeme_table_lookup("
                                    LANG_NAME_LOWER_STR "_%.*s_hash_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_key_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_value_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_slot_count, "
                                    LANG_NAME_LOWER_STR "_%.*s_seed, "
                                    "state.emit_ptr, token.size);\n",
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name));
                            fprintf(out, "if (lookup.found_match){\n");
                            fprintf(out, "token.kind = lookup.base_kind;\n");
                            fprintf(out, "token.sub_kind = lookup.sub_kind;\n");
                            fprintf(out, "break;\n");
                            fprintf(out, "}\n");
                            if (handler->keywords->has_fallback_token_kind){
                                gen_emit__direct(scratch, tokens, keywords->fallback_name, out);
                                keep_looping = false;
                            }
                        }break;
                        
                        case EmitHandlerKind_KeywordsDelim:
                        {
                            // Keyword lookup over the marked delimiter range only.
                            Keyword_Set *keywords = handler->keywords;
                            fprintf(out, "Lexeme_Table_Lookup lookup = "
                                    "lexeme_table_lookup("
                                    LANG_NAME_LOWER_STR "_%.*s_hash_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_key_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_value_array, "
                                    LANG_NAME_LOWER_STR "_%.*s_slot_count, "
                                    LANG_NAME_LOWER_STR "_%.*s_seed, "
                                    "state.delim_first, (state.delim_one_past_last - state.delim_first));\n",
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name),
                                    string_expand(keywords->pretty_name));
                            fprintf(out, "if (lookup.found_match){\n");
                            fprintf(out, "token.kind = lookup.base_kind;\n");
                            fprintf(out, "token.sub_kind = lookup.sub_kind;\n");
                            fprintf(out, "break;\n");
                            fprintf(out, "}\n");
                            if (handler->keywords->has_fallback_token_kind){
                                gen_emit__direct(scratch, tokens, keywords->fallback_name, out);
                                keep_looping = false;
                            }
                        }break;
                    }
                    
                    if (handler->flag_check != 0){
                        fprintf(out, "}\n");
                        // A flag-guarded handler may not fire at run time, so the
                        // following handlers still have to be generated.
                        keep_looping = true;
                    }
                }
                fprintf(out, "}while(0);\n");
                
                // Emit checks: set or clear a flag depending on the emitted sub kind.
                if (emit->emit_checks.count > 0){
                    fprintf(out, "switch (token.sub_kind){\n");
                    for (Emit_Check *emit_check = emit->emit_checks.first;
                         emit_check != 0;
                         emit_check = emit_check->next){
                        Temp_Memory temp = begin_temp(scratch);
                        char *emit_check_full_name = gen_token_full_name(scratch, emit_check->emit_check);
                        fprintf(out, "case %s:\n", emit_check_full_name);
                        fprintf(out, "{\n");
                        gen_action__set_flag(emit_check->flag, emit_check->value, out);
                        fprintf(out, "}break;\n");
                        end_temp(temp);
                    }
                    fprintf(out, "}\n");
                }
                
                fprintf(out, "token_list_push(arena, list, &token);\n");
                fprintf(out, "emit_counter += 1;\n");
                fprintf(out, "state.emit_ptr = state.ptr;\n");
                if (context != ActionContext_EndOfFile){
                    fprintf(out, "if (emit_counter == max){\n");
                    fprintf(out, "goto end;\n");
                    fprintf(out, "}\n");
                }
                fprintf(out, "}\n");
            }break;
        }
    }
    return(result_context);
}

// Writes one "u<max_bits> <pretty_name><i>;" declaration for each variable
// needed to hold the bucket's flags (bucket->count rounded up to whole
// variables of max_bits bits), and stores that variable count in the bucket.
internal void
gen_flag_declarations__cont_flow(Flag_Bucket *bucket, FILE *out){
    i32 max_bits = bucket->max_bits;
    i32 number_of_flag_variables = (bucket->count + max_bits - 1)/max_bits;
    String_Const_u8 pretty_name = bucket->pretty_name;
    for (i32 i = 0; i < number_of_flag_variables; i += 1){
        fprintf(out, "u%d %.*s%d;\n", max_bits, string_expand(pretty_name), i);
    }
    bucket->number_of_variables = number_of_flag_variables;
}

// Writes one "state_ptr-><pretty_name><i> = 0;" statement per flag variable
// of the bucket, and stores the variable count in the bucket.
internal void
gen_flag_init__cont_flow(Flag_Bucket *bucket, FILE *out){
    i32 max_bits = bucket->max_bits;
    i32 number_of_flag_variables = (bucket->count + max_bits - 1)/max_bits;
    String_Const_u8 pretty_name = bucket->pretty_name;
    for (i32 i = 0; i < number_of_flag_variables; i += 1){
        fprintf(out, "state_ptr->%.*s%d = 0;\n", string_expand(pretty_name), i);
    }
    bucket->number_of_variables = number_of_flag_variables;
}

internal void
gen_bound_flag_fill_lookup__cont_flow(Flag_Bucket *bucket){
    // Fills the lookup fields of every flag in a "bound" bucket. All flags
    // are placed in variable index 0 and their value is taken directly from
    // flag->emit_flags instead of being assigned a fresh bit.
    i32 counter = 0;
    for (Flag_Ptr_Node *node = bucket->first; node != 0; node = node->next, counter += 1){
        Flag *flag = node->flag;
        flag->base_name = bucket->pretty_name;
        flag->number = counter;
        flag->index = 0;
        flag->value = flag->emit_flags;
    }
}

// Fills the lookup fields of every flag in a "free" bucket. Flags are packed
// max_bits per variable: flag number n lives in variable n/max_bits and owns
// bit n % max_bits of it.
internal void
gen_flag_fill_lookup__cont_flow(Flag_Bucket *bucket){
    i32 max_bits = bucket->max_bits;
    i32 counter = 0;
    for (Flag_Ptr_Node *node = bucket->first; node != 0; node = node->next, counter += 1){
        Flag *flag = node->flag;
        flag->base_name = bucket->pretty_name;
        flag->number = counter;
        flag->index = counter/max_bits;
        // Shift an unsigned one: with max_bits == 32 the shift count reaches
        // 31, which must not be applied to a signed int.
        flag->value = ((u32)1 << (counter % max_bits));
    }
}

// Writes the complete control-flow lexer for the model to out:
//   - struct Lex_State_<Lang> (flag variables plus the input pointers),
//   - lex_full_input_<lang>_init,
//   - lex_full_input_<lang>_breaks (one labelled block per state),
//   - lex_full_input_<lang> (init + a single _breaks call with max_u64).
//
// scratch - arena for all temporary data; bracketed by begin_temp/end_temp.
// tokens  - forwarded to the action list generator.
// model   - the state machine; it is copied with opt_copy_model first and
//           only the copy is renumbered and rewritten below.
// out     - destination of the generated text.
internal void
gen_contiguous_control_flow_lexer(Arena *scratch, Token_Kind_Set tokens, Lexer_Model model, FILE *out){
    Temp_Memory temp = begin_temp(scratch);
    
    model = opt_copy_model(scratch, model);
    
    opt_flags_set_numbers(model);
    opt_states_set_numbers(model);
    
    Input_Set cut_inputs = smi_input_set_construct_eof(scratch);
    Field_Set cut_fields = smi_field_set_construct(scratch);
    Condition_Set cut_set = smi_condition(scratch, cut_inputs, cut_fields);
    
    // Split EOFs and insert at beginning
    // Each condition-set transition is intersected with cut_set. The part
    // that intersects is pushed to the front of the rebuilt list, the
    // remainder (if any) becomes a new transition with copied actions at the
    // back, and transitions that do not intersect keep their relative order
    // at the back.
    for (State *state = model.states.first; state != 0; state = state->next){
        Transition_List *transitions = &state->transitions;
        if (transitions->first->condition.kind == TransitionCaseKind_ConditionSet){
            Transition *first = 0;
            Transition *last = 0;
            i32 count = 0;
            
            // next is captured up front because trans is relinked into the
            // new list inside the loop body.
            for (Transition *trans = transitions->first, *next = 0; trans != 0; trans = next){
                next = trans->next;
                
                Assert(trans->condition.kind == TransitionCaseKind_ConditionSet);
                Condition_Set original = trans->condition.condition_set;
                Condition_Set condition_int = smi_condition_set_intersect(scratch, original, cut_set);
                if (condition_int.count == 0){
                    zdll_push_back(first, last, trans);
                    count += 1;
                }
                else{
                    trans->condition.condition_set = condition_int;
                    zdll_push_front(first, last, trans);
                    count += 1;
                    
                    Condition_Set condition_sub = smi_condition_set_subtract(scratch, original, cut_set);
                    if (condition_sub.count > 0){
                        Transition *new_trans = push_array(scratch, Transition, 1);
                        zdll_push_back(first, last, new_trans);
                        count += 1;
                        new_trans->parent_state = state;
                        new_trans->condition.kind = TransitionCaseKind_ConditionSet;
                        new_trans->condition.condition_set = condition_sub;
                        new_trans->activation_actions = opt_copy_action_list(scratch, trans->activation_actions);
                        new_trans->dst_state = trans->dst_state;
                    }
                }
            }
            
            state->transitions.first = first;
            state->transitions.last = last;
            state->transitions.count = count;
        }
    }
    
    // Four flag buckets: {Free, Bound} x {AutoZero, KeepState}. Free buckets
    // use 32 bit variables, bound buckets use 16 bit variables.
    Flag_Bucket_Set bucket_set = {};
    bucket_set.buckets[FlagBindProperty_Free][FlagResetRule_AutoZero].pretty_name = string_u8_litexpr("flags_ZF");
    bucket_set.buckets[FlagBindProperty_Free][FlagResetRule_AutoZero].max_bits = 32;
    bucket_set.buckets[FlagBindProperty_Free][FlagResetRule_KeepState].pretty_name = string_u8_litexpr("flags_KF");
    bucket_set.buckets[FlagBindProperty_Free][FlagResetRule_KeepState].max_bits = 32;
    bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_AutoZero].pretty_name = string_u8_litexpr("flags_ZB");
    bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_AutoZero].max_bits = 16;
    bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_KeepState].pretty_name = string_u8_litexpr("flags_KB");
    bucket_set.buckets[FlagBindProperty_Bound][FlagResetRule_KeepState].max_bits = 16;
    
    // A flag is "bound" when it has emit_flags, otherwise "free"; together
    // with its reset rule that selects the bucket it is queued on.
    for (Flag *flag = model.flags.first; flag != 0; flag = flag->next){
        Flag_Reset_Rule reset_rule = flag->reset_rule;
        Flag_Bind_Property bind_property = (flag->emit_flags != 0)?FlagBindProperty_Bound:FlagBindProperty_Free;
        
        Flag_Bucket *bucket = &bucket_set.buckets[bind_property][reset_rule];
        Flag_Ptr_Node *node = push_array(scratch, Flag_Ptr_Node, 1);
        sll_queue_push(bucket->first, bucket->last, node);
        bucket->count += 1;
        node->flag = flag;
    }
    
    for (i32 i = 0; i < FlagBindProperty_COUNT; i += 1){
        for (i32 j = 0; j < FlagResetRule_COUNT; j += 1){
            if (i == FlagBindProperty_Bound){
                gen_bound_flag_fill_lookup__cont_flow(&bucket_set.buckets[i][j]);
            }
            else{
                gen_flag_fill_lookup__cont_flow(&bucket_set.buckets[i][j]);
            }
        }
    }
    
    // Generated: the lexer state struct.
    fprintf(out, "struct Lex_State_" LANG_NAME_CAMEL_STR "{\n");
    for (i32 i = 0; i < FlagBindProperty_COUNT; i += 1){
        for (i32 j = 0; j < FlagResetRule_COUNT; j += 1){
            gen_flag_declarations__cont_flow(&bucket_set.buckets[i][j], out);
        }
    }
    fprintf(out, "u8 *base;\n");
    fprintf(out, "u8 *delim_first;\n");
    fprintf(out, "u8 *delim_one_past_last;\n");
    fprintf(out, "u8 *emit_ptr;\n");
    fprintf(out, "u8 *ptr;\n");
    fprintf(out, "u8 *opl_ptr;\n");
    fprintf(out, "};\n");
    
    // Generated: the init function (all flags zero, all pointers at the start
    // of the input, opl_ptr one past its last byte).
    fprintf(out, "internal void\n");
    fprintf(out, "lex_full_input_" LANG_NAME_LOWER_STR "_init(Lex_State_"
            LANG_NAME_CAMEL_STR " *state_ptr, String_Const_u8 input){\n");
    for (i32 i = 0; i < FlagBindProperty_COUNT; i += 1){
        for (i32 j = 0; j < FlagResetRule_COUNT; j += 1){
            gen_flag_init__cont_flow(&bucket_set.buckets[i][j], out);
        }
    }
    fprintf(out, "state_ptr->base = input.str;\n");
    fprintf(out, "state_ptr->delim_first = input.str;\n");
    fprintf(out, "state_ptr->delim_one_past_last = input.str;\n");
    fprintf(out, "state_ptr->emit_ptr = input.str;\n");
    fprintf(out, "state_ptr->ptr = input.str;\n");
    fprintf(out, "state_ptr->opl_ptr = input.str + input.size;\n");
    fprintf(out, "}\n");
    
    // Generated: the resumable lexing function. It works on a local copy of
    // the state and copies it back at the "end:" label.
    fprintf(out, "internal b32\n");
    fprintf(out, "lex_full_input_" LANG_NAME_LOWER_STR "_breaks("
            "Arena *arena, Token_List *list, Lex_State_"
            LANG_NAME_CAMEL_STR " *state_ptr, u64 max){\n");
    fprintf(out, "b32 result = false;\n");
    fprintf(out, "u64 emit_counter = 0;\n");
    fprintf(out, "Lex_State_" LANG_NAME_CAMEL_STR " state;\n");
    fprintf(out, "block_copy_struct(&state, state_ptr);\n");
    
    // One generated "{ state_label_<n>: ... }" block per state. The kind of
    // the state's first transition decides how the whole state is generated.
    for (State *state = model.states.first; state != 0; state = state->next){
        fprintf(out, "{\n");
        fprintf(out, "state_label_%d: // %.*s\n", state->number, string_expand(state->pretty_name));
        
        Transition_List *transitions = &state->transitions;
        Transition *trans = transitions->first;
        
        Transition_Case_Kind state_trans_kind = trans->condition.kind;
        
        switch (state_trans_kind){
            default:
            {
                InvalidPath;
            }break;
            
            case TransitionCaseKind_DelimMatch:
            {
                // A delimiter-match state has exactly a success transition
                // followed by a failure transition.
                Transition *success_trans = trans;
                Transition *failure_trans = trans->next;
                Assert(failure_trans->condition.kind == TransitionCaseKind_DelimMatchFail);
                
                fprintf(out, "u64 delim_length = state.delim_one_past_last - state.delim_first;\n");
                fprintf(out, "u64 parse_length = 0;\n");
                fprintf(out, "for (;;){\n");
                {
                    fprintf(out, "if (parse_length == delim_length){\n");
                    {
                        gen_SLOW_action_list__cont_flow(scratch, tokens, model.flags, bucket_set,
                                                        success_trans->activation_actions,
                                                        ActionContext_Normal, out);
                        gen_goto_dst_state__cont_flow(success_trans, ActionContext_Normal, out);
                    }
                    fprintf(out, "}\n");
                    fprintf(out, "if (state.ptr == state.opl_ptr){\n");
                    {
                        // NOTE(review): this end-of-input branch runs the
                        // failure actions but jumps to the success
                        // destination. That pairing looks unintended
                        // (failure_trans would be the symmetric choice), but
                        // changing it alters every generated lexer, so it is
                        // left as is - confirm the intent before touching it.
                        gen_SLOW_action_list__cont_flow(scratch, tokens, model.flags, bucket_set,
                                                        failure_trans->activation_actions,
                                                        ActionContext_Normal, out);
                        gen_goto_dst_state__cont_flow(success_trans, ActionContext_Normal, out);
                    }
                    fprintf(out, "}\n");
                    
                    fprintf(out, "if (*state.ptr == state.delim_first[parse_length]){\n");
                    fprintf(out, "state.ptr += 1;\n");
                    fprintf(out, "parse_length += 1;\n");
                    fprintf(out, "}\n");
                    fprintf(out, "else{\n");
                    {
                        gen_SLOW_action_list__cont_flow(scratch, tokens, model.flags, bucket_set,
                                                        failure_trans->activation_actions,
                                                        ActionContext_Normal, out);
                        gen_goto_dst_state__cont_flow(failure_trans, ActionContext_Normal, out);
                    }
                    fprintf(out, "}\n");
                }
                fprintf(out, "}\n");
            }break;
            
            case TransitionCaseKind_ConditionSet:
            {
                // First the end-of-input handling: the leading run of
                // EOF-only transitions (moved to the front by the split
                // above) is generated inside "if (state.ptr == state.opl_ptr)".
                // trans is left pointing at the first non-EOF transition.
                {
                    fprintf(out, "if (state.ptr == state.opl_ptr){\n");
                    for (; trans != 0; trans = trans->next){
                        if (opt_condition_is_eof_only(trans->condition)){
                            Assert(trans->condition.condition_set.count == 1);
                            Condition_Node *node = trans->condition.condition_set.first;
                            fprintf(out, "if (");
                            gen_SLOW_field_set_check__cont_flow(node->fields, out);
                            fprintf(out, "){\n");
                            // The action generator reports back whether a
                            // consume was requested at end of file; that
                            // context is forwarded to the goto generator.
                            Action_Context action_ctx = ActionContext_EndOfFile;
                            action_ctx = gen_SLOW_action_list__cont_flow(scratch, tokens, model.flags, bucket_set,
                                                                         trans->activation_actions, action_ctx, out);
                            gen_goto_dst_state__cont_flow(trans, action_ctx, out);
                            fprintf(out, "}\n");
                        }
                        else{
                            break;
                        }
                    }
                    fprintf(out, "}\n");
                }
                
                // The remaining transitions are grouped by input byte and
                // generated as a switch on the current byte.
                Grouped_Input_Handler_List group_list = opt_grouped_input_handlers(scratch, trans);
                
                fprintf(out, "switch (*state.ptr){\n");
                for (Grouped_Input_Handler *group = group_list.first; group != 0; group = group->next){
                    // The group with the most inputs becomes "default:"; the
                    // others list their bytes, seven case labels per line.
                    if (group == group_list.group_with_biggest_input_set){
                        fprintf(out, "default:\n");
                    }
                    else{
                        i32 input_count = group->input_count;
                        u8 *inputs = group->inputs;
                        for (i32 i = 0; i < input_count; i += 1){
                            fprintf(out, "case 0x%02x:", inputs[i]);
                            if ((i % 7) == 6 || i + 1 == input_count){
                                fprintf(out, "\n");
                            }
                        }
                    }
                    
                    fprintf(out, "{\n");
                    for (Partial_Transition *partial = group->partial_transitions.first; partial != 0; partial = partial->next){
                        // Every partial transition except the last one is
                        // guarded by its field check; the last one is
                        // generated unconditionally.
                        if (partial->next != 0){
                            fprintf(out, "if (");
                            gen_SLOW_field_set_check__cont_flow(partial->fields, out);
                            fprintf(out, "){\n");
                        }
                        
                        {
                            gen_SLOW_action_list__cont_flow(scratch, tokens, model.flags, bucket_set,
                                                            partial->actions, ActionContext_Normal, out);
                            gen_goto_state__cont_flow(partial->dst_state, ActionContext_Normal, out);
                        }
                        
                        if (partial->next != 0){
                            fprintf(out, "}\n");
                        }
                    }
                    fprintf(out, "}break;\n");
                }
                
                fprintf(out, "}\n");
            }break;
        }
        
        fprintf(out, "}\n");
    }
    
    fprintf(out, "end:;\n");
    fprintf(out, "block_copy_struct(state_ptr, &state);\n");
    fprintf(out, "return(result);\n");
    fprintf(out, "}\n");
    
    // Generated: the one-shot wrapper that lexes a whole input.
    fprintf(out, "internal Token_List\n");
    fprintf(out, "lex_full_input_" LANG_NAME_LOWER_STR "(Arena *arena, String_Const_u8 input){\n");
    fprintf(out, "Lex_State_" LANG_NAME_CAMEL_STR " state = {};\n");
    fprintf(out, "lex_full_input_" LANG_NAME_LOWER_STR "_init(&state, input);\n");
    fprintf(out, "Token_List list = {};\n");
    fprintf(out, "lex_full_input_" LANG_NAME_LOWER_STR "_breaks(arena, &list, &state, max_u64);\n");
    fprintf(out, "return(list);\n");
    fprintf(out, "}\n");
    
    end_temp(temp);
}

////////////////////////////////

// NOTE(review): the header names of these two directives were missing in the
// reviewed copy. They are restored as <stdio.h> and <time.h> because the code
// below calls fopen/fread/printf and time() - confirm against the repository.
#include <stdio.h>
#include <time.h>

// Defined by the language specific file that includes this generator; main
// calls it to build the lexer model.
internal void
build_language_model(void);

internal String_Const_u8
file_read_all(Arena *arena, FILE *file){
    String_Const_u8
result = {}; fseek(file, 0, SEEK_END); result.size = ftell(file); fseek(file, 0, SEEK_SET); result.str = push_array(arena, u8, result.size + 1); fread(result.str, result.size, 1, file); result.str[result.size] = 0; return(result); } int main(void){ pcg32_srandom(time(0), time(0)); Base_Allocator *allocator = get_allocator_malloc(); sm_helper_init(allocator); build_language_model(); Lexer_Primary_Context *ctx = &helper_ctx.primary_ctx; // NOTE(allen): Type checking // DelimMatch only with a single catch-all fallback, no peeks. // Remove the declaration of states and flags? // Flag bindings are one to one //////////////////////////////// // NOTE(allen): High level reorganization of state machine opt_set_auto_zero_flags_on_root(ctx); opt_transfer_state_actions_to_transitions(ctx); //////////////////////////////// // NOTE(allen): High level optimization opt_simplify_transitions(ctx); opt_mark_all_states_excluded(ctx); opt_include_reachable_states(ctx->model.root); opt_discard_all_excluded_states(ctx); opt_merge_redundant_transitions_in_each_state(ctx); opt_skip_past_thunk_states(ctx); opt_mark_all_states_excluded(ctx); opt_include_reachable_states(ctx->model.root); opt_discard_all_excluded_states(ctx); opt_remove_peeks_without_creating_transition_splits(ctx); opt_mark_all_states_excluded(ctx); opt_include_reachable_states(ctx->model.root); opt_discard_all_excluded_states(ctx); opt_remove_peeks_into_single_entry_point_states(ctx); opt_discard_all_excluded_states(ctx); opt_states_set_numbers(ctx->model); //////////////////////////////// // NOTE(allen): Debug inspection of model #if 0 opt_flags_set_numbers(ctx->model); debug_print_transitions(ctx); #endif //////////////////////////////// // NOTE(allen): Arrange input files and output files String_Const_u8 path_to_self = string_u8_litexpr(__FILE__); path_to_self = string_remove_last_folder(path_to_self); String_Const_u8 hand_written_h_name = push_u8_stringf(&ctx->arena, "%.*s4coder_lex_gen_hand_written.h", 
string_expand(path_to_self)); String_Const_u8 hand_written_name = push_u8_stringf(&ctx->arena, "%.*s4coder_lex_gen_hand_written.cpp", string_expand(path_to_self)); FILE *hand_written_h_file = fopen((char*)hand_written_h_name.str, "rb"); if (hand_written_h_file == 0){ printf("error: could not open 4coder_lex_gen_hand_written.h\n"); exit(1); } String_Const_u8 hand_written_h = file_read_all(&ctx->arena, hand_written_h_file); fclose(hand_written_h_file); FILE *hand_written_file = fopen((char*)hand_written_name.str , "rb"); if (hand_written_file == 0){ printf("error: could not open 4coder_lex_gen_hand_written.cpp\n"); exit(1); } String_Const_u8 hand_written = file_read_all(&ctx->arena, hand_written_file); fclose(hand_written_file); String_Const_u8 path_to_src = string_remove_last_folder(path_to_self); String_Const_u8 out_h_name = push_u8_stringf(&ctx->arena, "%.*sgenerated/lexer_" LANG_NAME_LOWER_STR ".h", string_expand(path_to_src)); String_Const_u8 out_cpp_name = push_u8_stringf(&ctx->arena, "%.*sgenerated/lexer_" LANG_NAME_LOWER_STR ".cpp", string_expand(path_to_src)); FILE *out_h_file = fopen((char*)out_h_name.str, "wb"); if (out_h_file == 0){ printf("error: could not open output file %.*s\n", string_expand(out_h_name)); exit(1); } FILE *out_cpp_file = fopen((char*)out_cpp_name.str, "wb"); if (out_cpp_file == 0){ printf("error: could not open output file %.*s\n", string_expand(out_cpp_name)); exit(1); } //////////////////////////////// // NOTE(allen): Code generation fprintf(out_h_file, "%s\n", hand_written_h.str); gen_tokens(&ctx->arena, ctx->tokens, out_h_file); fprintf(out_cpp_file, "%s\n", hand_written.str); for (Keyword_Set *set = ctx->keywords.first; set != 0; set = set->next){ gen_keyword_table(&ctx->arena, ctx->tokens, *set, out_cpp_file); } gen_contiguous_control_flow_lexer(&ctx->arena, ctx->tokens, ctx->model, out_cpp_file); fclose(out_h_file); fclose(out_cpp_file); printf("%.*s:1:\n", string_expand(out_h_name)); printf("%.*s:1:\n", 
string_expand(out_cpp_name)); // NOTE(allen): Simplifying the state machine // Isolate the state machine's parts into small L.U.T. then generate tables? // If using L.U.T: Optimize all action lists that don't contain a "consume" action // NOTE(allen): State machine generation // Implementation: Control Flow // Feature: Fully Contiguous input // // Implementation: L.U.T. Accelerated // // Feature: Spatially chunked input // Feature: Temporally chunked input return(0); } // BOTTOM