// 4coder/test/fsm_table_generator.cpp
/*
* FSM table generator:
* Generate FSM tables as ".c" files from FSM functions.
*
* 23.03.2016 (dd.mm.yyyy)
*/
// TOP
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
2016-03-24 14:01:53 +00:00
#include <assert.h>
// Element count of a true array (not a pointer): total size over the size of element 0.
#define ArrayCount(a) (sizeof(a)/sizeof((a)[0]))
2016-03-24 01:05:28 +00:00
#include "4cpp_lexer_fsms.h"
// Step the whitespace-skipping FSM by one character.
//
// A '\n' resets a non-default pp_state back to LSPP_default. Any character
// other than ' ', '\n', '\t', '\r', '\f' or '\v' sets white_done to 1.
// The updated machine is returned by value.
Whitespace_FSM
whitespace_skip_fsm(Whitespace_FSM wfsm, char c){
    if (c == '\n' && wfsm.pp_state != LSPP_default){
        wfsm.pp_state = LSPP_default;
    }
    
    switch (c){
        case ' ': case '\n': case '\t':
        case '\r': case '\f': case '\v':
        // still inside whitespace: nothing to record
        break;
        
        default:
        wfsm.white_done = 1;
        break;
    }
    
    return(wfsm);
}
// Step the integer-suffix FSM by one character.
//
// fsm.int_state walks through the LSINT_* states as suffix letters
// ('u'/'U', 'l', 'L') arrive. Any character that does not continue the
// suffix from the current state sets fsm.emit_token to 1 and leaves
// int_state alone. LSINT_extra emits on every character.
Lex_FSM
int_fsm(Lex_FSM fsm, char c){
    int is_unsigned_mark = (c == 'u' || c == 'U');
    
    switch (fsm.int_state){
        case LSINT_default:
        if (is_unsigned_mark)  fsm.int_state = LSINT_u;
        else if (c == 'l')     fsm.int_state = LSINT_l;
        else if (c == 'L')     fsm.int_state = LSINT_L;
        else                   fsm.emit_token = 1;
        break;
        
        case LSINT_u:
        if (c == 'l')          fsm.int_state = LSINT_ul;
        else if (c == 'L')     fsm.int_state = LSINT_uL;
        else                   fsm.emit_token = 1;
        break;
        
        case LSINT_l:
        if (c == 'l')              fsm.int_state = LSINT_ll;
        else if (is_unsigned_mark) fsm.int_state = LSINT_extra;
        else                       fsm.emit_token = 1;
        break;
        
        case LSINT_L:
        if (c == 'L')              fsm.int_state = LSINT_ll;
        else if (is_unsigned_mark) fsm.int_state = LSINT_extra;
        else                       fsm.emit_token = 1;
        break;
        
        case LSINT_ul:
        if (c == 'l')          fsm.int_state = LSINT_extra;
        else                   fsm.emit_token = 1;
        break;
        
        case LSINT_uL:
        if (c == 'L')          fsm.int_state = LSINT_extra;
        else                   fsm.emit_token = 1;
        break;
        
        case LSINT_ll:
        if (is_unsigned_mark)  fsm.int_state = LSINT_extra;
        else                   fsm.emit_token = 1;
        break;
        
        case LSINT_extra:
        fsm.emit_token = 1;
        break;
    }
    
    return(fsm);
}
// Step the main lexer FSM by one input byte.
//
// fsm      - current machine; only fsm.state and fsm.emit_token are modified.
// pp_state - an LSPP_* value selecting the rule set: LSPP_error and
//            LSPP_include have their own rules, every other value uses the
//            general rules in the default branch.
// c        - the input byte. A 0 byte always sets emit_token and nothing else.
//
// Returns the updated machine by value. emit_token is set to 1 when the
// byte makes the machine stop in its current state; otherwise fsm.state may
// move to another LS_* / LSINC_* value or stay where it is.
//
// Several switch statements list a specific character and a default that do
// the same thing (both set emit_token). They are kept on purpose: they record
// which two-character operators exist even though this function treats both
// paths alike.
Lex_FSM
main_fsm(Lex_FSM fsm, unsigned char pp_state, unsigned char c){
    if (c == 0){
        fsm.emit_token = 1;
    }
    else{
        switch (pp_state){
            case LSPP_error:
            // The rest of the line is one error-message token.
            fsm.state = LS_error_message;
            if (c == '\n') fsm.emit_token = 1;
            break;
            
            case LSPP_include:
            switch (fsm.state){
                case LSINC_default:
                switch (c){
                    case '"': fsm.state = LSINC_quotes; break;
                    case '<': fsm.state = LSINC_pointy; break;
                    default: fsm.state = LSINC_junk; break;
                }
                break;
                
                case LSINC_quotes:
                if (c == '"') fsm.emit_token = 1;
                break;
                
                case LSINC_pointy:
                if (c == '>') fsm.emit_token = 1;
                break;
                
                case LSINC_junk:
                if (c == '\n') fsm.emit_token = 1;
                break;
            }
            break;
            
            default:
            switch (fsm.state){
                case LS_default:
                if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'){
                    fsm.state = LS_identifier;
                }
                else if (c >= '1' && c <= '9'){
                    fsm.state = LS_number;
                }
                else if (c == '0'){
                    fsm.state = LS_number0;
                }
                else switch (c){
                    case '\'': fsm.state = LS_char; break;
                    case '"': fsm.state = LS_string; break;
                    
                    case '/': fsm.state = LS_comment_pre; break;
                    
                    case '.': fsm.state = LS_dot; break;
                    
                    case '<': fsm.state = LS_less; break;
                    case '>': fsm.state = LS_more; break;
                    
                    case '-': fsm.state = LS_minus; break;
                    
                    case '&': fsm.state = LS_and; break;
                    case '|': fsm.state = LS_or; break;
                    
                    case '+': fsm.state = LS_plus; break;
                    
                    case ':': fsm.state = LS_colon; break;
                    
                    case '*': fsm.state = LS_star; break;
                    
                    case '%': fsm.state = LS_modulo; break;
                    case '^': fsm.state = LS_caret; break;
                    
                    case '=': fsm.state = LS_eq; break;
                    case '!': fsm.state = LS_bang; break;
                    
                    case '#': fsm.state = LS_pound; break;
                    
                    // Single-character tokens emit immediately. The token
                    // type argument is unused by this expansion.
#define OperCase(op,type) case op: fsm.emit_token = 1; break;
                    OperCase('{', CPP_TOKEN_BRACE_OPEN);
                    OperCase('}', CPP_TOKEN_BRACE_CLOSE);
                    
                    OperCase('[', CPP_TOKEN_BRACKET_OPEN);
                    OperCase(']', CPP_TOKEN_BRACKET_CLOSE);
                    
                    OperCase('(', CPP_TOKEN_PARENTHESE_OPEN);
                    OperCase(')', CPP_TOKEN_PARENTHESE_CLOSE);
                    
                    OperCase('~', CPP_TOKEN_TILDE);
                    OperCase(',', CPP_TOKEN_COMMA);
                    OperCase(';', CPP_TOKEN_SEMICOLON);
                    OperCase('?', CPP_TOKEN_TERNARY_QMARK);
                    
                    OperCase('@', CPP_TOKEN_JUNK);
                    OperCase('$', CPP_TOKEN_JUNK);
                    OperCase('\\', CPP_TOKEN_JUNK);
#undef OperCase
                }
                break;
                
                case LS_identifier:
                if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')){
                    fsm.emit_token = 1;
                }
                break;
                
                case LS_pound:
                if (pp_state == LSPP_default){
                    if (c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v'){
                        fsm.state = LS_pound;
                    }
                    else if (c == '\n'){
                        fsm.emit_token = 1;
                    }
                    else{
                        fsm.state = LS_pp;
                    }
                }
                else{
                    switch (c){
                        case '#': fsm.emit_token = 1; break;
                        default: fsm.emit_token = 1; break;
                    }
                }
                break;
                
                case LS_pp:
                if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')){
                    fsm.emit_token = 1;
                }
                break;
                
                case LS_char:
                case LS_char_multiline:
                switch(c){
                    case '\'': fsm.emit_token = 1; break;
                    case '\\': fsm.state = LS_char_slashed; break;
                }
                break;
                
                case LS_char_slashed:
                switch (c){
                    // stay in the slashed state across '\r', '\f', '\v'
                    case '\r': case '\f': case '\v': break;
                    case '\n': fsm.state = LS_char_multiline; break;
                    default: fsm.state = LS_char; break;
                }
                break;
                
                case LS_string:
                case LS_string_multiline:
                switch(c){
                    case '\"': fsm.emit_token = 1; break;
                    case '\\': fsm.state = LS_string_slashed; break;
                }
                break;
                
                case LS_string_slashed:
                switch (c){
                    // stay in the slashed state across '\r', '\f', '\v'
                    case '\r': case '\f': case '\v': break;
                    case '\n': fsm.state = LS_string_multiline; break;
                    default: fsm.state = LS_string; break;
                }
                break;
                
                case LS_number:
                if (c >= '0' && c <= '9'){
                    fsm.state = LS_number;
                }
                else{
                    switch (c){
                        case '.': fsm.state = LS_float; break;
                        default: fsm.emit_token = 1; break;
                    }
                }
                break;
                
                case LS_number0:
                if (c >= '0' && c <= '9'){
                    fsm.state = LS_number;
                }
                else if (c == 'x'){
                    fsm.state = LS_hex;
                }
                else if (c == '.'){
                    fsm.state = LS_float;
                }
                else{
                    fsm.emit_token = 1;
                }
                break;
                
                case LS_float:
                if (!(c >= '0' && c <= '9')){
                    switch (c){
                        case 'e': fsm.state = LS_crazy_float0; break;
                        default: fsm.emit_token = 1; break;
                    }
                }
                break;
                
                case LS_crazy_float0:
                {
                    if ((c >= '0' && c <= '9') || c == '-'){
                        fsm.state = LS_crazy_float1;
                    }
                    else{
                        fsm.emit_token = 1;
                    }
                }
                break;
                
                case LS_crazy_float1:
                {
                    if (!(c >= '0' && c <= '9')){
                        fsm.emit_token = 1;
                    }
                }
                break;
                
                case LS_hex:
                if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))){
                    fsm.emit_token = 1;
                }
                break;
                
                case LS_dot:
                if (c >= '0' && c <= '9'){
                    fsm.state = LS_float;
                }
                else{
                    switch (c){
                        case '.': fsm.state = LS_ellipsis; break;
                        case '*': fsm.emit_token = 1; break;
                        default: fsm.emit_token = 1; break;
                    }
                }
                break;
                
                case LS_ellipsis: fsm.emit_token = 1; break;
                
                case LS_less:
                switch (c){
                    case '<': fsm.state = LS_less_less; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_less_less:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_more:
                switch (c){
                    case '>': fsm.state = LS_more_more; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_more_more:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_comment_pre:
                switch (c){
                    case '/': fsm.state = LS_comment; break;
                    case '*': fsm.state = LS_comment_block; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_comment:
                switch (c){
                    case '\\': fsm.state = LS_comment_slashed; break;
                    case '\n': fsm.emit_token = 1; break;
                }
                break;
                
                case LS_comment_slashed:
                switch (c){
                    // stay in the slashed state across '\r', '\f', '\v'
                    case '\r': case '\f': case '\v': break;
                    default: fsm.state = LS_comment; break;
                }
                break;
                
                case LS_comment_block:
                switch (c){
                    case '*': fsm.state = LS_comment_block_ending; break;
                }
                break;
                
                case LS_comment_block_ending:
                switch (c){
                    case '*': fsm.state = LS_comment_block_ending; break;
                    case '/': fsm.emit_token = 1; break;
                    default: fsm.state = LS_comment_block; break;
                }
                break;
                
                case LS_minus:
                switch (c){
                    case '>': fsm.state = LS_arrow; break;
                    case '-': fsm.emit_token = 1; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_arrow:
                switch (c){
                    case '*': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_and:
                switch (c){
                    case '&': fsm.emit_token = 1; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_or:
                switch (c){
                    case '|': fsm.emit_token = 1; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_plus:
                switch (c){
                    case '+': fsm.emit_token = 1; break;
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_colon:
                switch (c){
                    case ':': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_star:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_modulo:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_caret:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_eq:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
                
                case LS_bang:
                switch (c){
                    case '=': fsm.emit_token = 1; break;
                    default: fsm.emit_token = 1; break;
                }
                break;
            }
            break;
        }
    }
    return(fsm);
}
2016-03-24 14:01:53 +00:00
// Write the opening line of an array definition named
// "<group_name>_<table_name>" whose element type is "unsigned <type>".
// The strings are only read, so they are taken as const char* and string
// literals can be passed without a deprecated/invalid conversion.
void
begin_table(FILE *file, const char *type, const char *group_name, const char *table_name){
    fprintf(file, "unsigned %s %s_%s[] = {\n", type, group_name, table_name);
}
2016-03-24 01:05:28 +00:00
// Write the opening line of an array definition named "<table_name>" whose
// element type is "unsigned <type>". The strings are only read, so they are
// taken as const char* and string literals can be passed safely.
void
begin_table(FILE *file, const char *type, const char *table_name){
    fprintf(file, "unsigned %s %s[] = {\n", type, table_name);
}
2016-03-24 14:01:53 +00:00
// Write the opening line of an array of pointers named "<table_name>" whose
// elements are "unsigned <type> *". The strings are only read, so they are
// taken as const char* and string literals can be passed safely.
void
begin_ptr_table(FILE *file, const char *type, const char *table_name){
    fprintf(file, "unsigned %s * %s[] = {\n", type, table_name);
}
2016-03-24 01:05:28 +00:00
// Write one numeric table entry: the value right-aligned in at least two
// columns, followed by a comma.
void
do_table_item(FILE *file, unsigned short item){
    fprintf(file, "%2d,", (int)item);
}
// Write one textual table entry: item immediately followed by tail and a
// comma. Both strings are only read, so they are taken as const char* and
// string literals can be passed safely.
void
do_table_item_direct(FILE *file, const char *item, const char *tail){
    fprintf(file, "%s%s,", item, tail);
}
// Terminate the current row of table items with a single newline.
void
end_row(FILE *file){
    fputc('\n', file);
}
// Close the array definition: closing brace, semicolon, and a blank line.
void
end_table(FILE *file){
    fputs("};\n\n", file);
}
2016-03-24 14:01:53 +00:00
// Result of running one FSM over every (input byte, state) pair, plus the
// compressed form obtained by grouping bytes that behave identically.
struct FSM_Tables{
// 256 rows (one per input byte) of state_count entries each; an entry is
// new_state + state_count * done_flag (emit_token / white_done).
unsigned char *full_transition_table;
// marks[c] is 1 when byte c was found identical to an earlier byte and
// already assigned to that byte's class; the first 256 entries are used.
unsigned char *marks;
// eq_class[c] is the equivalence-class index of input byte c.
unsigned char *eq_class;
// eq_class_rep[k] is the first (lowest) byte that belongs to class k.
unsigned char *eq_class_rep;
// eq_class_counter rows of state_count entries: one row per class, using
// the same entry encoding as full_transition_table.
unsigned char *reduced_transition_table;
// Number of equivalence classes found.
unsigned char eq_class_counter;
// Number of FSM states, i.e. the length of each table row.
unsigned short state_count;
};
// Build the transition tables for whitespace_skip_fsm.
//
// 1. Evaluate the FSM for every input byte (0..255) and every pp state
//    (0..LSPP_count-1). Each result is stored as
//    new pp_state + state_count * white_done.
// 2. Group input bytes whose 'state_count'-entry rows are identical into
//    equivalence classes, remembering the first byte of each class.
// 3. Build the reduced table: one row per class, taken from the row of the
//    class representative.
//
// All buffers in the returned FSM_Tables are allocated with malloc and are
// owned by the caller. The process exits with status 1 if an allocation fails.
FSM_Tables
generate_whitespace_skip_table(){
    unsigned char state_count = LSPP_count;
    
    FSM_Tables table;
    table.full_transition_table = (unsigned char*)malloc(state_count * 256);
    table.marks = (unsigned char*)malloc(state_count * 256);
    table.eq_class = (unsigned char*)malloc(state_count * 256);
    table.eq_class_rep = (unsigned char*)malloc(state_count * 256);
    table.reduced_transition_table = 0;
    table.state_count = state_count;
    
    if (!table.full_transition_table || !table.marks ||
        !table.eq_class || !table.eq_class_rep){
        fprintf(stderr, "generate_whitespace_skip_table: out of memory\n");
        exit(1);
    }
    
    memset(table.marks, 0, 256);
    
    // Pass 1: full table, one row per input byte.
    int i = 0;
    Whitespace_FSM wfsm = {0};
    Whitespace_FSM new_wfsm;
    for (unsigned short c = 0; c < 256; ++c){
        for (unsigned char state = 0; state < state_count; ++state){
            wfsm.pp_state = state;
            wfsm.white_done = 0;
            new_wfsm = whitespace_skip_fsm(wfsm, (unsigned char)c);
            table.full_transition_table[i++] = new_wfsm.pp_state + state_count*new_wfsm.white_done;
        }
    }
    
    // Pass 2: every byte not yet claimed by an earlier class starts a new
    // class and claims all later bytes with an identical row.
    table.eq_class_counter = 0;
    unsigned char *c_line = table.full_transition_table;
    for (unsigned short c = 0; c < 256; ++c){
        if (table.marks[c] == 0){
            table.eq_class[c] = table.eq_class_counter;
            table.eq_class_rep[table.eq_class_counter] = (unsigned char)c;
            unsigned char *c2_line = c_line + state_count;
            for (unsigned short c2 = c + 1; c2 < 256; ++c2){
                if (memcmp(c_line, c2_line, state_count) == 0){
                    table.marks[c2] = 1;
                    table.eq_class[c2] = table.eq_class_counter;
                }
                c2_line += state_count;
            }
            ++table.eq_class_counter;
        }
        c_line += state_count;
    }
    
    // Pass 3: reduced table. The row of a class is exactly the full-table
    // row of its representative byte, so it is copied rather than recomputed.
    table.reduced_transition_table = (unsigned char*)malloc(state_count * table.eq_class_counter);
    if (!table.reduced_transition_table){
        fprintf(stderr, "generate_whitespace_skip_table: out of memory\n");
        exit(1);
    }
    for (unsigned short eq = 0; eq < table.eq_class_counter; ++eq){
        memcpy(table.reduced_transition_table + eq*state_count,
               table.full_transition_table + table.eq_class_rep[eq]*state_count,
               state_count);
    }
    
    return(table);
}
// Build the transition tables for main_fsm under one preprocessor state.
//
// pp_state - the LSPP_* value passed through to main_fsm for every call.
//
// 1. Evaluate main_fsm for every input byte (0..255) and every lexer state
//    (0..LS_count-1). Each result is stored as
//    new state + state_count * emit_token.
// 2. Group input bytes whose 'state_count'-entry rows are identical into
//    equivalence classes, remembering the first byte of each class.
// 3. Build the reduced table: one row per class, taken from the row of the
//    class representative.
//
// All buffers in the returned FSM_Tables are allocated with malloc and are
// owned by the caller. The process exits with status 1 if an allocation fails.
FSM_Tables
generate_fsm_table(unsigned char pp_state){
    unsigned char state_count = LS_count;
    
    FSM_Tables table;
    table.full_transition_table = (unsigned char*)malloc(state_count * 256);
    table.marks = (unsigned char*)malloc(state_count * 256);
    table.eq_class = (unsigned char*)malloc(state_count * 256);
    table.eq_class_rep = (unsigned char*)malloc(state_count * 256);
    table.reduced_transition_table = 0;
    table.state_count = state_count;
    
    if (!table.full_transition_table || !table.marks ||
        !table.eq_class || !table.eq_class_rep){
        fprintf(stderr, "generate_fsm_table: out of memory\n");
        exit(1);
    }
    
    memset(table.marks, 0, 256);
    
    // Pass 1: full table, one row per input byte.
    int i = 0;
    Lex_FSM fsm = {0};
    Lex_FSM new_fsm;
    for (unsigned short c = 0; c < 256; ++c){
        for (unsigned char state = 0; state < state_count; ++state){
            fsm.state = state;
            fsm.emit_token = 0;
            new_fsm = main_fsm(fsm, pp_state, (unsigned char)c);
            table.full_transition_table[i++] = new_fsm.state + state_count*new_fsm.emit_token;
        }
    }
    
    // Pass 2: every byte not yet claimed by an earlier class starts a new
    // class and claims all later bytes with an identical row.
    table.eq_class_counter = 0;
    unsigned char *c_line = table.full_transition_table;
    for (unsigned short c = 0; c < 256; ++c){
        if (table.marks[c] == 0){
            table.eq_class[c] = table.eq_class_counter;
            table.eq_class_rep[table.eq_class_counter] = (unsigned char)c;
            unsigned char *c2_line = c_line + state_count;
            for (unsigned short c2 = c + 1; c2 < 256; ++c2){
                if (memcmp(c_line, c2_line, state_count) == 0){
                    table.marks[c2] = 1;
                    table.eq_class[c2] = table.eq_class_counter;
                }
                c2_line += state_count;
            }
            ++table.eq_class_counter;
        }
        c_line += state_count;
    }
    
    // Pass 3: reduced table. The row of a class is exactly the full-table
    // row of its representative byte, so it is copied rather than recomputed.
    table.reduced_transition_table = (unsigned char*)malloc(state_count * table.eq_class_counter);
    if (!table.reduced_transition_table){
        fprintf(stderr, "generate_fsm_table: out of memory\n");
        exit(1);
    }
    for (unsigned short eq = 0; eq < table.eq_class_counter; ++eq){
        memcpy(table.reduced_transition_table + eq*state_count,
               table.full_transition_table + table.eq_class_rep[eq]*state_count,
               state_count);
    }
    
    return(table);
}
2016-03-24 01:05:28 +00:00
2016-03-24 14:01:53 +00:00
void
render_fsm_table(FILE *file, FSM_Tables tables, char *group_name){
begin_table(file, "short", group_name, "eq_classes");
2016-03-24 01:05:28 +00:00
for (unsigned short c = 0; c < 256; ++c){
2016-03-24 14:01:53 +00:00
do_table_item(file, tables.eq_class[c]*tables.state_count);
2016-03-24 01:05:28 +00:00
}
end_row(file);
end_table(file);
2016-03-24 14:01:53 +00:00
fprintf(file, "const int num_%s_eq_classes = %d;\n\n", group_name, tables.eq_class_counter);
int i = 0;
begin_table(file, "char", group_name, "table");
for (unsigned short c = 0; c < tables.eq_class_counter; ++c){
for (unsigned char state = 0; state < tables.state_count; ++state){
do_table_item(file, tables.reduced_transition_table[i++]);
}
end_row(file);
}
end_table(file);
}
// Pairs a preprocessor state with the identifier prefix used for the tables
// generated for that state.
struct PP_Names{
// LSPP_* value handed to generate_fsm_table.
unsigned char pp_state;
// Group name handed to render_fsm_table; also used to build the
// "<name>_eq_classes" and "<name>_table" entries of the pointer tables.
char *name;
};
// One entry per preprocessor state for which main() generates tables.
// Order matters: main() asserts that the entry at index i has pp_state == i,
// and the emitted get_eq_classes / get_table pointer arrays follow this order.
PP_Names pp_names[] = {
{LSPP_default, "main_fsm"},
{LSPP_include, "pp_include_fsm"},
{LSPP_macro_identifier, "pp_macro_fsm"},
{LSPP_identifier, "pp_identifier_fsm"},
{LSPP_body_if, "pp_body_if_fsm"},
{LSPP_body, "pp_body_fsm"},
{LSPP_number, "pp_number_fsm"},
{LSPP_error, "pp_error_fsm"},
{LSPP_junk, "pp_junk_fsm"},
};
int main(){
FILE *file;
file = fopen("4cpp_lexer_tables.c", "wb");
2016-03-24 01:05:28 +00:00
2016-03-24 14:01:53 +00:00
FSM_Tables wtables = generate_whitespace_skip_table();
render_fsm_table(file, wtables, "whitespace_fsm");
2016-03-24 01:05:28 +00:00
2016-03-24 14:01:53 +00:00
begin_table(file, "char", "multiline_state_table");
2016-03-24 01:05:28 +00:00
for (unsigned char state = 0; state < LS_count; ++state){
2016-03-24 14:01:53 +00:00
do_table_item(file, (state == LS_string_multiline || state == LS_char_multiline));
}
end_row(file);
end_table(file);
for (int i = 0; i < ArrayCount(pp_names); ++i){
assert(i == pp_names[i].pp_state);
FSM_Tables tables = generate_fsm_table(pp_names[i].pp_state);
render_fsm_table(file, tables, pp_names[i].name);
}
begin_ptr_table(file, "short", "get_eq_classes");
for (int i = 0; i < ArrayCount(pp_names); ++i){
do_table_item_direct(file, pp_names[i].name, "_eq_classes");
2016-03-24 01:05:28 +00:00
end_row(file);
2016-03-24 14:01:53 +00:00
}
2016-03-24 01:05:28 +00:00
end_table(file);
2016-03-24 14:01:53 +00:00
begin_ptr_table(file, "char", "get_table");
for (int i = 0; i < ArrayCount(pp_names); ++i){
do_table_item_direct(file, pp_names[i].name, "_table");
2016-03-24 01:05:28 +00:00
end_row(file);
}
end_table(file);
fclose(file);
return(0);
}
// BOTTOM