more work on the new lexer

master
Allen Webster 2016-03-10 11:28:44 -05:00
parent 19dd5af51a
commit 52e74f7a4f
2 changed files with 419 additions and 39 deletions

View File

@ -6,6 +6,9 @@
#include "../4cpp_lexer_types.h" #include "../4cpp_lexer_types.h"
namespace new_lex{
//
#define lexer_link static #define lexer_link static
@ -77,13 +80,29 @@ cpp_shift_token_starts(Cpp_Token_Stack *stack, int from_token_i, int shift_amoun
// Lex_State enumerates the states of the token state machine driven by
// cpp_lex_nonalloc. The lexer resets to LS_default at the start of every
// token and moves to one of the other states based on the characters read.
//
// Existing enumerators keep their original order so their integer values are
// unchanged. LS_float is appended last: the LS_number case assigns it when a
// '.' follows a run of digits, but it was never declared here.
enum Lex_State{
    LS_default,               // start of a token; dispatches on the first character
    LS_identifier,            // entered on a letter or '_'
    LS_char,                  // entered on '\''
    LS_string,                // entered on '"'
    LS_number,                // entered on a digit '1'..'9'
    LS_comment_pre,           // saw '/': may become a comment, "/=" or plain '/'
    LS_comment,               // inside a "//" comment
    LS_comment_block,         // inside a "/*" comment
    LS_comment_block_ending,  // inside a block comment; falls back to LS_comment_block by default
    LS_dot,                   // saw '.'
    LS_ellipsis,              // saw ".."
    LS_less,                  // saw '<'
    LS_less_less,             // saw "<<"
    LS_more,                  // saw '>'
    LS_more_more,             // saw ">>"
    LS_minus,                 // saw '-'
    LS_arrow,                 // saw "->"
    LS_and,                   // saw '&'
    LS_or,                    // saw '|'
    LS_plus,                  // saw '+'
    LS_colon,                 // saw ':'
    LS_star,                  // saw '*'
    LS_modulo,                // saw '%'
    LS_caret,                 // saw '^'
    LS_float,                 // saw '.' while in LS_number
};
struct Lex_Data{ struct Lex_Data{
@ -109,29 +128,69 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *
int emit_token = 0; int emit_token = 0;
char c; char c = 0;
chunk -= file_absolute_pos; chunk -= file_absolute_pos;
for (; pos < end_pos && token_i < max_token_i; ++pos){ for (; pos < end_pos && token_i < max_token_i;){
for (; pos < end_pos;){
c = chunk[pos++]; c = chunk[pos];
if (!(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v')) break;
if (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v'){
for (; pos < end_pos;){
c = chunk[pos++];
if (!(c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f' || c == '\v')) break;
}
--pos;
} }
--pos;
lex_data.token_start = pos; lex_data.token_start = pos;
state = LS_default; state = LS_default;
emit_token = 0; emit_token = 0;
for (; emit_token == 0 && pos < end_pos;){ for (; emit_token == 0 && pos <= end_pos;){
c = chunk[pos++]; if (pos < end_pos){
c = chunk[pos++];
}
else{
c = 0;
++pos;
}
switch (state){ switch (state){
case LS_default: case LS_default:
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'){
state = LS_identifier;
}
else if (c >= '1' && c <= '9'){
state = LS_number;
}
else
switch (c){ switch (c){
case '\'': state = LS_char; break;
case '"': state = LS_string; break;
case '/': state = LS_comment_pre; break; case '/': state = LS_comment_pre; break;
case '.': state = LS_dot; break;
case '<': state = LS_less; break;
case '>': state = LS_more; break;
case '-': state = LS_minus; break;
case '&': state = LS_and; break;
case '|': state = LS_or; break;
case '+': state = LS_plus; break;
case ':': state = LS_colon; break;
case '*': state = LS_star; break;
case '%': state = LS_modulo; break;
case '^': state = LS_caret; break;
#define OperCase(op,type) case op: emit_token = 1; break; #define OperCase(op,type) case op: emit_token = 1; break;
OperCase('{', CPP_TOKEN_BRACE_OPEN); OperCase('{', CPP_TOKEN_BRACE_OPEN);
OperCase('}', CPP_TOKEN_BRACE_CLOSE); OperCase('}', CPP_TOKEN_BRACE_CLOSE);
@ -144,30 +203,83 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *
OperCase('~', CPP_TOKEN_TILDE); OperCase('~', CPP_TOKEN_TILDE);
OperCase(',', CPP_TOKEN_COMMA); OperCase(',', CPP_TOKEN_COMMA);
OperCase(';', CPP_TOKEN_SEMICOLON);
OperCase('?', CPP_TOKEN_TERNARY_QMARK); OperCase('?', CPP_TOKEN_TERNARY_QMARK);
#undef OperCase #undef OperCase
#if 0
case '.': state = LS_dot; break;
case '<': state = LS_less; break;
case '>': state = LS_more; break;
#endif
} }
break; break;
case LS_identifier:
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')){
emit_token = 1;
}
break;
case LS_char:
// TODO
break;
case LS_string:
// TODO
break;
case LS_number:
if (c >= '0' && c <= '9'){
state = LS_number;
}
else if (c == '.'){
state = LS_float;
}
break;
case LS_dot: case LS_dot:
switch (c){
case '.': state = LS_ellipsis; break;
case '*': emit_token = 1; break;
default: emit_token = 1; break;
}
break; break;
case LS_ellipsis:
emit_token = 1;
break;
case LS_less: case LS_less:
switch (c){
case '<': state = LS_less_less; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break; break;
case LS_less_less:
switch (c){
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_more: case LS_more:
switch (c){
case '>': state = LS_more_more; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break; break;
case LS_more_more:
switch (c){
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_comment_pre: case LS_comment_pre:
switch (c){ switch (c){
case '/': state = LS_comment; break; case '/': state = LS_comment; break;
case '*': state = LS_comment_block; break; case '*': state = LS_comment_block; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
} }
break; break;
@ -190,16 +302,83 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *
default: state = LS_comment_block; break; default: state = LS_comment_block; break;
} }
break; break;
case LS_minus:
switch (c){
case '>': state = LS_arrow; break;
case '-': emit_token = 1; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_arrow:
switch (c){
case '*': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_and:
switch (c){
case '&': emit_token = 1; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_or:
switch (c){
case '|': emit_token = 1; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_plus:
switch (c){
case '+': emit_token = 1; break;
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_colon:
switch (c){
case ':': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_star:
switch (c){
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_modulo:
switch (c){
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
case LS_caret:
switch (c){
case '=': emit_token = 1; break;
default: emit_token = 1; break;
}
break;
} }
} }
if (emit_token){ if (emit_token){
--pos;
lex_data.token_end = pos; lex_data.token_end = pos;
switch (state){ switch (state){
case LS_default: case LS_default:
switch (chunk[pos]){ switch (c){
#define OperCase(op,t) case op: token.type = t; break; #define OperCase(op,t) case op: token.type = t; break;
OperCase('{', CPP_TOKEN_BRACE_OPEN); OperCase('{', CPP_TOKEN_BRACE_OPEN);
OperCase('}', CPP_TOKEN_BRACE_CLOSE); OperCase('}', CPP_TOKEN_BRACE_CLOSE);
@ -212,30 +391,235 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *
OperCase('~', CPP_TOKEN_TILDE); OperCase('~', CPP_TOKEN_TILDE);
OperCase(',', CPP_TOKEN_COMMA); OperCase(',', CPP_TOKEN_COMMA);
OperCase(';', CPP_TOKEN_SEMICOLON);
OperCase('?', CPP_TOKEN_TERNARY_QMARK); OperCase('?', CPP_TOKEN_TERNARY_QMARK);
#undef OperCase #undef OperCase
} }
token.flags = CPP_TFLAG_IS_OPERATOR; token.flags = CPP_TFLAG_IS_OPERATOR;
break; break;
case LS_identifier:
token.type = CPP_TOKEN_IDENTIFIER;
token.flags = 0;
--lex_data.token_end;
--pos;
break;
case LS_comment_pre:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_DIVEQ; break;
default:
token.type = CPP_TOKEN_DIV;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_comment: case LS_comment_block_ending: case LS_comment: case LS_comment_block_ending:
token.type = CPP_TOKEN_COMMENT; token.type = CPP_TOKEN_COMMENT;
token.flags = 0; token.flags = 0;
c = chunk[pos]; c = chunk[--lex_data.token_end];
while (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == '\f'){ while (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == '\f'){
--pos; --lex_data.token_end;
c = chunk[pos]; c = chunk[lex_data.token_end];
}
++lex_data.token_end;
break;
case LS_dot:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '*': token.type = CPP_TOKEN_PTRDOT; break;
default:
token.type = CPP_TOKEN_DOT;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_ellipsis:
switch (c){
case '.':
token.flags = CPP_TFLAG_IS_OPERATOR;
token.type = CPP_TOKEN_ELLIPSIS;
break;
default:
token.type = CPP_TOKEN_JUNK;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_less:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_LESSEQ; break;
default:
token.type = CPP_TOKEN_LESS;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_less_less:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_LSHIFTEQ; break;
default:
token.type = CPP_TOKEN_LSHIFT;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_more:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_GRTREQ; break;
default:
token.type = CPP_TOKEN_GRTR;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_more_more:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_RSHIFTEQ; break;
default:
token.type = CPP_TOKEN_RSHIFT;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_minus:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '-': token.type = CPP_TOKEN_DECREMENT; break;
case '=': token.type = CPP_TOKEN_SUBEQ; break;
default:
token.type = CPP_TOKEN_MINUS;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_arrow:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '*': token.type = CPP_TOKEN_PTRARROW; break;
default:
token.type = CPP_TOKEN_ARROW;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_and:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '&': token.type = CPP_TOKEN_AND; break;
case '=': token.type = CPP_TOKEN_ANDEQ; break;
default:
token.type = CPP_TOKEN_AMPERSAND;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_or:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '|': token.type = CPP_TOKEN_OR; break;
case '=': token.type = CPP_TOKEN_OREQ; break;
default:
token.type = CPP_TOKEN_BIT_OR;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_plus:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '+': token.type = CPP_TOKEN_INCREMENT; break;
case '=': token.type = CPP_TOKEN_ADDEQ; break;
default:
token.type = CPP_TOKEN_PLUS;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_colon:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case ':': token.type = CPP_TOKEN_SCOPE; break;
default:
token.type = CPP_TOKEN_COLON;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_star:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_MULEQ; break;
default:
token.type = CPP_TOKEN_STAR;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_modulo:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '=': token.type = CPP_TOKEN_MODEQ; break;
default:
token.type = CPP_TOKEN_MOD;
--lex_data.token_end;
--pos;
break;
}
break;
case LS_caret:
token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
case '^': token.type = CPP_TOKEN_XOREQ; break;
default:
token.type = CPP_TOKEN_BIT_XOR;
--lex_data.token_end;
--pos;
break;
} }
++pos;
break; break;
} }
token.start = lex_data.token_start; token.start = lex_data.token_start;
token.size = pos - lex_data.token_start; token.size = lex_data.token_end - lex_data.token_start;
token.state_flags = pp_state; token.state_flags = pp_state;
out_tokens[token_i++] = token; out_tokens[token_i++] = token;
pos = lex_data.token_end;
} }
} }
@ -245,6 +629,8 @@ cpp_lex_nonalloc(char *chunk, int file_absolute_pos, int size, Cpp_Token_Stack *
return(lex_data); return(lex_data);
} }
}
#endif #endif
// BOTTOM // BOTTOM

View File

@ -10,20 +10,14 @@
// TOP // TOP
#include "../4ed_meta.h" #include "../4ed_meta.h"
#define FCPP_STRING_IMPLEMENTATION #define FCPP_STRING_IMPLEMENTATION
#include "../4coder_string.h" #include "../4coder_string.h"
#include "../4cpp_types.h" #include "../4cpp_types.h"
#include "../4cpp_lexer_types.h" #include "../4cpp_lexer_types.h"
#define FCPP_LEXER_IMPLEMENTATION #define FCPP_LEXER_IMPLEMENTATION
#include "../4cpp_lexer.h" #include "../4cpp_lexer.h"
namespace new_lex{
#include "4cpp_new_lexer.h" #include "4cpp_new_lexer.h"
}
#include <windows.h> #include <windows.h>
@ -238,8 +232,8 @@ run_experiment(Experiment *exp, char *filename){
if (correct->start != testing->start || correct->size != testing->size){ if (correct->start != testing->start || correct->size != testing->size){
pass = 0; pass = 0;
printf("token range mismatch at token %d\n" printf("token range mismatch at token %d\n"
"\t%d:%d original %d:%d testing\n" " %d:%d original %d:%d testing\n"
"\t%.*s original %.*s testing\n", " %.*s original %.*s testing\n",
j, j,
correct->start, correct->size, testing->start, testing->size, correct->start, correct->size, testing->start, testing->size,
correct->size, file_cpp.data + correct->start, correct->size, file_cpp.data + correct->start,