486 lines
14 KiB
C
486 lines
14 KiB
C
|
|
// TOP
|
|
|
|
#ifndef FCPP_LEXER_TYPES_INC
|
|
#define FCPP_LEXER_TYPES_INC
|
|
|
|
// NOTE: ENUM(type,name) makes `name` an alias of the integer `type`, then
// begins the declaration of `enum name##_`; the enumerator list in braces
// follows the macro use.  A definition of ENUM made before this point wins.
#if !defined(ENUM)
#define ENUM(type,name) typedef type name; enum name##_
#endif
|
|
|
|
// NOTE: ENUM_INTERNAL has the same default expansion as ENUM: `name` becomes
// an alias of the integer `type` and `enum name##_` carries the constants.
// A definition of ENUM_INTERNAL made before this point wins.
#if !defined(ENUM_INTERNAL)
#define ENUM_INTERNAL(type,name) typedef type name; enum name##_
#endif
|
|
|
|
// NOTE: struct_internal defaults to the plain `struct` keyword.
// A definition of struct_internal made before this point wins.
#if !defined(struct_internal)
#define struct_internal struct
#endif
|
|
|
|
/* DOC(A Cpp_Token_Type classifies a token to make parsing easier. Some types are not
|
|
actually output by the lexer, but exist because parsers will also make use of token
|
|
types in their own output.) */
|
|
ENUM(uint32_t, Cpp_Token_Type){
|
|
|
|
CPP_TOKEN_JUNK = 0,
|
|
CPP_TOKEN_COMMENT = 1,
|
|
|
|
CPP_PP_INCLUDE = 2,
|
|
CPP_PP_DEFINE = 3,
|
|
CPP_PP_UNDEF = 4,
|
|
CPP_PP_IF = 5,
|
|
CPP_PP_IFDEF = 6,
|
|
CPP_PP_IFNDEF = 7,
|
|
CPP_PP_ELSE = 8,
|
|
CPP_PP_ELIF = 9,
|
|
CPP_PP_ENDIF = 10,
|
|
CPP_PP_ERROR = 11,
|
|
CPP_PP_IMPORT = 12,
|
|
CPP_PP_USING = 13,
|
|
CPP_PP_LINE = 14,
|
|
CPP_PP_PRAGMA = 15,
|
|
CPP_PP_STRINGIFY = 16,
|
|
CPP_PP_CONCAT = 17,
|
|
CPP_PP_UNKNOWN = 18,
|
|
|
|
CPP_PP_DEFINED = 19,
|
|
CPP_PP_INCLUDE_FILE = 20,
|
|
CPP_PP_ERROR_MESSAGE = 21,
|
|
|
|
CPP_TOKEN_KEY_TYPE = 22,
|
|
CPP_TOKEN_KEY_MODIFIER = 23,
|
|
CPP_TOKEN_KEY_QUALIFIER = 24,
|
|
/* DOC(This type is not stored in token output from the lexer.) */
|
|
CPP_TOKEN_KEY_OPERATOR = 25,
|
|
CPP_TOKEN_KEY_CONTROL_FLOW = 26,
|
|
CPP_TOKEN_KEY_CAST = 27,
|
|
CPP_TOKEN_KEY_TYPE_DECLARATION = 28,
|
|
CPP_TOKEN_KEY_ACCESS = 29,
|
|
CPP_TOKEN_KEY_LINKAGE = 30,
|
|
CPP_TOKEN_KEY_OTHER = 31,
|
|
|
|
CPP_TOKEN_IDENTIFIER = 32,
|
|
CPP_TOKEN_INTEGER_CONSTANT = 33,
|
|
CPP_TOKEN_CHARACTER_CONSTANT = 34,
|
|
CPP_TOKEN_FLOATING_CONSTANT = 35,
|
|
CPP_TOKEN_STRING_CONSTANT = 36,
|
|
CPP_TOKEN_BOOLEAN_CONSTANT = 37,
|
|
|
|
CPP_TOKEN_STATIC_ASSERT = 38,
|
|
|
|
CPP_TOKEN_BRACKET_OPEN = 39,
|
|
CPP_TOKEN_BRACKET_CLOSE = 40,
|
|
CPP_TOKEN_PARENTHESE_OPEN = 41,
|
|
CPP_TOKEN_PARENTHESE_CLOSE = 42,
|
|
CPP_TOKEN_BRACE_OPEN = 43,
|
|
CPP_TOKEN_BRACE_CLOSE = 44,
|
|
CPP_TOKEN_SEMICOLON = 45,
|
|
CPP_TOKEN_ELLIPSIS = 46,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_STAR = 47,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_AMPERSAND = 48,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_TILDE = 49,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_PLUS = 50,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_MINUS = 51,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_INCREMENT = 52,
|
|
|
|
/* DOC(This is an 'ambiguous' token type because it requires
|
|
parsing to determine the full nature of the token.) */
|
|
CPP_TOKEN_DECREMENT = 53,
|
|
|
|
// NOTE(allen): Precedence 1, LtoR
|
|
CPP_TOKEN_SCOPE = 54,
|
|
|
|
// NOTE(allen): Precedence 2, LtoR
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSTINC = 55,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSTDEC = 56,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_FUNC_STYLE_CAST = 57,
|
|
CPP_TOKEN_CPP_STYLE_CAST = 58,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_CALL = 59,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_INDEX = 60,
|
|
CPP_TOKEN_DOT = 61,
|
|
CPP_TOKEN_ARROW = 62,
|
|
|
|
// NOTE(allen): Precedence 3, RtoL
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_PREINC = 63,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_PREDEC = 64,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_POSITIVE = 65,
|
|
/* DOC(This token is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_NEGAITVE = 66,
|
|
CPP_TOKEN_NOT = 67,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_BIT_NOT = 68,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_CAST = 69,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_DEREF = 70,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_TYPE_PTR = 71,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_ADDRESS = 72,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_TYPE_REF = 73,
|
|
CPP_TOKEN_SIZEOF = 74,
|
|
CPP_TOKEN_ALIGNOF = 75,
|
|
CPP_TOKEN_DECLTYPE = 76,
|
|
CPP_TOKEN_TYPEID = 77,
|
|
CPP_TOKEN_NEW = 78,
|
|
CPP_TOKEN_DELETE = 79,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_NEW_ARRAY = 80,
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_DELETE_ARRAY = 81,
|
|
|
|
// NOTE(allen): Precedence 4, LtoR
|
|
CPP_TOKEN_PTRDOT = 82,
|
|
CPP_TOKEN_PTRARROW = 83,
|
|
|
|
// NOTE(allen): Precedence 5, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_MUL = 84,
|
|
CPP_TOKEN_DIV = 85,
|
|
CPP_TOKEN_MOD = 86,
|
|
|
|
// NOTE(allen): Precedence 6, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_ADD = 87,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_SUB = 88,
|
|
|
|
// NOTE(allen): Precedence 7, LtoR
|
|
CPP_TOKEN_LSHIFT = 89,
|
|
CPP_TOKEN_RSHIFT = 90,
|
|
|
|
// NOTE(allen): Precedence 8, LtoR
|
|
CPP_TOKEN_LESS = 91,
|
|
CPP_TOKEN_GRTR = 92,
|
|
CPP_TOKEN_GRTREQ = 93,
|
|
CPP_TOKEN_LESSEQ = 94,
|
|
|
|
// NOTE(allen): Precedence 9, LtoR
|
|
CPP_TOKEN_EQEQ = 95,
|
|
CPP_TOKEN_NOTEQ = 96,
|
|
|
|
// NOTE(allen): Precedence 10, LtoR
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_BIT_AND = 97,
|
|
|
|
// NOTE(allen): Precedence 11, LtoR
|
|
CPP_TOKEN_BIT_XOR = 98,
|
|
|
|
// NOTE(allen): Precedence 12, LtoR
|
|
CPP_TOKEN_BIT_OR = 99,
|
|
|
|
// NOTE(allen): Precedence 13, LtoR
|
|
CPP_TOKEN_AND = 100,
|
|
|
|
// NOTE(allen): Precedence 14, LtoR
|
|
CPP_TOKEN_OR = 101,
|
|
|
|
// NOTE(allen): Precedence 15, RtoL
|
|
CPP_TOKEN_TERNARY_QMARK = 102,
|
|
CPP_TOKEN_COLON = 103,
|
|
CPP_TOKEN_THROW = 104,
|
|
CPP_TOKEN_EQ = 105,
|
|
CPP_TOKEN_ADDEQ = 106,
|
|
CPP_TOKEN_SUBEQ = 107,
|
|
CPP_TOKEN_MULEQ = 108,
|
|
CPP_TOKEN_DIVEQ = 109,
|
|
CPP_TOKEN_MODEQ = 110,
|
|
CPP_TOKEN_LSHIFTEQ = 111,
|
|
CPP_TOKEN_RSHIFTEQ = 112,
|
|
CPP_TOKEN_ANDEQ = 113,
|
|
CPP_TOKEN_OREQ = 114,
|
|
CPP_TOKEN_XOREQ = 115,
|
|
|
|
// NOTE(allen): Precedence 16, LtoR
|
|
CPP_TOKEN_COMMA = 116,
|
|
|
|
/* DOC(This type is for parser use, it is not output by the lexer.) */
|
|
CPP_TOKEN_EOF = 117,
|
|
|
|
CPP_TOKEN_TYPE_COUNT = 118
|
|
};
|
|
|
|
/* DOC(Cpp_Token represents a single lexed token.
|
|
It is the primary output of the lexing system.)
|
|
DOC_SEE(Cpp_Token_Flag) */
|
|
struct Cpp_Token{
|
|
/* DOC(The type field indicates the type of the token.
|
|
All tokens have a type no matter the circumstances.) */
|
|
Cpp_Token_Type type;
|
|
|
|
/* DOC(The start field indicates the index of the first character
|
|
of this token's lexeme.) */
|
|
int32_t start;
|
|
|
|
/* DOC(The size field indicates the number of bytes in this token's lexeme.) */
|
|
int32_t size;
|
|
|
|
/* DOC(The state_flags should not be used outside of the lexer's implementation.) */
|
|
uint16_t state_flags;
|
|
|
|
/* DOC(The flags field contains extra useful information about the token.) */
|
|
uint16_t flags;
|
|
};
|
|
|
|
/* DOC(The Cpp_Token_Flags are used to mark up tokens with additional information.) */
|
|
ENUM(uint16_t, Cpp_Token_Flag){
|
|
/* DOC(Indicates that the token is a preprocessor directive.) */
|
|
CPP_TFLAG_PP_DIRECTIVE = 0x1,
|
|
|
|
/* DOC(Indicates that the token is on the line of a preprocessor directive.) */
|
|
CPP_TFLAG_PP_BODY = 0x2,
|
|
|
|
/* DOC(Indicates that the token spans across multiple lines. This can show up
|
|
on line comments and string literals with back slash line continuation. ) */
|
|
CPP_TFLAG_MULTILINE = 0x4,
|
|
|
|
/* DOC(Indicates that the token is some kind of operator or punctuation like braces.) */
|
|
CPP_TFLAG_IS_OPERATOR = 0x8,
|
|
|
|
/* DOC(Indicates that the token is a keyword.) */
|
|
CPP_TFLAG_IS_KEYWORD = 0x10
|
|
};
|
|
|
|
/* DOC(Cpp_Token_Array is used to bundle together the common elements
|
|
of a growing array of Cpp_Tokens. To initialize it the tokens field should
|
|
point to a block of memory with a size equal to max_count*sizeof(Cpp_Token)
|
|
and the count should be initialized to zero.) */
|
|
struct Cpp_Token_Array{
|
|
/* DOC(The tokens field points to the memory used to store the array of tokens.) */
|
|
Cpp_Token *tokens;
|
|
|
|
/* DOC(The count field counts how many tokens in the array are currently used.) */
|
|
int32_t count;
|
|
|
|
/* DOC(The max_count field specifies the maximum size the count field may grow to before
|
|
the tokens array is out of space.) */
|
|
int32_t max_count;
|
|
};
|
|
|
|
static Cpp_Token_Array null_cpp_token_array = {0};
|
|
|
|
/* DOC(Cpp_Get_Token_Result is what the cpp_get_token call returns.)
DOC_SEE(cpp_get_token) */
struct Cpp_Get_Token_Result {
    /* DOC(Indicates which token answers the query. To get the token from
    the source array CODE_EXAMPLE(array.tokens[result.token_index])) */
    int32_t token_index;
    
    /* DOC(True when the query position was actually in whitespace after
    the result token.) */
    int32_t in_whitespace;
    
    /* DOC(The start value of the token when token_index refers to an actual
    token. Otherwise this is zero.) */
    int32_t token_start;
    
    /* DOC(The start+size value of the token when token_index refers to an
    actual token. Otherwise this is zero.) */
    int32_t token_end;
};
|
|
|
|
/* DOC(Cpp_Relex_Range is what the cpp_get_relex_range call returns.)
DOC_SEE(cpp_get_relex_range) */
struct Cpp_Relex_Range {
    /* DOC(Index of the first token in the unedited array that needs to be
    relexed.) */
    int32_t start_token_index;
    
    /* DOC(Index of the first token in the unedited array after the edited
    range that may not need to be relexed. Sometimes a relex operation has to
    lex past this position to find a token that is not affected by the edit.) */
    int32_t end_token_index;
};
|
|
|
|
struct_internal Cpp_Lex_FSM{
|
|
uint8_t state;
|
|
uint8_t int_state;
|
|
uint8_t emit_token;
|
|
uint8_t multi_line;
|
|
};
|
|
static Cpp_Lex_FSM null_lex_fsm = {0};
|
|
|
|
/* DOC(Cpp_Lex_Data represents the state of the lexer so that the system may be resumable
|
|
and the user can manage the lexer state and decide when to resume lexing with it. To create
|
|
a new lexer state call cpp_lex_data_init.
|
|
|
|
The internals of the lex state should not be treated as a part of the public API.)
|
|
DOC_SEE(cpp_lex_data_init)
|
|
HIDE_MEMBERS() */
|
|
struct Cpp_Lex_Data{
|
|
char tb[32];
|
|
int32_t tb_pos;
|
|
int32_t token_start;
|
|
|
|
int32_t pos;
|
|
int32_t pos_overide;
|
|
int32_t chunk_pos;
|
|
|
|
Cpp_Lex_FSM fsm;
|
|
uint8_t white_done;
|
|
uint8_t pp_state;
|
|
uint8_t completed;
|
|
|
|
Cpp_Token token;
|
|
|
|
int32_t __pc__;
|
|
};
|
|
|
|
/* DOC(Cpp_Lex_Result is returned from the lexing engine to indicate why it stopped lexing.) */
|
|
ENUM(int32_t, Cpp_Lex_Result){
|
|
/* DOC(This indicates that the system got to the end of the file and will not accept more input.) */
|
|
LexResult_Finished = 0,
|
|
|
|
/* DOC(This indicates that the system got to the end of an input chunk and is ready to receive the
|
|
next input chunk.) */
|
|
LexResult_NeedChunk = 1,
|
|
|
|
/* DOC(This indicates that the output array ran out of space to store tokens and needs to be
|
|
replaced or expanded before continuing.) */
|
|
LexResult_NeedTokenMemory = 2,
|
|
|
|
/* DOC(This indicates that the maximum number of output tokens as specified by the user was hit.) */
|
|
LexResult_HitTokenLimit = 3,
|
|
};
|
|
|
|
/* DOC(Cpp_Relex_Data represents the state of the relexer so that the system may be resumable.
|
|
To create a new relex state call cpp_relex_init.)
|
|
DOC_SEE(cpp_relex_init)
|
|
HIDE_MEMBERS()*/
|
|
struct Cpp_Relex_Data{
|
|
Cpp_Lex_Data lex;
|
|
|
|
Cpp_Token end_token;
|
|
|
|
int32_t relex_start_position;
|
|
int32_t start_token_index;
|
|
int32_t end_token_index;
|
|
int32_t original_end_token_index;
|
|
|
|
int32_t character_shift_amount;
|
|
|
|
Cpp_Lex_Result result_state;
|
|
|
|
int32_t __pc__;
|
|
};
|
|
|
|
// NOTE: Values are implicit and sequential from 0, so CPP_LEX_PP_COUNT, which
// must stay last, equals the number of states listed before it.
// NOTE(review): Cpp_Lex_PP_State lists similarly named states in a different
// order, so values of the two enums do not correspond one-to-one.
ENUM_INTERNAL(uint16_t, Cpp_Preprocessor_State){
    CPP_LEX_PP_DEFAULT,
    CPP_LEX_PP_IDENTIFIER, CPP_LEX_PP_MACRO_IDENTIFIER,
    CPP_LEX_PP_INCLUDE,
    CPP_LEX_PP_BODY, CPP_LEX_PP_BODY_IF,
    CPP_LEX_PP_NUMBER,
    CPP_LEX_PP_ERROR,
    CPP_LEX_PP_JUNK,
    
    CPP_LEX_PP_COUNT
};
|
|
|
|
ENUM_INTERNAL(uint8_t, Cpp_Lex_State){
|
|
LS_default,
|
|
LS_identifier,
|
|
LS_pound,
|
|
LS_pp,
|
|
LS_ppdef,
|
|
LS_char,
|
|
LS_char_multiline,
|
|
LS_char_slashed,
|
|
LS_string,
|
|
LS_string_multiline,
|
|
LS_string_slashed,
|
|
LS_number,
|
|
LS_number0,
|
|
LS_float,
|
|
LS_crazy_float0,
|
|
LS_crazy_float1,
|
|
LS_hex,
|
|
LS_comment_pre,
|
|
LS_comment,
|
|
LS_comment_slashed,
|
|
LS_comment_block,
|
|
LS_comment_block_ending,
|
|
LS_dot,
|
|
LS_ellipsis,
|
|
LS_less,
|
|
LS_less_less,
|
|
LS_more,
|
|
LS_more_more,
|
|
LS_minus,
|
|
LS_arrow,
|
|
LS_and,
|
|
LS_or,
|
|
LS_plus,
|
|
LS_colon,
|
|
LS_star,
|
|
LS_modulo,
|
|
LS_caret,
|
|
LS_eq,
|
|
LS_bang,
|
|
LS_error_message,
|
|
//
|
|
LS_count
|
|
};
|
|
|
|
// NOTE: Values are implicit and sequential from 0 in the order listed, so
// LSINT_count, which must stay last, equals the number of states before it.
// NOTE(review): presumably the values stored in Cpp_Lex_FSM.int_state --
// confirm against the lexer implementation.
ENUM_INTERNAL(uint8_t, Cpp_Lex_Int_State){
    LSINT_default,
    LSINT_u,
    LSINT_l, LSINT_L,
    LSINT_ul, LSINT_uL,
    LSINT_ll,
    LSINT_extra,
    
    // NOTE: Number of states above; not a state itself.
    LSINT_count
};
|
|
|
|
// NOTE: Values are implicit and sequential from 0 in the order listed, so
// LSPP_count, which must stay last, equals the number of states before it.
// NOTE(review): presumably the values stored in Cpp_Lex_Data.pp_state --
// confirm against the lexer implementation.
ENUM_INTERNAL(uint8_t, Cpp_Lex_PP_State){
    LSPP_default,
    LSPP_include,
    LSPP_macro_identifier, LSPP_identifier,
    LSPP_body_if, LSPP_body,
    LSPP_number,
    LSPP_error,
    LSPP_junk,
    
    // NOTE: Number of states above; not a state itself.
    LSPP_count
};
|
|
|
|
#endif
|
|
|
|
// BOTTOM
|
|
|