work on new lexer

This commit is contained in:
Allen Webster 2016-06-01 19:52:06 -04:00
parent e1a03017e8
commit 5e56483ec0
8 changed files with 658 additions and 1568 deletions

View File

@ -65,6 +65,19 @@ NOTES ON USE:
#include "4cpp_lexer_types.h"
// Persistent lexer state carried across calls so lexing can be resumed
// incrementally (see cpp_lex_step).
struct Cpp_Lex_Data{
Cpp_Preprocessor_State pp_state; // current preprocessor mode of the lexer
int pos; // current read position in the file
int complete; // non-zero once the whole file has been lexed (looped on below as while(!status.complete))
};
// Result of one cpp_lex_step call.
struct Cpp_Read_Result{
Cpp_Token token; // the token produced by this step; valid only when has_result is set
int pos; // lexer position after this step
char newline; // NOTE(review): presumably set when a newline was crossed -- confirm against cpp_lex_step
char has_result; // non-zero when token holds a newly lexed token
};
Cpp_File
data_as_cpp_file(Data data){
Cpp_File result;
@ -140,7 +153,6 @@ FCPP_LINK bool cpp_push_token_no_merge(Cpp_Token_Stack *stack, Cpp_Token token);
FCPP_LINK bool cpp_push_token_nonalloc(Cpp_Token_Stack *stack, Cpp_Token token);
inline Cpp_Lex_Data cpp_lex_data_zero() { Cpp_Lex_Data data = {(Cpp_Preprocessor_State)0}; return(data); }
inline Cpp_Token_Stack cpp_token_stack_zero() { Cpp_Token_Stack stack={0}; return(stack); }
FCPP_LINK Cpp_Read_Result cpp_lex_step(Cpp_File file, Cpp_Lex_Data *lex);
@ -1266,10 +1278,10 @@ cpp_lex_step(Cpp_File file, Cpp_Lex_Data *lex_data){
}
}
}
result.token.state_flags = state_flags;
result.has_result = has_result;
*lex_data = lex;
return result;
}

View File

@ -209,23 +209,15 @@ enum Cpp_Preprocessor_State{
CPP_LEX_PP_COUNT
};
struct Cpp_Lex_Data{
Cpp_Preprocessor_State pp_state;
int pos;
int complete;
};
struct Cpp_Read_Result{
Cpp_Token token;
int pos;
char newline;
char has_result;
};
// Growable array of lexed tokens.
struct Cpp_Token_Stack{
Cpp_Token *tokens; // backing buffer (ownership managed by the caller)
int count, max_count; // tokens in use / buffer capacity in tokens
};
// Produce a Cpp_Token_Stack in its empty initial state.
inline Cpp_Token_Stack
cpp_token_stack_zero(){
    Cpp_Token_Stack result = {0};
    return(result);
}
struct Cpp_Token_Merge{
Cpp_Token new_token;

View File

@ -809,6 +809,40 @@ Job_Callback_Sig(job_full_lex){
tokens.max_count = memory->size / sizeof(Cpp_Token);
tokens.count = 0;
#if 0
b32 still_lexing = 1;
Lex_Data lex = {0};
do{
i32 result =
cpp_lex_nonalloc(&lex, cpp_file.data, cpp_file.size, &tokens, 2048);
switch (result){
case LexNeedChunk: Assert(!"Invalid Path"); break;
case LexNeedTokenMemory:
if (system->check_cancel(thread)){
return;
}
system->grow_thread_memory(memory);
tokens.tokens = (Cpp_Token*)memory->data;
tokens.max_count = memory->size / sizeof(Cpp_Token);
break;
case LexHitTokenLimit:
if (system->check_cancel(thread)){
return;
}
break;
case LexFinished: still_lexing = 0; break;
}
} while (still_lexing);
#else
Cpp_Lex_Data status = {};
do{
@ -836,6 +870,10 @@ Job_Callback_Sig(job_full_lex){
}
} while(!status.complete);
#endif
i32 new_max = LargeRoundUp(tokens.count+1, Kbytes(1));
system->acquire_lock(FRAME_LOCK);
@ -948,9 +986,9 @@ file_relex_parallel(System_Functions *system,
i32 shift_amount = relex_space.count - delete_amount;
if (shift_amount != 0){
int new_count = stack->count + shift_amount;
i32 new_count = stack->count + shift_amount;
if (new_count > stack->max_count){
int new_max = LargeRoundUp(new_count, Kbytes(1));
i32 new_max = LargeRoundUp(new_count, Kbytes(1));
stack->tokens = (Cpp_Token*)
general_memory_reallocate(general, stack->tokens,
stack->count*sizeof(Cpp_Token),
@ -958,7 +996,7 @@ file_relex_parallel(System_Functions *system,
stack->max_count = new_max;
}
int shift_size = stack->count - relex_end;
i32 shift_size = stack->count - relex_end;
if (shift_size > 0){
Cpp_Token *old_base = stack->tokens + relex_end;
memmove(old_base + shift_amount, old_base,
@ -3238,7 +3276,6 @@ try_kill_file(System_Functions *system, Models *models,
}
else{
kill_file(system, models, file, string_zero());
view_show_file(view);
}
}
}
@ -3286,6 +3323,7 @@ interactive_view_complete(System_Functions *system, View *view, String dest, i32
case IAct_Kill:
try_kill_file(system, models, 0, 0, dest);
view_show_file(view);
break;
case IAct_Sure_To_Close:

View File

@ -99,6 +99,9 @@
; [X] feedback messages
; [X] feedback message API
; [X] kill rect
; [X] add high DPI support
;
; [] OS font rendering
;
; [] file status in custom API
; [] user file bar string

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@
#ifndef FCPP_NEW_LEXER_INC
#define FCPP_NEW_LEXER_INC
#include "..\4cpp_lexer_types.h"
#include "4cpp_lexer_fsms.h"
#include "4cpp_lexer_tables.c"
@ -286,22 +287,35 @@ cpp_attempt_token_merge(Cpp_Token prev_token, Cpp_Token next_token){
return result;
}
lexer_link void
cpp_push_token_nonalloc(Cpp_Token *out_tokens, int *token_i, Cpp_Token token){
lexer_link int
cpp_place_token_nonalloc(Cpp_Token *out_tokens, int token_i, Cpp_Token token){
Cpp_Token_Merge merge = {(Cpp_Token_Type)0};
Cpp_Token prev_token = {(Cpp_Token_Type)0};
if (*token_i > 0){
prev_token = out_tokens[*token_i - 1];
if (token_i > 0){
prev_token = out_tokens[token_i - 1];
merge = new_lex::cpp_attempt_token_merge(prev_token, token);
if (merge.did_merge){
out_tokens[*token_i - 1] = merge.new_token;
out_tokens[token_i - 1] = merge.new_token;
}
}
if (!merge.did_merge){
out_tokens[(*token_i)++] = token;
out_tokens[token_i++] = token;
}
return(token_i);
}
// Append token to out_tokens if there is room, merging it into the previous
// token when cpp_place_token_nonalloc decides a merge applies.
// Returns true when the token was placed (or merged), false when the stack
// is full and the caller must grow it first.
lexer_link bool
cpp_push_token_nonalloc(Cpp_Token_Stack *out_tokens, Cpp_Token token){
    bool result = 0;
    // BUG FIX: the guard previously read (count == max_count), which attempted
    // the push exactly when the stack was already full -- writing one past the
    // buffer -- and rejected every push while space remained. Push while there
    // is still room instead.
    if (out_tokens->count < out_tokens->max_count){
        out_tokens->count =
            cpp_place_token_nonalloc(out_tokens->tokens, out_tokens->count, token);
        result = 1;
    }
    return(result);
}
struct Lex_Data{
@ -311,15 +325,13 @@ struct Lex_Data{
int pos;
int pos_overide;
int chunk_pos;
Lex_FSM fsm;
Whitespace_FSM wfsm;
unsigned char pp_state;
unsigned char completed;
unsigned short *key_eq_classes;
unsigned char *key_table;
Cpp_Token token;
int __pc__;
@ -335,20 +347,27 @@ struct Lex_Data{
token_stack_out->count = token_i;\
*S_ptr = S; S_ptr->__pc__ = -1; return(n); }
// Result codes returned by cpp_lex_nonalloc describing why lexing stopped.
enum Lex_Result{
LexFinished, // the entire input has been lexed
LexNeedChunk, // current chunk consumed; feed the next chunk of the file
LexNeedTokenMemory, // token stack is full; grow it and call again
LexHitTokenLimit // stopped after emitting max_tokens tokens this call
};
lexer_link int
cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_stack_out){
Lex_Data S = *S_ptr;
Cpp_Token *out_tokens = token_stack_out->tokens;
int token_i = token_stack_out->count;
int max_token_i = token_stack_out->max_count;
Pos_Update_Rule pos_update_rule = PUR_none;
char c = 0;
int end_pos = size + S.pos;
chunk -= S.pos;
int end_pos = size + S.chunk_pos;
chunk -= S.chunk_pos;
switch (S.__pc__){
DrCase(1);
@ -357,7 +376,6 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
DrCase(4);
DrCase(5);
DrCase(6);
DrCase(7);
}
for (;;){
@ -372,7 +390,8 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.wfsm.white_done = (S.wfsm.pp_state >= LSPP_count);
if (S.wfsm.white_done == 0){
DrYield(4, 1);
S.chunk_pos += size;
DrYield(4, LexNeedChunk);
}
else break;
}
@ -380,7 +399,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.pp_state = S.wfsm.pp_state;
if (S.pp_state >= LSPP_count){
S.pp_state -= LSPP_count;
}
}
S.token_start = S.pos;
S.tb_pos = 0;
@ -388,19 +407,20 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
for(;;){
unsigned short *eq_classes = get_eq_classes[S.pp_state];
unsigned char *fsm_table = get_table[S.pp_state];
for (; S.fsm.state < LS_count && S.pos < end_pos;){
c = chunk[S.pos++];
S.tb[S.tb_pos++] = c;
int i = S.fsm.state + eq_classes[c];
S.fsm.state = fsm_table[i];
S.fsm.multi_line |= multiline_state_table[S.fsm.state];
}
S.fsm.emit_token = (S.fsm.state >= LS_count);
if (S.fsm.emit_token == 0){
DrYield(3, 1);
S.chunk_pos += size;
DrYield(3, LexNeedChunk);
}
else break;
}
@ -413,13 +433,13 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
if (S.pp_state == LSPP_include){
switch (S.fsm.state){
case LSINC_default:break;
case LSINC_quotes:
case LSINC_pointy:
S.token.type = CPP_TOKEN_INCLUDE_FILE;
S.token.flags = 0;
break;
case LSINC_junk:
S.token.type = CPP_TOKEN_JUNK;
S.token.flags = 0;
@ -433,22 +453,22 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
#define OperCase(op,t) case op: S.token.type = t; break;
OperCase('{', CPP_TOKEN_BRACE_OPEN);
OperCase('}', CPP_TOKEN_BRACE_CLOSE);
OperCase('[', CPP_TOKEN_BRACKET_OPEN);
OperCase(']', CPP_TOKEN_BRACKET_CLOSE);
OperCase('(', CPP_TOKEN_PARENTHESE_OPEN);
OperCase(')', CPP_TOKEN_PARENTHESE_CLOSE);
OperCase('~', CPP_TOKEN_TILDE);
OperCase(',', CPP_TOKEN_COMMA);
OperCase(';', CPP_TOKEN_SEMICOLON);
OperCase('?', CPP_TOKEN_TERNARY_QMARK);
OperCase('@', CPP_TOKEN_JUNK);
OperCase('$', CPP_TOKEN_JUNK);
#undef OperCase
case '\\':
if (S.pp_state == LSPP_default){
S.token.type = CPP_TOKEN_JUNK;
@ -461,13 +481,14 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
c = chunk[S.pos++];
if (!(c == ' ' || c == '\t' || c == '\r' || c == '\v' || c == '\f')) S.wfsm.white_done = 1;
}
if (S.wfsm.white_done == 0){
DrYield(1, 1);
S.chunk_pos += size;
DrYield(1, LexNeedChunk);
}
else break;
}
if (c == '\n'){
S.fsm.emit_token = 0;
S.pos_overide = 0;
@ -485,46 +506,10 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
case LS_identifier:
{
S.fsm.state = 0;
S.fsm.emit_token = 0;
S.fsm.sub_machine = 0;
--S.pos;
for (;;){
// TODO(allen): Need to drop down to the instructions to optimize
// this correctly I think. This looks like it will have more branches
// than it needs unless I am very careful.
for (; S.fsm.state < LSKEY_totally_finished && S.pos < end_pos;){
// TODO(allen): Rebase these super tables so that we don't have
// to do a subtract on the state.
S.key_table = key_tables[S.fsm.sub_machine];
S.key_eq_classes = key_eq_class_tables[S.fsm.sub_machine];
for (; S.fsm.state < LSKEY_table_transition && S.pos < end_pos;){
c = chunk[S.pos++];
S.fsm.state = S.key_table[S.fsm.state + S.key_eq_classes[c]];
}
if (S.fsm.state >= LSKEY_table_transition && S.fsm.state < LSKEY_totally_finished){
S.fsm.sub_machine = S.fsm.state - LSKEY_table_transition;
S.fsm.state = 0;
}
}
S.fsm.emit_token = (S.fsm.int_state >= LSKEY_totally_finished);
if (S.fsm.emit_token == 0){
DrYield(7, 1);
}
else break;
}
--S.pos;
// TODO(allen): do stuff regarding the actual type of the token
S.token.type = CPP_TOKEN_INTEGER_CONSTANT;
S.token.flags = 0;
#if 0
--S.pos;
int word_size = S.pos - S.token_start;
if (S.pp_state == LSPP_body_if){
if (match(make_string(S.tb, word_size), make_lit_string("defined"))){
S.token.type = CPP_TOKEN_DEFINED;
@ -532,17 +517,17 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
}
Sub_Match_List_Result sub_match;
sub_match = sub_match_list(S.tb, S.tb_pos, 0, bool_lits, word_size);
if (sub_match.index != -1){
S.token.type = CPP_TOKEN_BOOLEAN_CONSTANT;
S.token.flags = CPP_TFLAG_IS_KEYWORD;
}
else{
sub_match = sub_match_list(S.tb, S.tb_pos, 0, keywords, word_size);
if (sub_match.index != -1){
String_And_Flag data = keywords.data[sub_match.index];
S.token.type = (Cpp_Token_Type)data.flags;
@ -553,10 +538,8 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.token.flags = 0;
}
}
#endif
}break;
case LS_pound:
S.token.flags = 0;
switch (c){
@ -567,7 +550,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_pp:
{
S.fsm.directive_state = LSDIR_default;
@ -578,9 +561,10 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.fsm.directive_state = pp_directive_table[S.fsm.directive_state + pp_directive_eq_classes[c]];
}
S.fsm.emit_token = (S.fsm.int_state >= LSDIR_count);
if (S.fsm.emit_token == 0){
DrYield(6, 1);
S.chunk_pos += size;
DrYield(6, LexNeedChunk);
}
else break;
}
@ -590,13 +574,13 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.token.type = type;
if (type == CPP_TOKEN_JUNK){
S.token.flags = 0;
}
}
else{
S.token.flags = CPP_TFLAG_PP_DIRECTIVE;
S.pp_state = (unsigned char)cpp_pp_directive_to_state(S.token.type);
}
}
}break;
case LS_number:
case LS_number0:
case LS_hex:
@ -609,18 +593,19 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.fsm.int_state = int_fsm_table[S.fsm.int_state + int_fsm_eq_classes[c]];
}
S.fsm.emit_token = (S.fsm.int_state >= LSINT_count);
if (S.fsm.emit_token == 0){
DrYield(5, 1);
S.chunk_pos += size;
DrYield(5, LexNeedChunk);
}
else break;
}
--S.pos;
S.token.type = CPP_TOKEN_INTEGER_CONSTANT;
S.token.flags = 0;
break;
case LS_float:
case LS_crazy_float0:
case LS_crazy_float1:
@ -634,27 +619,27 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_char:
S.token.type = CPP_TOKEN_CHARACTER_CONSTANT;
S.token.flags = 0;
break;
case LS_char_multiline:
S.token.type = CPP_TOKEN_CHARACTER_CONSTANT;
S.token.flags = CPP_TFLAG_MULTILINE;
break;
case LS_string:
S.token.type = CPP_TOKEN_STRING_CONSTANT;
S.token.flags = 0;
break;
case LS_string_multiline:
S.token.type = CPP_TOKEN_STRING_CONSTANT;
S.token.flags = CPP_TFLAG_MULTILINE;
break;
case LS_comment_pre:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -665,19 +650,19 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_comment: case LS_comment_block_ending:
S.token.type = CPP_TOKEN_COMMENT;
S.token.flags = 0;
pos_update_rule = PUR_unget_whitespace;
break;
case LS_error_message:
S.token.type = CPP_TOKEN_ERROR_MESSAGE;
S.token.flags = 0;
pos_update_rule = PUR_unget_whitespace;
break;
case LS_dot:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -688,21 +673,21 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_ellipsis:
switch (c){
case '.':
S.token.flags = CPP_TFLAG_IS_OPERATOR;
S.token.type = CPP_TOKEN_ELLIPSIS;
break;
default:
S.token.type = CPP_TOKEN_JUNK;
pos_update_rule = PUR_back_one;
break;
}
break;
case LS_less:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -713,7 +698,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_less_less:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -724,7 +709,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_more:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -735,7 +720,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_more_more:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -746,7 +731,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_minus:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -758,7 +743,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_arrow:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -769,7 +754,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_and:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -781,7 +766,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_or:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -793,7 +778,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_plus:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -805,7 +790,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_colon:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -816,7 +801,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_star:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -827,7 +812,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_modulo:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -838,7 +823,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_caret:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -849,7 +834,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_eq:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -860,7 +845,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
break;
}
break;
case LS_bang:
S.token.flags = CPP_TFLAG_IS_OPERATOR;
switch (c){
@ -872,12 +857,12 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
}
break;
}
switch (pos_update_rule){
case PUR_back_one:
--S.pos;
break;
case PUR_unget_whitespace:
c = chunk[--S.pos];
while (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == '\f'){
@ -886,7 +871,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
++S.pos;
break;
}
if ((S.token.flags & CPP_TFLAG_PP_DIRECTIVE) == 0){
switch (S.pp_state){
case LSPP_include:
@ -895,7 +880,7 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
}
S.pp_state = LSPP_junk;
break;
case LSPP_macro_identifier:
if (S.fsm.state != LS_identifier){
S.token.type = CPP_TOKEN_JUNK;
@ -905,14 +890,14 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.pp_state = LSPP_body;
}
break;
case LSPP_identifier:
if (S.fsm.state != LS_identifier){
S.token.type = CPP_TOKEN_JUNK;
}
S.pp_state = LSPP_junk;
break;
case LSPP_number:
if (S.token.type != CPP_TOKEN_INTEGER_CONSTANT){
S.token.type = CPP_TOKEN_JUNK;
@ -922,14 +907,14 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
S.pp_state = LSPP_include;
}
break;
case LSPP_junk:
S.token.type = CPP_TOKEN_JUNK;
break;
}
}
}
if (S.fsm.emit_token){
S.token.start = S.token_start;
if (S.pos_overide){
@ -944,9 +929,9 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
}
S.token.state_flags = S.pp_state;
cpp_push_token_nonalloc(out_tokens, &token_i, S.token);
token_i = cpp_place_token_nonalloc(out_tokens, token_i, S.token);
if (token_i == max_token_i){
DrYield(2, 2);
DrYield(2, LexNeedTokenMemory);
}
}
}
@ -957,13 +942,199 @@ cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size, Cpp_Token_Stack *token_
}
}
DrReturn(0);
DrReturn(LexFinished);
}
#undef DrYield
#undef DrReturn
#undef DrCase
// Wrapper over the chunk-based lexer that additionally stops after at most
// max_tokens tokens have been emitted in this call. Implemented by lexing
// into a copy of the caller's stack whose capacity is clamped to
// count + max_tokens.
lexer_link int
cpp_lex_nonalloc(Lex_Data *S_ptr, char *chunk, int size,
Cpp_Token_Stack *token_stack_out, int max_tokens){
Cpp_Token_Stack temp_stack = *token_stack_out;
if (temp_stack.max_count > temp_stack.count + max_tokens){
temp_stack.max_count = temp_stack.count + max_tokens;
}
int result = cpp_lex_nonalloc(S_ptr, chunk, size, &temp_stack);
// Only count can have changed; write it back to the caller's real stack.
token_stack_out->count = temp_stack.count;
if (result == LexNeedTokenMemory){
// The clamped stack filled up but the real stack still has room, so the
// stop was caused by the artificial token limit, not by memory.
if (token_stack_out->count < token_stack_out->max_count){
result = LexHitTokenLimit;
}
}
return(result);
}
// Like cpp_lex_nonalloc, but also knows the file's full_size so it can feed
// the lexer a single terminating null byte once the true end of the file is
// reached, letting the state machine flush its final token instead of
// reporting LexNeedChunk forever.
lexer_link int
cpp_lex_size_nonalloc(Lex_Data *S_ptr, char *chunk, int size, int full_size,
Cpp_Token_Stack *token_stack_out){
int result = 0;
if (S_ptr->pos >= full_size){
// Already past the end: only the synthetic terminator remains.
char end_null = 0;
result = cpp_lex_nonalloc(S_ptr, &end_null, 1, token_stack_out);
}
else{
result = cpp_lex_nonalloc(S_ptr, chunk, size, token_stack_out);
if (result == LexNeedChunk){
// The chunk ran out exactly at end of file; finish with the terminator
// now rather than making the caller issue another call.
if (S_ptr->pos >= full_size){
char end_null = 0;
result = cpp_lex_nonalloc(S_ptr, &end_null, 1, token_stack_out);
}
}
}
return(result);
}
// max_tokens variant of cpp_lex_size_nonalloc: lexes into a copy of the
// caller's stack with capacity clamped to count + max_tokens, then converts
// a LexNeedTokenMemory caused purely by the clamp into LexHitTokenLimit.
lexer_link int
cpp_lex_size_nonalloc(Lex_Data *S_ptr, char *chunk, int size, int full_size,
Cpp_Token_Stack *token_stack_out, int max_tokens){
Cpp_Token_Stack temp_stack = *token_stack_out;
if (temp_stack.max_count > temp_stack.count + max_tokens){
temp_stack.max_count = temp_stack.count + max_tokens;
}
int result = cpp_lex_size_nonalloc(S_ptr, chunk, size, full_size,
&temp_stack);
// Only count can have changed; propagate it to the caller's real stack.
token_stack_out->count = temp_stack.count;
if (result == LexNeedTokenMemory){
// Real stack still has room: the clamp, not memory, ended this call.
if (token_stack_out->count < token_stack_out->max_count){
result = LexHitTokenLimit;
}
}
return(result);
}
#if 0
// Prepare a relex over the edited byte range [start, end): capture the file
// and stack, locate the token just before the edit (start_token_i) and the
// first token at-or-after the end of the edit (end_token_i), and compute how
// much scratch token space the relex may need.
// NOTE(review): this block sits inside an #if 0 region in this revision.
lexer_link Cpp_Relex_State
cpp_relex_nonalloc_start(Cpp_File file, Cpp_Token_Stack *stack,
int start, int end, int amount, int tolerance){
Cpp_Relex_State state;
state.file = file;
state.stack = stack;
state.start = start;
state.end = end;
state.amount = amount;
state.tolerance = tolerance;
// Back up one token before the edit so a token the edit touches is relexed.
Cpp_Get_Token_Result result = new_lex::cpp_get_token(stack, start);
if (result.token_index <= 0){
state.start_token_i = 0;
}
else{
state.start_token_i = result.token_index-1;
}
// First token wholly after the edited range.
result = new_lex::cpp_get_token(stack, end);
if (result.token_index < 0) result.token_index = 0;
else if (end > stack->tokens[result.token_index].start) ++result.token_index;
state.end_token_i = result.token_index;
// Begin relexing at the earlier of the edit start and the start token.
state.relex_start = stack->tokens[state.start_token_i].start;
if (start < state.relex_start) state.relex_start = start;
// Worst-case scratch tokens: the replaced span plus tolerance extras.
state.space_request = state.end_token_i - state.start_token_i + tolerance + 1;
return(state);
}
// TODO(allen): Eliminate this once we actually store the EOF token
// in the token stack.
// Fetch the token at index, or synthesize a zero-width EOF token positioned
// at the end of the file (size) when index is past the last stored token.
inline Cpp_Token
cpp__get_token(Cpp_Token_Stack *stack, Cpp_Token *tokens, int size, int index){
    if (index < stack->count){
        return tokens[index];
    }
    Cpp_Token eof_token;
    eof_token.start = size;
    eof_token.size = 0;
    eof_token.type = CPP_TOKEN_EOF;
    eof_token.flags = 0;
    eof_token.state_flags = 0;
    return eof_token;
}
// Run the relex prepared by cpp_relex_nonalloc_start: shift the surviving
// tail tokens by the edit amount, then lex forward from relex_start until the
// fresh tokens re-synchronize with an existing token. On success, attempt to
// merge the new tokens with their neighbors and report where the old tokens
// resume via *relex_end. Returns true when relex_stack ran out of space
// ("went too far"), in which case the start shift is undone so the caller can
// fall back to a full lex.
FCPP_LINK bool
cpp_relex_nonalloc_main(Cpp_Relex_State *state, Cpp_Token_Stack *relex_stack, int *relex_end){
Cpp_Token_Stack *stack = state->stack;
Cpp_Token *tokens = stack->tokens;
// Move the starts of tokens after the edit by the edit's size delta.
new_lex::cpp_shift_token_starts(stack, state->end_token_i, state->amount);
Lex_Data lex = {};
lex.pp_state = cpp_token_get_pp_state(tokens[state->start_token_i].state_flags);
lex.pos = state->relex_start;
int relex_end_i = state->end_token_i;
Cpp_Token match_token = cpp__get_token(stack, tokens, state->file.size, relex_end_i);
Cpp_Token end_token = match_token;
bool went_too_far = 0;
for (;;){
Cpp_Read_Result read = cpp_lex_step(state->file, &lex);
if (read.has_result){
// A fresh token identical to an existing one means the token stream
// has re-synchronized; the relex can stop here.
if (read.token.start == end_token.start &&
read.token.size == end_token.size &&
read.token.flags == end_token.flags &&
read.token.state_flags == end_token.state_flags){
break;
}
cpp_push_token_nonalloc(relex_stack, read.token);
// Advance the comparison token past everything the lexer has consumed.
while (lex.pos > end_token.start && relex_end_i < stack->count){
++relex_end_i;
end_token = cpp__get_token(stack, tokens, state->file.size, relex_end_i);
}
// Scratch space exhausted: give up and let the caller do a full lex.
if (relex_stack->count == relex_stack->max_count){
went_too_far = 1;
break;
}
}
if (lex.pos >= state->file.size) break;
}
if (!went_too_far){
if (relex_stack->count > 0){
// Try merging the first fresh token with the token before the range.
if (state->start_token_i > 0){
Cpp_Token_Merge merge =
cpp_attempt_token_merge(tokens[state->start_token_i - 1],
relex_stack->tokens[0]);
if (merge.did_merge){
--state->start_token_i;
relex_stack->tokens[0] = merge.new_token;
}
}
// Try merging the last fresh token with the token after the range.
if (relex_end_i < state->stack->count){
Cpp_Token_Merge merge =
cpp_attempt_token_merge(relex_stack->tokens[relex_stack->count-1],
tokens[relex_end_i]);
if (merge.did_merge){
++relex_end_i;
relex_stack->tokens[relex_stack->count-1] = merge.new_token;
}
}
}
*relex_end = relex_end_i;
}
else{
// Failed: restore the original token starts shifted above.
cpp_shift_token_starts(stack, state->end_token_i, -state->amount);
}
return went_too_far;
}
#endif
#endif
// BOTTOM

View File

@ -7,6 +7,9 @@
*
*/
// TODO(allen): In what corner cases, such as invalid files
// does the new lexer suffer???
// TOP
#include "../4ed_meta.h"
@ -204,104 +207,166 @@ end_t(Times *t){
}
static void
run_experiment(Experiment *exp, char *filename, int verbose, int chunks){
run_experiment(Experiment *exp, char *filename, int verbose,
int chunks, int max_tokens){
String extension = {};
Data file_data;
Cpp_File file_cpp;
new_lex::Lex_Data ld = {0};
int pass;
int k, chunk_size, is_last;
extension = file_extension(make_string_slowly(filename));
if (match(extension, "cpp") || match(extension, "h")){
file_data = dump_file(filename);
if (file_data.size < (100 << 10)){
pass = 1;
if (verbose >= 0) printf("testing on file: %s\n", filename);
exp->test_total++;
exp->correct_stack.count = 0;
exp->testing_stack.count = 0;
memset(exp->correct_stack.tokens, TOKEN_ARRAY_SIZE, 0);
memset(exp->testing_stack.tokens, TOKEN_ARRAY_SIZE, 0);
memset(exp->correct_stack.tokens, 0, TOKEN_ARRAY_SIZE);
memset(exp->testing_stack.tokens, 0, TOKEN_ARRAY_SIZE);
file_cpp.data = (char*)file_data.data;
file_cpp.size = file_data.size;
ld.tb = (char*)malloc(file_data.size + 1);
{
i64 start;
start = __rdtsc();
cpp_lex_file_nonalloc(file_cpp, &exp->correct_stack, lex_data);
time.handcoded += (__rdtsc() - start);
start = __rdtsc();
if (chunks){
int relevant_size = file_data.size + 1;
is_last = 0;
for (k = 0; k < relevant_size; k += chunks){
chunk_size = chunks;
if (chunk_size + k >= relevant_size){
chunk_size = relevant_size - k;
is_last = 1;
if (max_tokens == 0){
if (chunks){
start = __rdtsc();
int relevant_size = file_data.size + 1;
is_last = 0;
for (k = 0; k < relevant_size; k += chunks){
chunk_size = chunks;
if (chunk_size + k >= relevant_size){
chunk_size = relevant_size - k;
is_last = 1;
}
int result =
new_lex::cpp_lex_nonalloc(&ld,
(char*)file_data.data + k, chunk_size,
&exp->testing_stack);
if (result == new_lex::LexFinished ||
result == new_lex::LexNeedTokenMemory) break;
}
int result = new_lex::cpp_lex_nonalloc(&ld, (char*)file_data.data + k, chunk_size, &exp->testing_stack);
if (result == 0 || result == 2) break;
time.fsm += (__rdtsc() - start);
}
else{
start = __rdtsc();
new_lex::cpp_lex_nonalloc(&ld,
(char*)file_data.data, file_data.size,
&exp->testing_stack);
time.fsm += (__rdtsc() - start);
}
}
else{
new_lex::cpp_lex_nonalloc(&ld, (char*)file_data.data, file_data.size, &exp->testing_stack);
if (chunks){
start = __rdtsc();
int relevant_size = file_data.size + 1;
is_last = 0;
for (k = 0; k < relevant_size; k += chunks){
chunk_size = chunks;
if (chunk_size + k >= relevant_size){
chunk_size = relevant_size - k;
is_last = 1;
}
int result = 0;
int still_lexing = 1;
do{
result =
new_lex::cpp_lex_size_nonalloc(&ld,
(char*)file_data.data + k, chunk_size, file_data.size,
&exp->testing_stack,
max_tokens);
if (result == new_lex::LexFinished ||
result == new_lex::LexNeedTokenMemory ||
result == new_lex::LexNeedChunk){
still_lexing = 0;
}
} while(still_lexing);
if (result == new_lex::LexFinished ||
result == new_lex::LexNeedTokenMemory) break;
}
time.fsm += (__rdtsc() - start);
}
else{
start = __rdtsc();
int still_lexing = 1;
do{
int result =
new_lex::cpp_lex_size_nonalloc(&ld,
(char*)file_data.data, file_data.size, file_data.size,
&exp->testing_stack,
max_tokens);
if (result == new_lex::LexFinished ||
result == new_lex::LexNeedTokenMemory){
still_lexing = 0;
}
} while(still_lexing);
time.fsm += (__rdtsc() - start);
}
}
time.fsm += (__rdtsc() - start);
}
free(ld.tb);
if (exp->correct_stack.count != exp->testing_stack.count){
pass = 0;
if (verbose >= 0){
printf("error: stack size mismatch %d original and %d testing\n",
exp->correct_stack.count, exp->testing_stack.count);
exp->correct_stack.count, exp->testing_stack.count);
}
}
int min_count = exp->correct_stack.count;
if (min_count > exp->testing_stack.count) min_count = exp->testing_stack.count;
for (int j = 0; j < min_count; ++j){
Cpp_Token *correct, *testing;
correct = exp->correct_stack.tokens + j;
testing = exp->testing_stack.tokens + j;
if (correct->type != testing->type){
pass = 0;
if (verbose >= 1) printf("type mismatch at token %d\n", j);
}
if (correct->start != testing->start || correct->size != testing->size){
pass = 0;
if (verbose >= 1){
printf("token range mismatch at token %d\n"
" %d:%d original %d:%d testing\n"
" %.*s original %.*s testing\n",
j,
correct->start, correct->size, testing->start, testing->size,
correct->size, file_cpp.data + correct->start,
testing->size, file_cpp.data + testing->start);
" %d:%d original %d:%d testing\n"
" %.*s original %.*s testing\n",
j,
correct->start, correct->size, testing->start, testing->size,
correct->size, file_cpp.data + correct->start,
testing->size, file_cpp.data + testing->start);
}
}
if (correct->flags != testing->flags){
pass = 0;
if (verbose >= 1) printf("token flag mismatch at token %d\n", j);
}
}
if (pass){
exp->passed_total++;
if (verbose >= 0) printf("test passed!\n\n");
@ -310,7 +375,7 @@ run_experiment(Experiment *exp, char *filename, int verbose, int chunks){
if (verbose >= 0) printf("test failed, you failed, fix it now!\n\n");
}
}
free(file_data.data);
}
}
@ -338,12 +403,13 @@ show_time(Times t, int repeats, char *type){
int main(){
int repeats = 1;
int verbose_level = 1;
int chunk_start = 0;
int chunk_end = 0;
int verbose_level = 0;
int chunk_start = 32;
int chunk_end = 64;
#define TEST_FILE "parser_test1.cpp"
#define SINGLE_ITEM 1
#define SINGLE_ITEM 0
int token_limit = 2;
int chunks = (chunk_start > 0 && chunk_start <= chunk_end);
int c = 0;
@ -371,14 +437,14 @@ int main(){
begin_t(&chunk_exp_t);
printf("With chunks of %d\n", chunks);
for (c = chunk_start; c <= chunk_end; ++c){
run_experiment(&chunk_exp, BASE_DIR TEST_FILE, 1, c);
run_experiment(&chunk_exp, BASE_DIR TEST_FILE, 1, c, token_limit);
}
end_t(&chunk_exp_t);
}
begin_t(&exp_t);
printf("Unchunked\n");
run_experiment(&exp, BASE_DIR TEST_FILE, 1, 0);
run_experiment(&exp, BASE_DIR TEST_FILE, 1, 0, token_limit);
end_t(&exp_t);
#else
@ -391,19 +457,19 @@ int main(){
if (chunks){
begin_t(&chunk_exp_t);
for (c = chunk_start; c <= chunk_end; ++c){
run_experiment(&chunk_exp, all_files.infos[i].filename.str, verbose_level, c);
run_experiment(&chunk_exp, all_files.infos[i].filename.str, verbose_level, c, token_limit);
}
end_t(&chunk_exp_t);
}
begin_t(&exp_t);
if (verbose_level == -1 && chunks){
for (c = chunk_start; c <= chunk_end; ++c){
run_experiment(&exp, all_files.infos[i].filename.str, verbose_level, 0);
run_experiment(&exp, all_files.infos[i].filename.str, verbose_level, 0, token_limit);
}
}
else{
run_experiment(&exp, all_files.infos[i].filename.str, verbose_level, 0);
run_experiment(&exp, all_files.infos[i].filename.str, verbose_level, 0, token_limit);
}
end_t(&exp_t);
}

View File

@ -576,97 +576,6 @@ process_match_node(String_And_Flag *input, Match_Node *node, Match_Tree *tree, F
}
}
// Build the stack of keyword-matching finite state machines used by the
// table generator: one root FSM over all keyword strings, plus one FSM per
// "future" (deferred sub-machine) discovered while processing match nodes.
// Returns the populated FSM_Stack with its final_state computed.
FSM_Stack
generate_keyword_fsms(){
Terminal_Lookup_Table terminal_table;
Cpp_Token_Type type;
Future_FSM_Stack unfinished_futures;
Match_Tree_Stack tree_stack;
FSM_Stack fsm_stack;
Match_Tree *tree;
FSM *fsm;
Future_FSM *future;
Match_Node *root_node;
FSM_State *root_state;
int i, j;
// Map each keyword's token type to a compact terminal-state id (and back).
memset(terminal_table.type_to_state, 0, sizeof(terminal_table.type_to_state));
memset(terminal_table.state_to_type, 0, sizeof(terminal_table.state_to_type));
// NOTE(review): terminal_table.state_count is read and incremented below but
// never initialized here (the memsets cover only the two arrays) -- this
// reads an indeterminate value; confirm whether it should start at 0 or at 1
// (1 would keep the `== 0` unused-sentinel check above consistent).
for (i = 0; i < ArrayCount(keyword_strings); ++i){
type = (Cpp_Token_Type)keyword_strings[i].flags;
if (terminal_table.type_to_state[type] == 0){
terminal_table.type_to_state[type] = terminal_table.state_count;
terminal_table.state_to_type[terminal_table.state_count] = type;
++terminal_table.state_count;
}
}
fsm_stack.max = 255;
fsm_stack.count = 0;
fsm_stack.fsms = (FSM*)malloc(sizeof(FSM)*fsm_stack.max);
fsm_stack.table_transition_state = 26;
tree_stack.max = 255;
tree_stack.count = 0;
tree_stack.trees = (Match_Tree*)malloc(sizeof(Match_Tree)*tree_stack.max);
unfinished_futures.max = 255;
unfinished_futures.count = 0;
unfinished_futures.futures = (Future_FSM*)malloc(sizeof(Future_FSM)*unfinished_futures.max);
// Root FSM/tree: its match node initially covers every keyword string.
fsm = get_fsm(&fsm_stack);
tree = get_tree(&tree_stack);
*fsm = fsm_init(200, fsm_stack.table_transition_state);
*tree = tree_init(200);
root_state = fsm_get_state(fsm, RealTerminateBase);
root_node = match_get_node(tree);
match_init_node(root_node, ArrayCount(keyword_strings));
for (i = 0; i < ArrayCount(keyword_strings); ++i){
root_node->words[i] = i;
}
root_node->count = ArrayCount(keyword_strings);
root_node->state = root_state;
root_node->index = -1;
push_future_fsm(&unfinished_futures, root_node);
process_match_node(keyword_strings, root_node, tree, fsm, &terminal_table, 2, &unfinished_futures);
// Each future produced above becomes its own FSM; processing one may append
// more futures, so this loop runs until the queue is drained.
for (i = 1; i < unfinished_futures.count; ++i){
future = unfinished_futures.futures + i;
fsm = get_fsm(&fsm_stack);
tree = get_tree(&tree_stack);
assert((int)(fsm - fsm_stack.fsms) == i);
*fsm = fsm_init(200, fsm_stack.table_transition_state);
*tree = tree_init(200);
root_state = fsm_get_state(fsm, RealTerminateBase);
root_node = match_get_node(tree);
match_copy_init_node(root_node, future->source);
root_node->state = root_state;
// Record which keywords this sub-machine handles as an FSM comment.
for (j = 0; j < root_node->count; ++j){
char space[1024];
sprintf(space, "%s\n", keyword_strings[root_node->words[j]].str);
fsm_add_comment(fsm, space);
}
process_match_node(keyword_strings, root_node, tree, fsm, &terminal_table, 12, &unfinished_futures);
}
assert(fsm_stack.count < 255);
fsm_stack.final_state = fsm_stack.table_transition_state + (unsigned char)fsm_stack.count;
return(fsm_stack);
}
Whitespace_FSM
whitespace_skip_fsm(Whitespace_FSM wfsm, char c){
if (wfsm.pp_state != LSPP_default){
@ -781,7 +690,6 @@ main_fsm(Lex_FSM fsm, unsigned char pp_state, unsigned char c){
case LS_default:
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'){
fsm.state = LS_identifier;
fsm.emit_token = 1;
}
else if (c >= '1' && c <= '9'){
fsm.state = LS_number;
@ -849,13 +757,11 @@ main_fsm(Lex_FSM fsm, unsigned char pp_state, unsigned char c){
}
break;
#if 0
case LS_identifier:
if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_')){
fsm.emit_token = 1;
}
break;
#endif
case LS_pound:
switch (c){
@ -1405,39 +1311,6 @@ main(){
render_variable(file, "unsigned char", "LSDIR_count", pp_directive_fsm.count);
render_variable(file, "unsigned char", "pp_directive_terminal_base", pp_directive_fsm.terminal_base);
FSM_Stack keyword_fsms = generate_keyword_fsms();
char name[1024];
for (int i = 0; i < keyword_fsms.count; ++i){
FSM_Tables partial_keywords_table =
generate_table_from_abstract_fsm(keyword_fsms.fsms[i], keyword_fsms.final_state);
if (keyword_fsms.fsms[i].comment){
render_comment(file, keyword_fsms.fsms[i].comment);
}
sprintf(name, "keyword_part_%d_table", i);
render_fsm_table(file, partial_keywords_table, name);
}
begin_ptr_table(file, "short", "key_eq_class_tables");
for (int i = 0; i < keyword_fsms.count; ++i){
sprintf(name, "keyword_part_%d_table_eq_classes", i);
do_table_item_direct(file, name, "");
end_row(file);
}
end_table(file);
begin_ptr_table(file, "char", "key_tables");
for (int i = 0; i < keyword_fsms.count; ++i){
sprintf(name, "keyword_part_%d_table_table", i);
do_table_item_direct(file, name, "");
end_row(file);
}
end_table(file);
fprintf(file, "#define LSKEY_table_transition %d\n", (int)(keyword_fsms.table_transition_state));
fprintf(file, "#define LSKEY_totally_finished %d\n", (int)(keyword_fsms.final_state));
fclose(file);
return(0);
}