From b281746e0916c56d66acbc77e9969239cf4f4046 Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Sat, 28 Jun 2008 17:07:33 -0700 Subject: [PATCH] More robust eof_ok handling for both gzlparse and interpreter.c. --- runtime/interpreter.c | 100 +++++++++++++++++++++++++++++++++++--------------- runtime/interpreter.h | 15 ++++++-- utilities/gzlparse.c | 27 +++++++++++--- 3 files changed, 103 insertions(+), 39 deletions(-) diff --git a/runtime/interpreter.c b/runtime/interpreter.c index 74e14cb..96a8abc 100644 --- a/runtime/interpreter.c +++ b/runtime/interpreter.c @@ -35,68 +35,82 @@ void dump_stack(struct parse_state *s, struct grammar *g, FILE *output) case FRAME_TYPE_RTN: { struct rtn_frame *rtn_frame = &frame->f.rtn_frame; - fprintf(output, "RTN: %s, start_offset: %d\n", rtn_frame->rtn->name, - rtn_frame->start_offset); + fprintf(output, "RTN: %s", rtn_frame->rtn->name); break; } case FRAME_TYPE_GLA: { struct gla_frame *gla_frame = &frame->f.gla_frame; - fprintf(output, "GLA: #%d, start_offset: %d\n", gla_frame->gla - g->glas, - gla_frame->start_offset); + fprintf(output, "GLA: #%d", gla_frame->gla - g->glas); break; } case FRAME_TYPE_INTFA: { struct intfa_frame *intfa_frame = &frame->f.intfa_frame; - fprintf(output, "IntFA: #%d, start_offset: %d\n", intfa_frame->intfa - g->intfas, - intfa_frame->start_offset); + fprintf(output, "IntFA: #%d", intfa_frame->intfa - g->intfas); break; } } + fprintf(output, ", start_offset: %d, eof_ok: %d\n", frame->start_offset, frame->eof_ok); } fprintf(output, "\n"); } -struct parse_stack_frame *push_empty_frame(struct parse_state *s, enum frame_type frame_type) +struct parse_stack_frame *push_empty_frame(struct parse_state *s, enum frame_type frame_type, + int start_offset) { RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len+1); struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack); frame->frame_type = frame_type; + frame->start_offset = start_offset; return frame; } struct intfa_frame *push_intfa_frame(struct parse_state *s, struct intfa *intfa, int start_offset) { - struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_INTFA); + struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack); + struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_INTFA, start_offset); struct intfa_frame *intfa_frame = &frame->f.intfa_frame; intfa_frame->intfa = intfa; intfa_frame->intfa_state = &intfa->states[0]; - intfa_frame->start_offset = start_offset; + + /* IntFA frames start out being eof_ok if the parent frame is, but become not ok + * when they transition out of the initial state. */ + frame->eof_ok = old_frame->eof_ok; + return intfa_frame; } struct parse_stack_frame *push_gla_frame(struct parse_state *s, struct gla *gla, int start_offset) { - struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_GLA); + struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack); + struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_GLA, start_offset); struct gla_frame *gla_frame = &frame->f.gla_frame; gla_frame->gla = gla; gla_frame->gla_state = &gla->states[0]; - gla_frame->start_offset = start_offset; + + /* GLA frames start out being eof_ok if the parent frame is, but become not ok + * when they transition out of the initial state. */ + frame->eof_ok = old_frame->eof_ok; + return frame; } struct parse_stack_frame *push_rtn_frame(struct parse_state *s, struct rtn *rtn, int start_offset) { - struct parse_stack_frame *new_frame = push_empty_frame(s, FRAME_TYPE_RTN); + struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack); + struct parse_stack_frame *new_frame = push_empty_frame(s, FRAME_TYPE_RTN, start_offset); struct rtn_frame *new_rtn_frame = &new_frame->f.rtn_frame; new_rtn_frame->rtn = rtn; new_rtn_frame->rtn_transition = NULL; new_rtn_frame->rtn_state = &new_rtn_frame->rtn->states[0]; - new_rtn_frame->start_offset = start_offset; + + /* RTN frames start out being eof_ok iff their start state is a final state + * *and* their parent is eof_ok. */ + new_frame->eof_ok = old_frame->eof_ok && rtn->states[0].is_final; /* Call start rule callback if set */ if(s->bound_grammar->start_rule_cb) @@ -130,6 +144,18 @@ struct parse_stack_frame *pop_frame(struct parse_state *s) return frame; } +void set_eof_ok_flag_for_rtn_frame(struct parse_state *s) +{ + struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack); + assert(frame->frame_type == FRAME_TYPE_RTN); + struct rtn_frame *rtn_frame = &frame->f.rtn_frame; + if(rtn_frame->rtn_state->is_final && + (s->parse_stack_len == 0 || s->parse_stack[s->parse_stack_len-2].eof_ok)) + frame->eof_ok = true; + else + frame->eof_ok = false; +} + struct parse_stack_frame *pop_rtn_frame(struct parse_state *s) { assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_RTN); @@ -147,6 +173,7 @@ struct parse_stack_frame *pop_rtn_frame(struct parse_state *s) if(rtn_frame->rtn_transition) { rtn_frame->rtn_state = rtn_frame->rtn_transition->dest_state; + set_eof_ok_flag_for_rtn_frame(s); } } return frame; @@ -251,6 +278,7 @@ struct parse_stack_frame *do_rtn_terminal_transition(struct parse_state *s, assert(t->transition_type == TERMINAL_TRANSITION); rtn_frame->rtn_state = t->dest_state; + set_eof_ok_flag_for_rtn_frame(s); return frame; } @@ -459,6 +487,7 @@ struct intfa_frame *do_intfa_transition(struct parse_state *s, char ch) { struct intfa_transition *t = find_intfa_transition(intfa_frame, ch); + struct parse_stack_frame *frame = GET_PARSE_STACK_FRAME(intfa_frame); /* If this character did not have any transition, but the state we're coming * from is final, then longest-match semantics say that we should return @@ -468,8 +497,8 @@ struct intfa_frame *do_intfa_transition(struct parse_state *s, { char *terminal = intfa_frame->intfa_state->final; assert(terminal); - intfa_frame = process_terminal(s, terminal, intfa_frame->start_offset, - s->offset - intfa_frame->start_offset); + intfa_frame = process_terminal(s, terminal, frame->start_offset, + s->offset - frame->start_offset); assert(intfa_frame); // if this fails, it means that we hit a hard EOF /* This must succeed this time or it is a parse error */ @@ -477,7 +506,12 @@ struct intfa_frame *do_intfa_transition(struct parse_state *s, assert(t); } + /* We increment the offset here because we have just crossed the threshold + * where we have finished processing all terminals for the previous byte and + * started processing transitions for the current byte. */ + s->offset++; intfa_frame->intfa_state = t->dest_state; + frame->eof_ok = false; /* If the current state is final and there are no outgoing transitions, * we *know* we don't have to wait any longer for the longest match. @@ -485,8 +519,8 @@ struct intfa_frame *do_intfa_transition(struct parse_state *s, if(intfa_frame->intfa_state->final && (intfa_frame->intfa_state->num_transitions == 0)) { intfa_frame = process_terminal(s, intfa_frame->intfa_state->final, - intfa_frame->start_offset, - s->offset - intfa_frame->start_offset + 1); + frame->start_offset, + s->offset - frame->start_offset); } return intfa_frame; @@ -516,29 +550,35 @@ enum parse_status parse(struct parse_state *s, char *buf, int buf_len, for(int i = 0; i < buf_len; i++) { intfa_frame = do_intfa_transition(s, intfa_frame, buf[i]); - s->offset++; if(intfa_frame == NULL) { - if (out_consumed_buf_len) *out_consumed_buf_len = i; - if (out_eof_ok) *out_eof_ok = true; + if(out_consumed_buf_len) *out_consumed_buf_len = i; + if(out_eof_ok) *out_eof_ok = true; + assert(s->parse_stack_len == 1); + pop_rtn_frame(s); return PARSE_STATUS_EOF; } } - if(s->parse_stack[1].frame_type != FRAME_TYPE_RTN && - s->parse_stack[0].f.rtn_frame.rtn_state->is_final) - { - if (out_eof_ok) *out_eof_ok = true; - } - else - { - if (out_eof_ok) *out_eof_ok = false; - } + if(out_eof_ok) *out_eof_ok = DYNARRAY_GET_TOP(s->parse_stack)->eof_ok; + if(out_consumed_buf_len) *out_consumed_buf_len = buf_len; - if (out_consumed_buf_len) *out_consumed_buf_len = buf_len; return PARSE_STATUS_OK; } +void finish_parse(struct parse_state *s) +{ + struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack); + while(s->parse_stack_len > 0) + { + assert(frame->eof_ok); + if(frame->frame_type == FRAME_TYPE_RTN) + frame = pop_rtn_frame(s); + else + frame = pop_frame(s); + } +} + void reinit_parse_state(struct parse_state *s, struct bound_grammar *bg) { s->offset = 0; diff --git a/runtime/interpreter.h b/runtime/interpreter.h index f323e73..dcbb431 100644 --- a/runtime/interpreter.h +++ b/runtime/interpreter.h @@ -14,6 +14,7 @@ #include #include +#include #include "bc_read_stream.h" #include "dynarray.h" @@ -207,22 +208,22 @@ struct parse_stack_frame struct rtn *rtn; struct rtn_state *rtn_state; struct rtn_transition *rtn_transition; - int start_offset; } rtn_frame; struct gla_frame { struct gla *gla; struct gla_state *gla_state; - int start_offset; } gla_frame; struct intfa_frame { struct intfa *intfa; struct intfa_state *intfa_state; - int start_offset; } intfa_frame; } f; + bool eof_ok; + int start_offset; + enum frame_type { FRAME_TYPE_RTN, FRAME_TYPE_GLA, @@ -230,6 +231,9 @@ struct parse_stack_frame } frame_type; }; +#define GET_PARSE_STACK_FRAME(ptr) \ + (struct parse_stack_frame*)((char*)ptr-offsetof(struct parse_stack_frame,f)) + /* A bound_grammar struct represents a grammar which has had callbacks bound * to it and has possibly been JIT-compiled. Though JIT compilation is not * supported yet, the APIs are in-place to anticipate this feature. @@ -316,6 +320,11 @@ enum parse_status { enum parse_status parse(struct parse_state *s, char *buf, int buf_len, int *out_consumed_buf_len, bool *out_eof_ok); +/* If parse() above has previously returned out_eof_ok==true and there is + * no more input, call this function to complete the parse. This primarily + * involves calling all the final callbacks. */ +void finish_parse(struct parse_state *s); + void alloc_parse_state(struct parse_state *state); void free_parse_state(struct parse_state *state); void init_parse_state(struct parse_state *state, struct bound_grammar *bg); diff --git a/utilities/gzlparse.c b/utilities/gzlparse.c index 8896671..2bc0285 100644 --- a/utilities/gzlparse.c +++ b/utilities/gzlparse.c @@ -83,7 +83,7 @@ void start_rule_callback(struct parse_state *parse_state) print_newline(user_state, false); print_indent(user_state); - printf("{\"rule\":\"%s\", \"start\": %d, ", rtn_frame->rtn->name, rtn_frame->start_offset); + printf("{\"rule\":\"%s\", \"start\": %d, ", rtn_frame->rtn->name, frame->start_offset); if(parse_state->parse_stack_len > 1) { @@ -104,12 +104,11 @@ void end_rule_callback(struct parse_state *parse_state) struct gzlparse_state *user_state = (struct gzlparse_state*)parse_state->user_data; struct parse_stack_frame *frame = DYNARRAY_GET_TOP(parse_state->parse_stack); assert(frame->frame_type == FRAME_TYPE_RTN); - struct rtn_frame *rtn_frame = &frame->f.rtn_frame; RESIZE_DYNARRAY(user_state->first_child, user_state->first_child_len-1); print_newline(user_state, true); print_indent(user_state); - printf("], \"len\": %d}", parse_state->offset - rtn_frame->start_offset); + printf("], \"len\": %d}", parse_state->offset - frame->start_offset); } int main(int argc, char *argv[]) @@ -193,19 +192,35 @@ int main(int argc, char *argv[]) int total_read = 0; while(1) { int consumed_buf_len; + bool eof_ok; int read = fread(buf, 1, sizeof(buf), file); - enum parse_status status = parse(&state, buf, read, &consumed_buf_len, NULL); + enum parse_status status = parse(&state, buf, read, &consumed_buf_len, &eof_ok); total_read += consumed_buf_len; - if(status == PARSE_STATUS_EOF || read == 0) + if(read == 0) + { + if(eof_ok) + finish_parse(&state); + else + { + printf("\n"); + fprintf(stderr, "Premature end-of-file.\n"); + dump_json = false; + } + break; + } + else if(status == PARSE_STATUS_EOF) + { + break; + } } if(dump_json) fputs("\n}\n", stdout); if(dump_total) - printf("%d bytes parsed.\n", total_read); + fprintf(stderr, "%d bytes parsed.\n", total_read); free_parse_state(&state); free_grammar(g); -- 2.11.4.GIT