Lookahead: support lookahead that says to return from a state.
[gazelle.git] / runtime / interpreter.c
blobd68bce7f0e7a4a0d16c9dd42474596a2cfcfda2e
1 /*********************************************************************
3 Gazelle: a system for building fast, reusable parsers
5 interpreter.c
7 Once a compiled grammar has been loaded into memory, the routines
8 in this file are what actually does the parsing. This file is an
9 "interpreter" in the sense that it parses the input by using the
10 grammar as a data structure -- no grammar-specific code is ever
11 generated or executed. Despite this, it is still quite fast, and
12 has a very low memory footprint.
14 The interpreter primarily consists of maintaining the parse stack
15 properly and transitioning the frames in response to the input.
17 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
19 *********************************************************************/
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <assert.h>
24 #include <string.h>
25 #include "interpreter.h"
28 * A diagnostic function for dumping the current state of the stack.
30 void dump_stack(struct parse_state *s, FILE *output)
32 fprintf(output, "Stack dump:\n");
33 struct grammar *g = s->bound_grammar->grammar;
34 for(int i = 0; i < s->parse_stack_len; i++)
36 struct parse_stack_frame *frame = &s->parse_stack[i];
37 switch(frame->frame_type)
39 case FRAME_TYPE_RTN:
41 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
42 fprintf(output, "RTN: %s", rtn_frame->rtn->name);
43 break;
46 case FRAME_TYPE_GLA:
48 struct gla_frame *gla_frame = &frame->f.gla_frame;
49 fprintf(output, "GLA: #%d", gla_frame->gla - g->glas);
50 break;
53 case FRAME_TYPE_INTFA:
55 struct intfa_frame *intfa_frame = &frame->f.intfa_frame;
56 fprintf(output, "IntFA: #%d", intfa_frame->intfa - g->intfas);
57 break;
60 fprintf(output, ", start_offset: %d, eof_ok: %d\n", frame->start_offset, frame->eof_ok);
62 fprintf(output, "\n");
65 struct parse_stack_frame *push_empty_frame(struct parse_state *s, enum frame_type frame_type,
66 int start_offset)
68 RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len+1);
69 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
70 frame->frame_type = frame_type;
71 frame->start_offset = start_offset;
72 return frame;
75 struct intfa_frame *push_intfa_frame(struct parse_state *s, struct intfa *intfa, int start_offset)
77 struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack);
78 struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_INTFA, start_offset);
79 struct intfa_frame *intfa_frame = &frame->f.intfa_frame;
80 intfa_frame->intfa = intfa;
81 intfa_frame->intfa_state = &intfa->states[0];
83 /* IntFA frames start out being eof_ok if the parent frame is, but become not ok
84 * when they transition out of the initial state. */
85 frame->eof_ok = old_frame->eof_ok;
87 return intfa_frame;
90 struct parse_stack_frame *push_gla_frame(struct parse_state *s, struct gla *gla, int start_offset)
92 struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack);
93 struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_GLA, start_offset);
94 struct gla_frame *gla_frame = &frame->f.gla_frame;
95 gla_frame->gla = gla;
96 gla_frame->gla_state = &gla->states[0];
98 /* GLA frames start out being eof_ok if the parent frame is, but become not ok
99 * when they transition out of the initial state. */
100 frame->eof_ok = old_frame->eof_ok;
102 return frame;
105 struct parse_stack_frame *push_rtn_frame(struct parse_state *s, struct rtn *rtn, int start_offset)
107 struct parse_stack_frame *old_frame = DYNARRAY_GET_TOP(s->parse_stack);
108 struct parse_stack_frame *new_frame = push_empty_frame(s, FRAME_TYPE_RTN, start_offset);
109 struct rtn_frame *new_rtn_frame = &new_frame->f.rtn_frame;
111 new_rtn_frame->rtn = rtn;
112 new_rtn_frame->rtn_transition = NULL;
113 new_rtn_frame->rtn_state = &new_rtn_frame->rtn->states[0];
115 /* RTN frames start out being eof_ok iff their start state is a final state
116 * *and* their parent is eof_ok. */
117 new_frame->eof_ok = old_frame->eof_ok && rtn->states[0].is_final;
119 /* Call start rule callback if set */
120 if(s->bound_grammar->start_rule_cb)
122 s->bound_grammar->start_rule_cb(s);
125 return new_frame;
128 struct parse_stack_frame *push_rtn_frame_for_transition(struct parse_state *s,
129 struct rtn_transition *t,
130 int start_offset)
132 struct rtn_frame *old_rtn_frame = &DYNARRAY_GET_TOP(s->parse_stack)->f.rtn_frame;
133 old_rtn_frame->rtn_transition = t;
134 return push_rtn_frame(s, t->edge.nonterminal, start_offset);
137 struct parse_stack_frame *pop_frame(struct parse_state *s)
139 assert(s->parse_stack_len > 0);
140 RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len-1);
142 struct parse_stack_frame *frame;
143 if(s->parse_stack_len > 0)
144 frame = DYNARRAY_GET_TOP(s->parse_stack);
145 else
146 frame = NULL;
148 return frame;
151 void set_eof_ok_flag_for_rtn_frame(struct parse_state *s)
153 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
154 assert(frame->frame_type == FRAME_TYPE_RTN);
155 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
156 if(rtn_frame->rtn_state->is_final &&
157 (s->parse_stack_len == 0 || s->parse_stack[s->parse_stack_len-2].eof_ok))
158 frame->eof_ok = true;
159 else
160 frame->eof_ok = false;
163 struct parse_stack_frame *pop_rtn_frame(struct parse_state *s)
165 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_RTN);
167 /* Call end rule callback if set */
168 if(s->bound_grammar->end_rule_cb)
170 s->bound_grammar->end_rule_cb(s);
173 struct parse_stack_frame *frame = pop_frame(s);
174 if(frame != NULL)
176 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
177 if(rtn_frame->rtn_transition)
179 rtn_frame->rtn_state = rtn_frame->rtn_transition->dest_state;
180 set_eof_ok_flag_for_rtn_frame(s);
183 return frame;
186 struct parse_stack_frame *pop_gla_frame(struct parse_state *s)
188 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_GLA);
189 return pop_frame(s);
192 struct parse_stack_frame *pop_intfa_frame(struct parse_state *s)
194 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_INTFA);
195 return pop_frame(s);
199 * descend_to_gla(): given the current parse stack, pushes any RTN or GLA
200 * stack frames representing transitions that can be taken without consuming
201 * any terminals.
203 * Preconditions:
204 * - the current frame is either an RTN frame or a GLA frame
206 * Postconditions:
207 * - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
208 * entered, entered_gla is set to true.
210 struct parse_stack_frame *descend_to_gla(struct parse_state *s, bool *entered_gla, int start_offset)
212 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
213 *entered_gla = false;
215 while(frame->frame_type == FRAME_TYPE_RTN)
217 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
218 switch(rtn_frame->rtn_state->lookahead_type)
220 case STATE_HAS_INTFA:
221 return frame;
222 break;
224 case STATE_HAS_GLA:
225 *entered_gla = true;
226 return push_gla_frame(s, rtn_frame->rtn_state->d.state_gla, start_offset);
227 break;
229 case STATE_HAS_NEITHER:
230 /* An RTN state has neither an IntFA or a GLA in only two cases:
231 * - it is a final state with no outgoing transitions
232 * - it is a nonfinal state with only one transition (a nonterminal) */
233 assert(rtn_frame->rtn_state->num_transitions < 2);
234 if(rtn_frame->rtn_state->num_transitions == 0)
236 /* Final state */
237 frame = pop_rtn_frame(s);
238 if(frame == NULL) return NULL;
240 else if(rtn_frame->rtn_state->num_transitions == 1)
242 assert(rtn_frame->rtn_state->transitions[0].transition_type == NONTERM_TRANSITION);
243 frame = push_rtn_frame_for_transition(s, &rtn_frame->rtn_state->transitions[0],
244 start_offset);
246 break;
249 return frame;
252 struct intfa_frame *push_intfa_frame_for_gla_or_rtn(struct parse_state *s, int start_offset)
254 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
255 if(frame->frame_type == FRAME_TYPE_GLA)
257 assert(frame->f.gla_frame.gla_state->is_final == false);
258 return push_intfa_frame(s, frame->f.gla_frame.gla_state->d.nonfinal.intfa, start_offset);
260 else if(frame->frame_type == FRAME_TYPE_RTN)
262 return push_intfa_frame(s, frame->f.rtn_frame.rtn_state->d.state_intfa, start_offset);
264 assert(false);
265 return NULL;
268 struct parse_stack_frame *do_rtn_terminal_transition(struct parse_state *s,
269 struct rtn_transition *t,
270 struct terminal *terminal)
272 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
273 assert(frame->frame_type == FRAME_TYPE_RTN);
274 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
276 /* Call terminal callback if set */
277 if(s->bound_grammar->terminal_cb)
279 rtn_frame->rtn_transition = t;
280 s->bound_grammar->terminal_cb(s, terminal);
283 assert(t->transition_type == TERMINAL_TRANSITION);
284 rtn_frame->rtn_state = t->dest_state;
285 set_eof_ok_flag_for_rtn_frame(s);
286 return frame;
289 struct rtn_transition *find_rtn_terminal_transition(struct parse_state *s,
290 struct terminal *terminal)
292 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
293 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
294 for(int i = 0; i < rtn_frame->rtn_state->num_transitions; i++)
296 struct rtn_transition *t = &rtn_frame->rtn_state->transitions[i];
297 if(t->transition_type == TERMINAL_TRANSITION && t->edge.terminal_name == terminal->name)
299 return t;
303 return NULL;
307 * do_gla_transition(): transitions a GLA frame, performing the appropriate
308 * RTN transitions if this puts the GLA in a final state.
310 * Preconditions:
311 * - the current stack frame is a GLA frame
312 * - term is a terminal that came from this GLA state's intfa
314 * Postconditions:
315 * - the current stack frame is a GLA frame (this would indicate that
316 * the GLA hasn't hit a final state yet) or the current stack frame is
317 * an RTN frame (indicating we *have* hit a final state in the GLA)
319 struct parse_stack_frame *do_gla_transition(struct parse_state *s,
320 struct terminal *gla_term,
321 int *rtn_term_offset)
323 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
324 assert(frame->frame_type == FRAME_TYPE_GLA);
325 assert(frame->f.gla_frame.gla_state->is_final == false);
326 struct gla_state *gla_state = frame->f.gla_frame.gla_state;
327 struct gla_state *dest_gla_state = NULL;
329 for(int i = 0; i < gla_state->d.nonfinal.num_transitions; i++)
331 struct gla_transition *t = &gla_state->d.nonfinal.transitions[i];
332 if(t->term == gla_term->name)
334 frame->f.gla_frame.gla_state = dest_gla_state = t->dest_state;
337 assert(dest_gla_state);
339 if(dest_gla_state->is_final)
341 /* pop the GLA frame (since now we know what RTN transition to take)
342 * and use its information to make an RTN transition */
343 int offset = dest_gla_state->d.final.transition_offset;
344 frame = pop_gla_frame(s);
345 if(offset == 0)
347 frame = pop_rtn_frame(s);
349 else
351 struct rtn_transition *t = &frame->f.rtn_frame.rtn_state->transitions[offset-1];
352 struct terminal *next_term = &s->token_buffer[*rtn_term_offset];
353 if(t->transition_type == TERMINAL_TRANSITION)
355 /* The transition must match what we have in the token buffer */
356 (*rtn_term_offset)++;
357 assert(next_term->name == t->edge.terminal_name);
358 frame = do_rtn_terminal_transition(s, t, next_term);
360 else
362 frame = push_rtn_frame_for_transition(s, t, next_term->offset+next_term->len);
366 return frame;
370 * process_terminal(): processes a terminal that was just lexed, possibly
371 * triggering a series of RTN and/or GLA transitions.
373 * Preconditions:
374 * - the current stack frame is an intfa frame representing the intfa that
375 * just produced this terminal
376 * - the given terminal can be recognized by the current GLA or RTN state
378 * Postconditions:
379 * - the current stack frame is an intfa frame representing the state after
380 * all available GLA and RTN transitions have been taken.
383 struct intfa_frame *process_terminal(struct parse_state *s,
384 char *term_name,
385 int start_offset,
386 int len)
388 pop_intfa_frame(s);
390 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
391 int rtn_term_offset = 0;
392 int gla_term_offset = s->token_buffer_len;
394 RESIZE_DYNARRAY(s->token_buffer, s->token_buffer_len+1);
395 struct terminal *term = DYNARRAY_GET_TOP(s->token_buffer);
396 term->name = term_name;
397 term->offset = start_offset;
398 term->len = len;
400 /* Feed tokens to RTNs and GLAs until we have processed all the tokens we have */
401 while(frame != NULL &&
402 ((frame->frame_type == FRAME_TYPE_RTN && rtn_term_offset < s->token_buffer_len) ||
403 (frame->frame_type == FRAME_TYPE_GLA && gla_term_offset < s->token_buffer_len)))
405 struct terminal *rtn_term = &s->token_buffer[rtn_term_offset];
406 if(frame->frame_type == FRAME_TYPE_RTN)
408 struct rtn_transition *t;
409 rtn_term_offset++;
410 t = find_rtn_terminal_transition(s, rtn_term);
411 if(!t)
413 fprintf(stderr, "Parse error: unexpected terminal %s at offset %d\n",
414 rtn_term->name, rtn_term->offset);
415 exit(1);
418 frame = do_rtn_terminal_transition(s, t, rtn_term);
420 else
422 struct terminal *gla_term = &s->token_buffer[gla_term_offset++];
423 frame = do_gla_transition(s, gla_term, &rtn_term_offset);
425 bool entered_gla;
426 frame = descend_to_gla(s, &entered_gla, rtn_term->offset+rtn_term->len);
427 if(entered_gla)
429 gla_term_offset = rtn_term_offset;
433 /* Remove consumed terminals from token_buffer */
434 int remaining_terminals = s->token_buffer_len - rtn_term_offset;
436 if(frame == NULL)
438 /* EOF (as far as the parser is concerned */
439 assert(remaining_terminals == 0);
440 return NULL;
443 if(remaining_terminals > 0)
445 memmove(s->token_buffer, s->token_buffer + rtn_term_offset,
446 remaining_terminals * sizeof(*s->token_buffer));
448 RESIZE_DYNARRAY(s->token_buffer, remaining_terminals);
450 /* Now that we have processed all terminals that we currently can, push
451 * an intfa frame to handle the next bytes */
452 return push_intfa_frame_for_gla_or_rtn(s, start_offset+len);
456 * find_intfa_transition(): get the transition (if any) out of this state
457 * on this character.
459 struct intfa_transition *find_intfa_transition(struct intfa_frame *frame, char ch)
461 for(int i = 0; i < frame->intfa_state->num_transitions; i++)
463 struct intfa_transition *t = &frame->intfa_state->transitions[i];
464 if(ch >= t->ch_low && ch <= t->ch_high)
466 return t;
470 return NULL;
474 * do_intfa_transition(): transitions an IntFA frame according to the given
475 * char, performing the appropriate GLA/RTN transitions if this puts the IntFA
476 * in a final state.
478 * Preconditions:
479 * - the current stack frame is an IntFA frame
481 * Postconditions:
482 * - the current stack frame is an IntFA frame unless we have hit a
483 * hard EOF in which case it is an RTN frame. Note that it could be either
484 * same IntFA frame or a different one.
486 * Note: we currently implement longest-match, assuming that the first
487 * non-matching character is only one longer than the longest match.
489 struct intfa_frame *do_intfa_transition(struct parse_state *s,
490 struct intfa_frame *intfa_frame,
491 char ch)
493 struct intfa_transition *t = find_intfa_transition(intfa_frame, ch);
494 struct parse_stack_frame *frame = GET_PARSE_STACK_FRAME(intfa_frame);
496 /* If this character did not have any transition, but the state we're coming
497 * from is final, then longest-match semantics say that we should return
498 * the last character's final state as the token. But if the state we're
499 * coming from is *not* final, it's just a parse error. */
500 if(!t)
502 char *terminal = intfa_frame->intfa_state->final;
503 assert(terminal);
504 intfa_frame = process_terminal(s, terminal, frame->start_offset,
505 s->offset - frame->start_offset);
506 assert(intfa_frame); // if this fails, it means that we hit a hard EOF
508 /* This must succeed this time or it is a parse error */
509 t = find_intfa_transition(intfa_frame, ch);
510 assert(t);
513 /* We increment the offset here because we have just crossed the threshold
514 * where we have finished processing all terminals for the previous byte and
515 * started processing transitions for the current byte. */
516 s->offset++;
517 intfa_frame->intfa_state = t->dest_state;
518 frame->eof_ok = false;
520 /* If the current state is final and there are no outgoing transitions,
521 * we *know* we don't have to wait any longer for the longest match.
522 * Transition the RTN or GLA now, for more on-line behavior. */
523 if(intfa_frame->intfa_state->final && (intfa_frame->intfa_state->num_transitions == 0))
525 intfa_frame = process_terminal(s, intfa_frame->intfa_state->final,
526 frame->start_offset,
527 s->offset - frame->start_offset);
530 return intfa_frame;
533 enum parse_status parse(struct parse_state *s, char *buf, int buf_len,
534 int *out_consumed_buf_len, bool *out_eof_ok)
536 struct intfa_frame *intfa_frame;
538 /* For the first parse, we need to descend from the starting frame
539 * until we hit an IntFA frame. */
540 if(s->offset == 0)
542 bool entered_gla;
543 descend_to_gla(s, &entered_gla, 0);
544 intfa_frame = push_intfa_frame_for_gla_or_rtn(s, 0);
546 else
548 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
549 assert(frame->frame_type == FRAME_TYPE_INTFA);
550 intfa_frame = &frame->f.intfa_frame;
553 for(int i = 0; i < buf_len; i++)
555 intfa_frame = do_intfa_transition(s, intfa_frame, buf[i]);
556 if(intfa_frame == NULL)
558 if(out_consumed_buf_len) *out_consumed_buf_len = i+1;
559 if(out_eof_ok) *out_eof_ok = true;
560 assert(s->parse_stack_len == 0);
561 return PARSE_STATUS_EOF;
565 if(out_eof_ok) *out_eof_ok = DYNARRAY_GET_TOP(s->parse_stack)->eof_ok;
566 if(out_consumed_buf_len) *out_consumed_buf_len = buf_len;
568 return PARSE_STATUS_OK;
571 void finish_parse(struct parse_state *s)
573 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
574 while(s->parse_stack_len > 0)
576 assert(frame->eof_ok);
577 if(frame->frame_type == FRAME_TYPE_RTN)
578 frame = pop_rtn_frame(s);
579 else
580 frame = pop_frame(s);
584 void reinit_parse_state(struct parse_state *s, struct bound_grammar *bg)
586 s->offset = 0;
587 s->bound_grammar = bg;
588 RESIZE_DYNARRAY(s->parse_stack, 0);
589 RESIZE_DYNARRAY(s->token_buffer, 0);
590 push_rtn_frame(s, &bg->grammar->rtns[0], 0);
593 void free_parse_state(struct parse_state *s)
595 FREE_DYNARRAY(s->parse_stack);
596 FREE_DYNARRAY(s->token_buffer);
599 void init_parse_state(struct parse_state *s, struct bound_grammar *bg)
601 s->offset = 0;
602 s->bound_grammar = bg;
603 INIT_DYNARRAY(s->parse_stack, 0, 16);
604 INIT_DYNARRAY(s->token_buffer, 0, 2);
605 push_rtn_frame(s, &bg->grammar->rtns[0], 0);
610 * Local Variables:
611 * c-file-style: "bsd"
612 * c-basic-offset: 4
613 * indent-tabs-mode: nil
614 * End:
615 * vim:et:sts=4:sw=4