Move BIBLIOGRAPHY and FILEFORMAT to docs/
[gazelle.git] / runtime / interpreter.c
blobe3612a54f1307e1e45bbfe8c8380c0ce92506294
1 /*********************************************************************
3 Gazelle: a system for building fast, reusable parsers
5 interpreter.c
7 Once a compiled grammar has been loaded into memory, the routines
8 in this file are what actually does the parsing. This file is an
9 "interpreter" in the sense that it parses the input by using the
10 grammar as a data structure -- no grammar-specific code is ever
11 generated or executed. Despite this, it is still quite fast, and
12 has a very low memory footprint.
14 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
16 *********************************************************************/
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <assert.h>
21 #include <string.h>
22 #include "interpreter.h"
24 struct grammar *global_g;
26 void dump_stack(struct parse_state *s, struct grammar *g, FILE *output)
28 fprintf(output, "Stack dump:\n");
29 if(g == NULL) g = global_g;
30 for(int i = 0; i < s->parse_stack_len; i++)
32 struct parse_stack_frame *frame = &s->parse_stack[i];
33 switch(frame->frame_type)
35 case FRAME_TYPE_RTN:
37 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
38 fprintf(output, "RTN: %s, start_offset: %d\n", rtn_frame->rtn->name,
39 rtn_frame->start_offset);
40 break;
43 case FRAME_TYPE_GLA:
45 struct gla_frame *gla_frame = &frame->f.gla_frame;
46 fprintf(output, "GLA: #%d, start_offset: %d\n", gla_frame->gla - g->glas,
47 gla_frame->start_offset);
48 break;
51 case FRAME_TYPE_INTFA:
53 struct intfa_frame *intfa_frame = &frame->f.intfa_frame;
54 fprintf(output, "IntFA: #%d, start_offset: %d\n", intfa_frame->intfa - g->intfas,
55 intfa_frame->start_offset);
56 break;
60 fprintf(output, "\n");
63 struct parse_stack_frame *push_empty_frame(struct parse_state *s, enum frame_type frame_type)
65 RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len+1);
66 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
67 frame->frame_type = frame_type;
68 return frame;
71 struct intfa_frame *push_intfa_frame(struct parse_state *s, struct intfa *intfa, int start_offset)
73 struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_INTFA);
74 struct intfa_frame *intfa_frame = &frame->f.intfa_frame;
75 intfa_frame->intfa = intfa;
76 intfa_frame->intfa_state = &intfa->states[0];
77 intfa_frame->start_offset = start_offset;
78 return intfa_frame;
81 struct parse_stack_frame *push_gla_frame(struct parse_state *s, struct gla *gla, int start_offset)
83 struct parse_stack_frame *frame = push_empty_frame(s, FRAME_TYPE_GLA);
84 struct gla_frame *gla_frame = &frame->f.gla_frame;
85 gla_frame->gla = gla;
86 gla_frame->gla_state = &gla->states[0];
87 gla_frame->start_offset = start_offset;
88 return frame;
91 struct parse_stack_frame *push_rtn_frame(struct parse_state *s, struct rtn *rtn, int start_offset)
93 struct parse_stack_frame *new_frame = push_empty_frame(s, FRAME_TYPE_RTN);
94 struct rtn_frame *new_rtn_frame = &new_frame->f.rtn_frame;
96 new_rtn_frame->rtn = rtn;
97 new_rtn_frame->rtn_transition = NULL;
98 new_rtn_frame->rtn_state = &new_rtn_frame->rtn->states[0];
99 new_rtn_frame->start_offset = start_offset;
101 /* Call start rule callback if set */
102 if(s->bound_grammar->start_rule_cb)
104 s->bound_grammar->start_rule_cb(s);
107 return new_frame;
110 struct parse_stack_frame *push_rtn_frame_for_transition(struct parse_state *s,
111 struct rtn_transition *t,
112 int start_offset)
114 struct rtn_frame *old_rtn_frame = &DYNARRAY_GET_TOP(s->parse_stack)->f.rtn_frame;
115 old_rtn_frame->rtn_transition = t;
116 return push_rtn_frame(s, t->edge.nonterminal, start_offset);
119 struct parse_stack_frame *pop_frame(struct parse_state *s)
121 assert(s->parse_stack_len > 0);
122 RESIZE_DYNARRAY(s->parse_stack, s->parse_stack_len-1);
124 struct parse_stack_frame *frame;
125 if(s->parse_stack_len > 0)
126 frame = DYNARRAY_GET_TOP(s->parse_stack);
127 else
128 frame = NULL;
130 return frame;
133 struct parse_stack_frame *pop_rtn_frame(struct parse_state *s)
135 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_RTN);
137 /* Call end rule callback if set */
138 if(s->bound_grammar->end_rule_cb)
140 s->bound_grammar->end_rule_cb(s);
143 struct parse_stack_frame *frame = pop_frame(s);
144 if(frame != NULL)
146 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
147 if(rtn_frame->rtn_transition)
149 rtn_frame->rtn_state = rtn_frame->rtn_transition->dest_state;
152 return frame;
155 struct parse_stack_frame *pop_gla_frame(struct parse_state *s)
157 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_GLA);
158 return pop_frame(s);
161 struct parse_stack_frame *pop_intfa_frame(struct parse_state *s)
163 assert(DYNARRAY_GET_TOP(s->parse_stack)->frame_type == FRAME_TYPE_INTFA);
164 return pop_frame(s);
168 * descend_to_gla(): given the current parse stack, pushes any RTN or GLA
169 * stack frames representing transitions that can be taken without consuming
170 * any terminals.
172 * Preconditions:
173 * - the current frame is either an RTN frame or a GLA frame
175 * Postconditions:
176 * - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
177 * entered, entered_gla is set to true.
179 struct parse_stack_frame *descend_to_gla(struct parse_state *s, bool *entered_gla, int start_offset)
181 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
182 *entered_gla = false;
184 while(frame->frame_type == FRAME_TYPE_RTN)
186 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
187 switch(rtn_frame->rtn_state->lookahead_type)
189 case STATE_HAS_INTFA:
190 return frame;
191 break;
193 case STATE_HAS_GLA:
194 *entered_gla = true;
195 return push_gla_frame(s, rtn_frame->rtn_state->d.state_gla, start_offset);
196 break;
198 case STATE_HAS_NEITHER:
199 /* An RTN state has neither an IntFA or a GLA in only two cases:
200 * - it is a final state with no outgoing transitions
201 * - it is a nonfinal state with only one transition (a nonterminal) */
202 assert(rtn_frame->rtn_state->num_transitions < 2);
203 if(rtn_frame->rtn_state->num_transitions == 0)
205 /* Final state */
206 frame = pop_rtn_frame(s);
207 if(frame == NULL) return NULL;
209 else if(rtn_frame->rtn_state->num_transitions == 1)
211 assert(rtn_frame->rtn_state->transitions[0].transition_type == NONTERM_TRANSITION);
212 frame = push_rtn_frame_for_transition(s, &rtn_frame->rtn_state->transitions[0],
213 start_offset);
215 break;
218 return frame;
221 struct intfa_frame *push_intfa_frame_for_gla_or_rtn(struct parse_state *s, int start_offset)
223 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
224 if(frame->frame_type == FRAME_TYPE_GLA)
226 assert(frame->f.gla_frame.gla_state->is_final == false);
227 return push_intfa_frame(s, frame->f.gla_frame.gla_state->d.nonfinal.intfa, start_offset);
229 else if(frame->frame_type == FRAME_TYPE_RTN)
231 return push_intfa_frame(s, frame->f.rtn_frame.rtn_state->d.state_intfa, start_offset);
233 assert(false);
234 return NULL;
237 struct parse_stack_frame *do_rtn_terminal_transition(struct parse_state *s,
238 struct rtn_transition *t,
239 struct terminal *terminal)
241 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
242 assert(frame->frame_type == FRAME_TYPE_RTN);
243 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
245 /* Call terminal callback if set */
246 if(s->bound_grammar->terminal_cb)
248 rtn_frame->rtn_transition = t;
249 s->bound_grammar->terminal_cb(s, terminal);
252 assert(t->transition_type == TERMINAL_TRANSITION);
253 rtn_frame->rtn_state = t->dest_state;
254 return frame;
257 struct rtn_transition *find_rtn_terminal_transition(struct parse_state *s,
258 struct terminal *terminal)
260 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
261 struct rtn_frame *rtn_frame = &frame->f.rtn_frame;
262 for(int i = 0; i < rtn_frame->rtn_state->num_transitions; i++)
264 struct rtn_transition *t = &rtn_frame->rtn_state->transitions[i];
265 if(t->transition_type == TERMINAL_TRANSITION && t->edge.terminal_name == terminal->name)
267 return t;
271 return NULL;
275 * do_gla_transition(): transitions a GLA frame, performing the appropriate
276 * RTN transitions if this puts the GLA in a final state.
278 * Preconditions:
279 * - the current stack frame is a GLA frame
280 * - term is a terminal that came from this GLA state's intfa
282 * Postconditions:
283 * - the current stack frame is a GLA frame (this would indicate that
284 * the GLA hasn't hit a final state yet) or the current stack frame is
285 * an RTN frame (indicating we *have* hit a final state in the GLA)
287 struct parse_stack_frame *do_gla_transition(struct parse_state *s,
288 struct terminal *gla_term,
289 int *rtn_term_offset)
291 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
292 assert(frame->frame_type == FRAME_TYPE_GLA);
293 assert(frame->f.gla_frame.gla_state->is_final == false);
294 struct gla_state *gla_state = frame->f.gla_frame.gla_state;
295 struct gla_state *dest_gla_state = NULL;
297 for(int i = 0; i < gla_state->d.nonfinal.num_transitions; i++)
299 struct gla_transition *t = &gla_state->d.nonfinal.transitions[i];
300 if(t->term == gla_term->name)
302 frame->f.gla_frame.gla_state = dest_gla_state = t->dest_state;
305 assert(dest_gla_state);
307 if(dest_gla_state->is_final)
309 /* pop the GLA frame (since now we know what RTN transition to take)
310 * and use its information to make an RTN transition */
311 int offset = dest_gla_state->d.final.transition_offset;
312 frame = pop_gla_frame(s);
313 if(offset == 0)
315 frame = pop_rtn_frame(s);
317 else
319 struct rtn_transition *t = &frame->f.rtn_frame.rtn_state->transitions[offset-1];
320 struct terminal *next_term = &s->token_buffer[*rtn_term_offset];
321 if(t->transition_type == TERMINAL_TRANSITION)
323 /* The transition must match what we have in the token buffer */
324 (*rtn_term_offset)++;
325 assert(next_term->name == t->edge.terminal_name);
326 frame = do_rtn_terminal_transition(s, t, next_term);
328 else
330 frame = push_rtn_frame_for_transition(s, t, next_term->offset+next_term->len);
334 return frame;
338 * process_terminal(): processes a terminal that was just lexed, possibly
339 * triggering a series of RTN and/or GLA transitions.
341 * Preconditions:
342 * - the current stack frame is an intfa frame representing the intfa that
343 * just produced this terminal
344 * - the given terminal can be recognized by the current GLA or RTN state
346 * Postconditions:
347 * - the current stack frame is an intfa frame representing the state after
348 * all available GLA and RTN transitions have been taken.
351 struct intfa_frame *process_terminal(struct parse_state *s,
352 char *term_name,
353 int start_offset,
354 int len)
356 pop_intfa_frame(s);
358 struct parse_stack_frame *frame = DYNARRAY_GET_TOP(s->parse_stack);
359 int rtn_term_offset = 0;
360 int gla_term_offset = s->token_buffer_len;
362 RESIZE_DYNARRAY(s->token_buffer, s->token_buffer_len+1);
363 struct terminal *term = DYNARRAY_GET_TOP(s->token_buffer);
364 term->name = term_name;
365 term->offset = start_offset;
366 term->len = len;
368 /* Feed tokens to RTNs and GLAs until we have processed all the tokens we have */
369 while(frame != NULL &&
370 ((frame->frame_type == FRAME_TYPE_RTN && rtn_term_offset < s->token_buffer_len) ||
371 (frame->frame_type == FRAME_TYPE_GLA && gla_term_offset < s->token_buffer_len)))
373 struct terminal *rtn_term = &s->token_buffer[rtn_term_offset];
374 if(frame->frame_type == FRAME_TYPE_RTN)
376 struct rtn_transition *t;
377 rtn_term_offset++;
378 t = find_rtn_terminal_transition(s, rtn_term);
379 if(!t)
381 fprintf(stderr, "Parse error: unexpected terminal %s at offset %d\n",
382 rtn_term->name, rtn_term->offset);
383 exit(1);
386 frame = do_rtn_terminal_transition(s, t, rtn_term);
388 else
390 struct terminal *gla_term = &s->token_buffer[gla_term_offset++];
391 frame = do_gla_transition(s, gla_term, &rtn_term_offset);
393 bool entered_gla;
394 frame = descend_to_gla(s, &entered_gla, rtn_term->offset+rtn_term->len);
395 if(entered_gla)
397 gla_term_offset = rtn_term_offset;
401 /* Remove consumed terminals from token_buffer */
402 int remaining_terminals = s->token_buffer_len - rtn_term_offset;
404 if(frame == NULL)
406 /* EOF (as far as the parser is concerned */
407 assert(remaining_terminals == 0);
408 return NULL;
411 if(remaining_terminals > 0)
413 memmove(s->token_buffer, s->token_buffer + rtn_term_offset,
414 remaining_terminals * sizeof(*s->token_buffer));
416 RESIZE_DYNARRAY(s->token_buffer, remaining_terminals);
418 /* Now that we have processed all terminals that we currently can, push
419 * an intfa frame to handle the next bytes */
420 return push_intfa_frame_for_gla_or_rtn(s, start_offset+len);
424 * find_intfa_transition(): get the transition (if any) out of this state
425 * on this character.
427 struct intfa_transition *find_intfa_transition(struct intfa_frame *frame, char ch)
429 for(int i = 0; i < frame->intfa_state->num_transitions; i++)
431 struct intfa_transition *t = &frame->intfa_state->transitions[i];
432 if(ch >= t->ch_low && ch <= t->ch_high)
434 return t;
438 return NULL;
442 * do_intfa_transition(): transitions an IntFA frame according to the given
443 * char, performing the appropriate GLA/RTN transitions if this puts the IntFA
444 * in a final state.
446 * Preconditions:
447 * - the current stack frame is an IntFA frame
449 * Postconditions:
450 * - the current stack frame is an IntFA frame unless we have hit a
451 * hard EOF in which case it is an RTN frame. Note that it could be either
452 * same IntFA frame or a different one.
454 * Note: we currently implement longest-match, assuming that the first
455 * non-matching character is only one longer than the longest match.
457 struct intfa_frame *do_intfa_transition(struct parse_state *s,
458 struct intfa_frame *intfa_frame,
459 char ch)
461 struct intfa_transition *t = find_intfa_transition(intfa_frame, ch);
463 /* If this character did not have any transition, but the state we're coming
464 * from is final, then longest-match semantics say that we should return
465 * the last character's final state as the token. But if the state we're
466 * coming from is *not* final, it's just a parse error. */
467 if(!t)
469 char *terminal = intfa_frame->intfa_state->final;
470 assert(terminal);
471 intfa_frame = process_terminal(s, terminal, intfa_frame->start_offset,
472 s->offset - intfa_frame->start_offset);
473 assert(intfa_frame); // if this fails, it means that we hit a hard EOF
475 /* This must succeed this time or it is a parse error */
476 t = find_intfa_transition(intfa_frame, ch);
477 assert(t);
480 intfa_frame->intfa_state = t->dest_state;
482 /* If the current state is final and there are no outgoing transitions,
483 * we *know* we don't have to wait any longer for the longest match.
484 * Transition the RTN or GLA now, for more on-line behavior. */
485 if(intfa_frame->intfa_state->final && (intfa_frame->intfa_state->num_transitions == 0))
487 intfa_frame = process_terminal(s, intfa_frame->intfa_state->final,
488 intfa_frame->start_offset,
489 s->offset - intfa_frame->start_offset + 1);
492 return intfa_frame;
495 enum parse_status parse(struct parse_state *s, char *buf, int buf_len,
496 int *out_consumed_buf_len, bool *out_eof_ok)
498 struct intfa_frame *intfa_frame;
499 global_g = s->bound_grammar->grammar;
501 /* For the first parse, we need to descend from the starting frame
502 * until we hit an IntFA frame. */
503 if(s->offset == 0)
505 bool entered_gla;
506 descend_to_gla(s, &entered_gla, 0);
507 intfa_frame = push_intfa_frame_for_gla_or_rtn(s, 0);
509 else
511 struct parse_stack_frame *frame = &s->parse_stack[s->parse_stack_len-1];
512 assert(frame->frame_type == FRAME_TYPE_INTFA);
513 intfa_frame = &frame->f.intfa_frame;
516 for(int i = 0; i < buf_len; i++)
518 intfa_frame = do_intfa_transition(s, intfa_frame, buf[i]);
519 s->offset++;
520 if(intfa_frame == NULL)
522 if (out_consumed_buf_len) *out_consumed_buf_len = i;
523 if (out_eof_ok) *out_eof_ok = true;
524 return PARSE_STATUS_EOF;
528 if(s->parse_stack[1].frame_type != FRAME_TYPE_RTN &&
529 s->parse_stack[0].f.rtn_frame.rtn_state->is_final)
531 if (out_eof_ok) *out_eof_ok = true;
533 else
535 if (out_eof_ok) *out_eof_ok = false;
538 if (out_consumed_buf_len) *out_consumed_buf_len = buf_len;
539 return PARSE_STATUS_OK;
542 void reinit_parse_state(struct parse_state *s, struct bound_grammar *bg)
544 s->offset = 0;
545 s->bound_grammar = bg;
546 RESIZE_DYNARRAY(s->parse_stack, 0);
547 RESIZE_DYNARRAY(s->token_buffer, 0);
548 push_rtn_frame(s, &bg->grammar->rtns[0], 0);
551 void free_parse_state(struct parse_state *s)
553 FREE_DYNARRAY(s->parse_stack);
554 FREE_DYNARRAY(s->token_buffer);
557 void init_parse_state(struct parse_state *s, struct bound_grammar *bg)
559 s->offset = 0;
560 s->bound_grammar = bg;
561 INIT_DYNARRAY(s->parse_stack, 0, 16);
562 INIT_DYNARRAY(s->token_buffer, 0, 2);
563 push_rtn_frame(s, &bg->grammar->rtns[0], 0);
568 * Local Variables:
569 * c-file-style: "bsd"
570 * c-basic-offset: 4
571 * indent-tabs-mode: nil
572 * End:
573 * vim:et:sts=4:sw=4