1 /*********************************************************************
3 Gazelle: a system for building fast, reusable parsers
7 Once a compiled grammar has been loaded into memory, the routines
8 in this file are what actually does the parsing. This file is an
9 "interpreter" in the sense that it parses the input by using the
10 grammar as a data structure -- no grammar-specific code is ever
11 generated or executed. Despite this, it is still quite fast, and
12 has a very low memory footprint.
14 The interpreter primarily consists of maintaining the parse stack
15 properly and transitioning the frames in response to the input.
17 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
19 *********************************************************************/
25 #include "interpreter.h"
28 * A diagnostic function for dumping the current state of the stack.
30 void dump_stack(struct parse_state
*s
, FILE *output
)
32 fprintf(output
, "Stack dump:\n");
33 struct grammar
*g
= s
->bound_grammar
->grammar
;
34 for(int i
= 0; i
< s
->parse_stack_len
; i
++)
36 struct parse_stack_frame
*frame
= &s
->parse_stack
[i
];
37 switch(frame
->frame_type
)
41 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
42 fprintf(output
, "RTN: %s", rtn_frame
->rtn
->name
);
48 struct gla_frame
*gla_frame
= &frame
->f
.gla_frame
;
49 fprintf(output
, "GLA: #%d", gla_frame
->gla
- g
->glas
);
53 case FRAME_TYPE_INTFA
:
55 struct intfa_frame
*intfa_frame
= &frame
->f
.intfa_frame
;
56 fprintf(output
, "IntFA: #%d", intfa_frame
->intfa
- g
->intfas
);
60 fprintf(output
, ", start_offset: %d, eof_ok: %d\n", frame
->start_offset
, frame
->eof_ok
);
62 fprintf(output
, "\n");
65 struct parse_stack_frame
*push_empty_frame(struct parse_state
*s
, enum frame_type frame_type
,
68 RESIZE_DYNARRAY(s
->parse_stack
, s
->parse_stack_len
+1);
69 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
70 frame
->frame_type
= frame_type
;
71 frame
->start_offset
= start_offset
;
75 struct intfa_frame
*push_intfa_frame(struct parse_state
*s
, struct intfa
*intfa
, int start_offset
)
77 struct parse_stack_frame
*old_frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
78 struct parse_stack_frame
*frame
= push_empty_frame(s
, FRAME_TYPE_INTFA
, start_offset
);
79 struct intfa_frame
*intfa_frame
= &frame
->f
.intfa_frame
;
80 intfa_frame
->intfa
= intfa
;
81 intfa_frame
->intfa_state
= &intfa
->states
[0];
83 /* IntFA frames start out being eof_ok if the parent frame is, but become not ok
84 * when they transition out of the initial state. */
85 frame
->eof_ok
= old_frame
->eof_ok
;
90 struct parse_stack_frame
*push_gla_frame(struct parse_state
*s
, struct gla
*gla
, int start_offset
)
92 struct parse_stack_frame
*old_frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
93 struct parse_stack_frame
*frame
= push_empty_frame(s
, FRAME_TYPE_GLA
, start_offset
);
94 struct gla_frame
*gla_frame
= &frame
->f
.gla_frame
;
96 gla_frame
->gla_state
= &gla
->states
[0];
98 /* GLA frames start out being eof_ok if the parent frame is, but become not ok
99 * when they transition out of the initial state. */
100 frame
->eof_ok
= old_frame
->eof_ok
;
105 struct parse_stack_frame
*push_rtn_frame(struct parse_state
*s
, struct rtn
*rtn
, int start_offset
)
107 struct parse_stack_frame
*old_frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
108 struct parse_stack_frame
*new_frame
= push_empty_frame(s
, FRAME_TYPE_RTN
, start_offset
);
109 struct rtn_frame
*new_rtn_frame
= &new_frame
->f
.rtn_frame
;
111 new_rtn_frame
->rtn
= rtn
;
112 new_rtn_frame
->rtn_transition
= NULL
;
113 new_rtn_frame
->rtn_state
= &new_rtn_frame
->rtn
->states
[0];
115 /* RTN frames start out being eof_ok iff their start state is a final state
116 * *and* their parent is eof_ok. */
117 new_frame
->eof_ok
= old_frame
->eof_ok
&& rtn
->states
[0].is_final
;
119 /* Call start rule callback if set */
120 if(s
->bound_grammar
->start_rule_cb
)
122 s
->bound_grammar
->start_rule_cb(s
);
128 struct parse_stack_frame
*push_rtn_frame_for_transition(struct parse_state
*s
,
129 struct rtn_transition
*t
,
132 struct rtn_frame
*old_rtn_frame
= &DYNARRAY_GET_TOP(s
->parse_stack
)->f
.rtn_frame
;
133 old_rtn_frame
->rtn_transition
= t
;
134 return push_rtn_frame(s
, t
->edge
.nonterminal
, start_offset
);
137 struct parse_stack_frame
*pop_frame(struct parse_state
*s
)
139 assert(s
->parse_stack_len
> 0);
140 RESIZE_DYNARRAY(s
->parse_stack
, s
->parse_stack_len
-1);
142 struct parse_stack_frame
*frame
;
143 if(s
->parse_stack_len
> 0)
144 frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
151 void set_eof_ok_flag_for_rtn_frame(struct parse_state
*s
)
153 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
154 assert(frame
->frame_type
== FRAME_TYPE_RTN
);
155 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
156 if(rtn_frame
->rtn_state
->is_final
&&
157 (s
->parse_stack_len
== 0 || s
->parse_stack
[s
->parse_stack_len
-2].eof_ok
))
158 frame
->eof_ok
= true;
160 frame
->eof_ok
= false;
163 struct parse_stack_frame
*pop_rtn_frame(struct parse_state
*s
)
165 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_RTN
);
167 /* Call end rule callback if set */
168 if(s
->bound_grammar
->end_rule_cb
)
170 s
->bound_grammar
->end_rule_cb(s
);
173 struct parse_stack_frame
*frame
= pop_frame(s
);
176 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
177 if(rtn_frame
->rtn_transition
)
179 rtn_frame
->rtn_state
= rtn_frame
->rtn_transition
->dest_state
;
180 set_eof_ok_flag_for_rtn_frame(s
);
186 struct parse_stack_frame
*pop_gla_frame(struct parse_state
*s
)
188 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_GLA
);
192 struct parse_stack_frame
*pop_intfa_frame(struct parse_state
*s
)
194 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_INTFA
);
199 * descend_to_gla(): given the current parse stack, pushes any RTN or GLA
200 * stack frames representing transitions that can be taken without consuming
204 * - the current frame is either an RTN frame or a GLA frame
207 * - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
208 * entered, entered_gla is set to true.
210 struct parse_stack_frame
*descend_to_gla(struct parse_state
*s
, bool *entered_gla
, int start_offset
)
212 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
213 *entered_gla
= false;
215 while(frame
->frame_type
== FRAME_TYPE_RTN
)
217 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
218 switch(rtn_frame
->rtn_state
->lookahead_type
)
220 case STATE_HAS_INTFA
:
226 return push_gla_frame(s
, rtn_frame
->rtn_state
->d
.state_gla
, start_offset
);
229 case STATE_HAS_NEITHER
:
230 /* An RTN state has neither an IntFA or a GLA in only two cases:
231 * - it is a final state with no outgoing transitions
232 * - it is a nonfinal state with only one transition (a nonterminal) */
233 assert(rtn_frame
->rtn_state
->num_transitions
< 2);
234 if(rtn_frame
->rtn_state
->num_transitions
== 0)
237 frame
= pop_rtn_frame(s
);
238 if(frame
== NULL
) return NULL
;
240 else if(rtn_frame
->rtn_state
->num_transitions
== 1)
242 assert(rtn_frame
->rtn_state
->transitions
[0].transition_type
== NONTERM_TRANSITION
);
243 frame
= push_rtn_frame_for_transition(s
, &rtn_frame
->rtn_state
->transitions
[0],
252 struct intfa_frame
*push_intfa_frame_for_gla_or_rtn(struct parse_state
*s
, int start_offset
)
254 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
255 if(frame
->frame_type
== FRAME_TYPE_GLA
)
257 assert(frame
->f
.gla_frame
.gla_state
->is_final
== false);
258 return push_intfa_frame(s
, frame
->f
.gla_frame
.gla_state
->d
.nonfinal
.intfa
, start_offset
);
260 else if(frame
->frame_type
== FRAME_TYPE_RTN
)
262 return push_intfa_frame(s
, frame
->f
.rtn_frame
.rtn_state
->d
.state_intfa
, start_offset
);
268 struct parse_stack_frame
*do_rtn_terminal_transition(struct parse_state
*s
,
269 struct rtn_transition
*t
,
270 struct terminal
*terminal
)
272 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
273 assert(frame
->frame_type
== FRAME_TYPE_RTN
);
274 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
276 /* Call terminal callback if set */
277 if(s
->bound_grammar
->terminal_cb
)
279 rtn_frame
->rtn_transition
= t
;
280 s
->bound_grammar
->terminal_cb(s
, terminal
);
283 assert(t
->transition_type
== TERMINAL_TRANSITION
);
284 rtn_frame
->rtn_state
= t
->dest_state
;
285 set_eof_ok_flag_for_rtn_frame(s
);
289 struct rtn_transition
*find_rtn_terminal_transition(struct parse_state
*s
,
290 struct terminal
*terminal
)
292 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
293 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
294 for(int i
= 0; i
< rtn_frame
->rtn_state
->num_transitions
; i
++)
296 struct rtn_transition
*t
= &rtn_frame
->rtn_state
->transitions
[i
];
297 if(t
->transition_type
== TERMINAL_TRANSITION
&& t
->edge
.terminal_name
== terminal
->name
)
307 * do_gla_transition(): transitions a GLA frame, performing the appropriate
308 * RTN transitions if this puts the GLA in a final state.
311 * - the current stack frame is a GLA frame
312 * - term is a terminal that came from this GLA state's intfa
315 * - the current stack frame is a GLA frame (this would indicate that
316 * the GLA hasn't hit a final state yet) or the current stack frame is
317 * an RTN frame (indicating we *have* hit a final state in the GLA)
319 struct parse_stack_frame
*do_gla_transition(struct parse_state
*s
,
320 struct terminal
*gla_term
,
321 int *rtn_term_offset
)
323 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
324 assert(frame
->frame_type
== FRAME_TYPE_GLA
);
325 assert(frame
->f
.gla_frame
.gla_state
->is_final
== false);
326 struct gla_state
*gla_state
= frame
->f
.gla_frame
.gla_state
;
327 struct gla_state
*dest_gla_state
= NULL
;
329 for(int i
= 0; i
< gla_state
->d
.nonfinal
.num_transitions
; i
++)
331 struct gla_transition
*t
= &gla_state
->d
.nonfinal
.transitions
[i
];
332 if(t
->term
== gla_term
->name
)
334 frame
->f
.gla_frame
.gla_state
= dest_gla_state
= t
->dest_state
;
337 assert(dest_gla_state
);
339 if(dest_gla_state
->is_final
)
341 /* pop the GLA frame (since now we know what RTN transition to take)
342 * and use its information to make an RTN transition */
343 int offset
= dest_gla_state
->d
.final
.transition_offset
;
344 frame
= pop_gla_frame(s
);
347 frame
= pop_rtn_frame(s
);
351 struct rtn_transition
*t
= &frame
->f
.rtn_frame
.rtn_state
->transitions
[offset
-1];
352 struct terminal
*next_term
= &s
->token_buffer
[*rtn_term_offset
];
353 if(t
->transition_type
== TERMINAL_TRANSITION
)
355 /* The transition must match what we have in the token buffer */
356 (*rtn_term_offset
)++;
357 assert(next_term
->name
== t
->edge
.terminal_name
);
358 frame
= do_rtn_terminal_transition(s
, t
, next_term
);
362 frame
= push_rtn_frame_for_transition(s
, t
, next_term
->offset
+next_term
->len
);
370 * process_terminal(): processes a terminal that was just lexed, possibly
371 * triggering a series of RTN and/or GLA transitions.
374 * - the current stack frame is an intfa frame representing the intfa that
375 * just produced this terminal
376 * - the given terminal can be recognized by the current GLA or RTN state
379 * - the current stack frame is an intfa frame representing the state after
380 * all available GLA and RTN transitions have been taken.
383 struct intfa_frame
*process_terminal(struct parse_state
*s
,
390 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
391 int rtn_term_offset
= 0;
392 int gla_term_offset
= s
->token_buffer_len
;
394 RESIZE_DYNARRAY(s
->token_buffer
, s
->token_buffer_len
+1);
395 struct terminal
*term
= DYNARRAY_GET_TOP(s
->token_buffer
);
396 term
->name
= term_name
;
397 term
->offset
= start_offset
;
400 /* Feed tokens to RTNs and GLAs until we have processed all the tokens we have */
401 while(frame
!= NULL
&&
402 ((frame
->frame_type
== FRAME_TYPE_RTN
&& rtn_term_offset
< s
->token_buffer_len
) ||
403 (frame
->frame_type
== FRAME_TYPE_GLA
&& gla_term_offset
< s
->token_buffer_len
)))
405 struct terminal
*rtn_term
= &s
->token_buffer
[rtn_term_offset
];
406 if(frame
->frame_type
== FRAME_TYPE_RTN
)
408 struct rtn_transition
*t
;
410 t
= find_rtn_terminal_transition(s
, rtn_term
);
413 fprintf(stderr
, "Parse error: unexpected terminal %s at offset %d\n",
414 rtn_term
->name
, rtn_term
->offset
);
418 frame
= do_rtn_terminal_transition(s
, t
, rtn_term
);
422 struct terminal
*gla_term
= &s
->token_buffer
[gla_term_offset
++];
423 frame
= do_gla_transition(s
, gla_term
, &rtn_term_offset
);
426 frame
= descend_to_gla(s
, &entered_gla
, rtn_term
->offset
+rtn_term
->len
);
429 gla_term_offset
= rtn_term_offset
;
433 /* Remove consumed terminals from token_buffer */
434 int remaining_terminals
= s
->token_buffer_len
- rtn_term_offset
;
438 /* EOF (as far as the parser is concerned */
439 assert(remaining_terminals
== 0);
443 if(remaining_terminals
> 0)
445 memmove(s
->token_buffer
, s
->token_buffer
+ rtn_term_offset
,
446 remaining_terminals
* sizeof(*s
->token_buffer
));
448 RESIZE_DYNARRAY(s
->token_buffer
, remaining_terminals
);
450 /* Now that we have processed all terminals that we currently can, push
451 * an intfa frame to handle the next bytes */
452 return push_intfa_frame_for_gla_or_rtn(s
, start_offset
+len
);
456 * find_intfa_transition(): get the transition (if any) out of this state
459 struct intfa_transition
*find_intfa_transition(struct intfa_frame
*frame
, char ch
)
461 for(int i
= 0; i
< frame
->intfa_state
->num_transitions
; i
++)
463 struct intfa_transition
*t
= &frame
->intfa_state
->transitions
[i
];
464 if(ch
>= t
->ch_low
&& ch
<= t
->ch_high
)
474 * do_intfa_transition(): transitions an IntFA frame according to the given
475 * char, performing the appropriate GLA/RTN transitions if this puts the IntFA
479 * - the current stack frame is an IntFA frame
482 * - the current stack frame is an IntFA frame unless we have hit a
483 * hard EOF in which case it is an RTN frame. Note that it could be either
484 * same IntFA frame or a different one.
486 * Note: we currently implement longest-match, assuming that the first
487 * non-matching character is only one longer than the longest match.
489 struct intfa_frame
*do_intfa_transition(struct parse_state
*s
,
490 struct intfa_frame
*intfa_frame
,
493 struct intfa_transition
*t
= find_intfa_transition(intfa_frame
, ch
);
494 struct parse_stack_frame
*frame
= GET_PARSE_STACK_FRAME(intfa_frame
);
496 /* If this character did not have any transition, but the state we're coming
497 * from is final, then longest-match semantics say that we should return
498 * the last character's final state as the token. But if the state we're
499 * coming from is *not* final, it's just a parse error. */
502 char *terminal
= intfa_frame
->intfa_state
->final
;
504 intfa_frame
= process_terminal(s
, terminal
, frame
->start_offset
,
505 s
->offset
- frame
->start_offset
);
506 assert(intfa_frame
); // if this fails, it means that we hit a hard EOF
508 /* This must succeed this time or it is a parse error */
509 t
= find_intfa_transition(intfa_frame
, ch
);
513 /* We increment the offset here because we have just crossed the threshold
514 * where we have finished processing all terminals for the previous byte and
515 * started processing transitions for the current byte. */
517 intfa_frame
->intfa_state
= t
->dest_state
;
518 frame
->eof_ok
= false;
520 /* If the current state is final and there are no outgoing transitions,
521 * we *know* we don't have to wait any longer for the longest match.
522 * Transition the RTN or GLA now, for more on-line behavior. */
523 if(intfa_frame
->intfa_state
->final
&& (intfa_frame
->intfa_state
->num_transitions
== 0))
525 intfa_frame
= process_terminal(s
, intfa_frame
->intfa_state
->final
,
527 s
->offset
- frame
->start_offset
);
533 enum parse_status
parse(struct parse_state
*s
, char *buf
, int buf_len
,
534 int *out_consumed_buf_len
, bool *out_eof_ok
)
536 struct intfa_frame
*intfa_frame
;
538 /* For the first parse, we need to descend from the starting frame
539 * until we hit an IntFA frame. */
543 descend_to_gla(s
, &entered_gla
, 0);
544 intfa_frame
= push_intfa_frame_for_gla_or_rtn(s
, 0);
548 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
549 assert(frame
->frame_type
== FRAME_TYPE_INTFA
);
550 intfa_frame
= &frame
->f
.intfa_frame
;
553 for(int i
= 0; i
< buf_len
; i
++)
555 intfa_frame
= do_intfa_transition(s
, intfa_frame
, buf
[i
]);
556 if(intfa_frame
== NULL
)
558 if(out_consumed_buf_len
) *out_consumed_buf_len
= i
+1;
559 if(out_eof_ok
) *out_eof_ok
= true;
560 assert(s
->parse_stack_len
== 0);
561 return PARSE_STATUS_EOF
;
565 if(out_eof_ok
) *out_eof_ok
= DYNARRAY_GET_TOP(s
->parse_stack
)->eof_ok
;
566 if(out_consumed_buf_len
) *out_consumed_buf_len
= buf_len
;
568 return PARSE_STATUS_OK
;
571 void finish_parse(struct parse_state
*s
)
573 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
574 while(s
->parse_stack_len
> 0)
576 assert(frame
->eof_ok
);
577 if(frame
->frame_type
== FRAME_TYPE_RTN
)
578 frame
= pop_rtn_frame(s
);
580 frame
= pop_frame(s
);
584 void reinit_parse_state(struct parse_state
*s
, struct bound_grammar
*bg
)
587 s
->bound_grammar
= bg
;
588 RESIZE_DYNARRAY(s
->parse_stack
, 0);
589 RESIZE_DYNARRAY(s
->token_buffer
, 0);
590 push_rtn_frame(s
, &bg
->grammar
->rtns
[0], 0);
593 void free_parse_state(struct parse_state
*s
)
595 FREE_DYNARRAY(s
->parse_stack
);
596 FREE_DYNARRAY(s
->token_buffer
);
599 void init_parse_state(struct parse_state
*s
, struct bound_grammar
*bg
)
602 s
->bound_grammar
= bg
;
603 INIT_DYNARRAY(s
->parse_stack
, 0, 16);
604 INIT_DYNARRAY(s
->token_buffer
, 0, 2);
605 push_rtn_frame(s
, &bg
->grammar
->rtns
[0], 0);
611 * c-file-style: "bsd"
613 * indent-tabs-mode: nil