1 /*********************************************************************
3 Gazelle: a system for building fast, reusable parsers
7 Once a compiled grammar has been loaded into memory, the routines
8 in this file are what actually does the parsing. This file is an
9 "interpreter" in the sense that it parses the input by using the
10 grammar as a data structure -- no grammar-specific code is ever
11 generated or executed. Despite this, it is still quite fast, and
12 has a very low memory footprint.
14 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
16 *********************************************************************/
22 #include "interpreter.h"
24 struct grammar
*global_g
;
26 void dump_stack(struct parse_state
*s
, struct grammar
*g
, FILE *output
)
28 fprintf(output
, "Stack dump:\n");
29 if(g
== NULL
) g
= global_g
;
30 for(int i
= 0; i
< s
->parse_stack_len
; i
++)
32 struct parse_stack_frame
*frame
= &s
->parse_stack
[i
];
33 switch(frame
->frame_type
)
37 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
38 fprintf(output
, "RTN: %s, start_offset: %d\n", rtn_frame
->rtn
->name
,
39 rtn_frame
->start_offset
);
45 struct gla_frame
*gla_frame
= &frame
->f
.gla_frame
;
46 fprintf(output
, "GLA: #%d, start_offset: %d\n", gla_frame
->gla
- g
->glas
,
47 gla_frame
->start_offset
);
51 case FRAME_TYPE_INTFA
:
53 struct intfa_frame
*intfa_frame
= &frame
->f
.intfa_frame
;
54 fprintf(output
, "IntFA: #%d, start_offset: %d\n", intfa_frame
->intfa
- g
->intfas
,
55 intfa_frame
->start_offset
);
60 fprintf(output
, "\n");
63 struct parse_stack_frame
*push_empty_frame(struct parse_state
*s
, enum frame_type frame_type
)
65 RESIZE_DYNARRAY(s
->parse_stack
, s
->parse_stack_len
+1);
66 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
67 frame
->frame_type
= frame_type
;
71 struct intfa_frame
*push_intfa_frame(struct parse_state
*s
, struct intfa
*intfa
, int start_offset
)
73 struct parse_stack_frame
*frame
= push_empty_frame(s
, FRAME_TYPE_INTFA
);
74 struct intfa_frame
*intfa_frame
= &frame
->f
.intfa_frame
;
75 intfa_frame
->intfa
= intfa
;
76 intfa_frame
->intfa_state
= &intfa
->states
[0];
77 intfa_frame
->start_offset
= start_offset
;
81 struct parse_stack_frame
*push_gla_frame(struct parse_state
*s
, struct gla
*gla
, int start_offset
)
83 struct parse_stack_frame
*frame
= push_empty_frame(s
, FRAME_TYPE_GLA
);
84 struct gla_frame
*gla_frame
= &frame
->f
.gla_frame
;
86 gla_frame
->gla_state
= &gla
->states
[0];
87 gla_frame
->start_offset
= start_offset
;
91 struct parse_stack_frame
*push_rtn_frame(struct parse_state
*s
, struct rtn
*rtn
, int start_offset
)
93 struct parse_stack_frame
*new_frame
= push_empty_frame(s
, FRAME_TYPE_RTN
);
94 struct rtn_frame
*new_rtn_frame
= &new_frame
->f
.rtn_frame
;
96 new_rtn_frame
->rtn
= rtn
;
97 new_rtn_frame
->rtn_transition
= NULL
;
98 new_rtn_frame
->rtn_state
= &new_rtn_frame
->rtn
->states
[0];
99 new_rtn_frame
->start_offset
= start_offset
;
101 /* Call start rule callback if set */
102 if(s
->bound_grammar
->start_rule_cb
)
104 s
->bound_grammar
->start_rule_cb(s
);
110 struct parse_stack_frame
*push_rtn_frame_for_transition(struct parse_state
*s
,
111 struct rtn_transition
*t
,
114 struct rtn_frame
*old_rtn_frame
= &DYNARRAY_GET_TOP(s
->parse_stack
)->f
.rtn_frame
;
115 old_rtn_frame
->rtn_transition
= t
;
116 return push_rtn_frame(s
, t
->edge
.nonterminal
, start_offset
);
119 struct parse_stack_frame
*pop_frame(struct parse_state
*s
)
121 assert(s
->parse_stack_len
> 0);
122 RESIZE_DYNARRAY(s
->parse_stack
, s
->parse_stack_len
-1);
124 struct parse_stack_frame
*frame
;
125 if(s
->parse_stack_len
> 0)
126 frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
133 struct parse_stack_frame
*pop_rtn_frame(struct parse_state
*s
)
135 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_RTN
);
137 /* Call end rule callback if set */
138 if(s
->bound_grammar
->end_rule_cb
)
140 s
->bound_grammar
->end_rule_cb(s
);
143 struct parse_stack_frame
*frame
= pop_frame(s
);
146 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
147 if(rtn_frame
->rtn_transition
)
149 rtn_frame
->rtn_state
= rtn_frame
->rtn_transition
->dest_state
;
155 struct parse_stack_frame
*pop_gla_frame(struct parse_state
*s
)
157 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_GLA
);
161 struct parse_stack_frame
*pop_intfa_frame(struct parse_state
*s
)
163 assert(DYNARRAY_GET_TOP(s
->parse_stack
)->frame_type
== FRAME_TYPE_INTFA
);
168 * descend_to_gla(): given the current parse stack, pushes any RTN or GLA
169 * stack frames representing transitions that can be taken without consuming
173 * - the current frame is either an RTN frame or a GLA frame
176 * - the current frame is an RTN frame or a GLA frame. If a new GLA frame was
177 * entered, entered_gla is set to true.
179 struct parse_stack_frame
*descend_to_gla(struct parse_state
*s
, bool *entered_gla
, int start_offset
)
181 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
182 *entered_gla
= false;
184 while(frame
->frame_type
== FRAME_TYPE_RTN
)
186 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
187 switch(rtn_frame
->rtn_state
->lookahead_type
)
189 case STATE_HAS_INTFA
:
195 return push_gla_frame(s
, rtn_frame
->rtn_state
->d
.state_gla
, start_offset
);
198 case STATE_HAS_NEITHER
:
199 /* An RTN state has neither an IntFA or a GLA in only two cases:
200 * - it is a final state with no outgoing transitions
201 * - it is a nonfinal state with only one transition (a nonterminal) */
202 assert(rtn_frame
->rtn_state
->num_transitions
< 2);
203 if(rtn_frame
->rtn_state
->num_transitions
== 0)
206 frame
= pop_rtn_frame(s
);
207 if(frame
== NULL
) return NULL
;
209 else if(rtn_frame
->rtn_state
->num_transitions
== 1)
211 assert(rtn_frame
->rtn_state
->transitions
[0].transition_type
== NONTERM_TRANSITION
);
212 frame
= push_rtn_frame_for_transition(s
, &rtn_frame
->rtn_state
->transitions
[0],
221 struct intfa_frame
*push_intfa_frame_for_gla_or_rtn(struct parse_state
*s
, int start_offset
)
223 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
224 if(frame
->frame_type
== FRAME_TYPE_GLA
)
226 assert(frame
->f
.gla_frame
.gla_state
->is_final
== false);
227 return push_intfa_frame(s
, frame
->f
.gla_frame
.gla_state
->d
.nonfinal
.intfa
, start_offset
);
229 else if(frame
->frame_type
== FRAME_TYPE_RTN
)
231 return push_intfa_frame(s
, frame
->f
.rtn_frame
.rtn_state
->d
.state_intfa
, start_offset
);
237 struct parse_stack_frame
*do_rtn_terminal_transition(struct parse_state
*s
,
238 struct rtn_transition
*t
,
239 struct terminal
*terminal
)
241 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
242 assert(frame
->frame_type
== FRAME_TYPE_RTN
);
243 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
245 /* Call terminal callback if set */
246 if(s
->bound_grammar
->terminal_cb
)
248 rtn_frame
->rtn_transition
= t
;
249 s
->bound_grammar
->terminal_cb(s
, terminal
);
252 assert(t
->transition_type
== TERMINAL_TRANSITION
);
253 rtn_frame
->rtn_state
= t
->dest_state
;
257 struct rtn_transition
*find_rtn_terminal_transition(struct parse_state
*s
,
258 struct terminal
*terminal
)
260 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
261 struct rtn_frame
*rtn_frame
= &frame
->f
.rtn_frame
;
262 for(int i
= 0; i
< rtn_frame
->rtn_state
->num_transitions
; i
++)
264 struct rtn_transition
*t
= &rtn_frame
->rtn_state
->transitions
[i
];
265 if(t
->transition_type
== TERMINAL_TRANSITION
&& t
->edge
.terminal_name
== terminal
->name
)
275 * do_gla_transition(): transitions a GLA frame, performing the appropriate
276 * RTN transitions if this puts the GLA in a final state.
279 * - the current stack frame is a GLA frame
280 * - term is a terminal that came from this GLA state's intfa
283 * - the current stack frame is a GLA frame (this would indicate that
284 * the GLA hasn't hit a final state yet) or the current stack frame is
285 * an RTN frame (indicating we *have* hit a final state in the GLA)
287 struct parse_stack_frame
*do_gla_transition(struct parse_state
*s
,
288 struct terminal
*gla_term
,
289 int *rtn_term_offset
)
291 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
292 assert(frame
->frame_type
== FRAME_TYPE_GLA
);
293 assert(frame
->f
.gla_frame
.gla_state
->is_final
== false);
294 struct gla_state
*gla_state
= frame
->f
.gla_frame
.gla_state
;
295 struct gla_state
*dest_gla_state
= NULL
;
297 for(int i
= 0; i
< gla_state
->d
.nonfinal
.num_transitions
; i
++)
299 struct gla_transition
*t
= &gla_state
->d
.nonfinal
.transitions
[i
];
300 if(t
->term
== gla_term
->name
)
302 frame
->f
.gla_frame
.gla_state
= dest_gla_state
= t
->dest_state
;
305 assert(dest_gla_state
);
307 if(dest_gla_state
->is_final
)
309 /* pop the GLA frame (since now we know what RTN transition to take)
310 * and use its information to make an RTN transition */
311 int offset
= dest_gla_state
->d
.final
.transition_offset
;
312 frame
= pop_gla_frame(s
);
315 frame
= pop_rtn_frame(s
);
319 struct rtn_transition
*t
= &frame
->f
.rtn_frame
.rtn_state
->transitions
[offset
-1];
320 struct terminal
*next_term
= &s
->token_buffer
[*rtn_term_offset
];
321 if(t
->transition_type
== TERMINAL_TRANSITION
)
323 /* The transition must match what we have in the token buffer */
324 (*rtn_term_offset
)++;
325 assert(next_term
->name
== t
->edge
.terminal_name
);
326 frame
= do_rtn_terminal_transition(s
, t
, next_term
);
330 frame
= push_rtn_frame_for_transition(s
, t
, next_term
->offset
+next_term
->len
);
338 * process_terminal(): processes a terminal that was just lexed, possibly
339 * triggering a series of RTN and/or GLA transitions.
342 * - the current stack frame is an intfa frame representing the intfa that
343 * just produced this terminal
344 * - the given terminal can be recognized by the current GLA or RTN state
347 * - the current stack frame is an intfa frame representing the state after
348 * all available GLA and RTN transitions have been taken.
351 struct intfa_frame
*process_terminal(struct parse_state
*s
,
358 struct parse_stack_frame
*frame
= DYNARRAY_GET_TOP(s
->parse_stack
);
359 int rtn_term_offset
= 0;
360 int gla_term_offset
= s
->token_buffer_len
;
362 RESIZE_DYNARRAY(s
->token_buffer
, s
->token_buffer_len
+1);
363 struct terminal
*term
= DYNARRAY_GET_TOP(s
->token_buffer
);
364 term
->name
= term_name
;
365 term
->offset
= start_offset
;
368 /* Feed tokens to RTNs and GLAs until we have processed all the tokens we have */
369 while(frame
!= NULL
&&
370 ((frame
->frame_type
== FRAME_TYPE_RTN
&& rtn_term_offset
< s
->token_buffer_len
) ||
371 (frame
->frame_type
== FRAME_TYPE_GLA
&& gla_term_offset
< s
->token_buffer_len
)))
373 struct terminal
*rtn_term
= &s
->token_buffer
[rtn_term_offset
];
374 if(frame
->frame_type
== FRAME_TYPE_RTN
)
376 struct rtn_transition
*t
;
378 t
= find_rtn_terminal_transition(s
, rtn_term
);
381 fprintf(stderr
, "Parse error: unexpected terminal %s at offset %d\n",
382 rtn_term
->name
, rtn_term
->offset
);
386 frame
= do_rtn_terminal_transition(s
, t
, rtn_term
);
390 struct terminal
*gla_term
= &s
->token_buffer
[gla_term_offset
++];
391 frame
= do_gla_transition(s
, gla_term
, &rtn_term_offset
);
394 frame
= descend_to_gla(s
, &entered_gla
, rtn_term
->offset
+rtn_term
->len
);
397 gla_term_offset
= rtn_term_offset
;
401 /* Remove consumed terminals from token_buffer */
402 int remaining_terminals
= s
->token_buffer_len
- rtn_term_offset
;
406 /* EOF (as far as the parser is concerned */
407 assert(remaining_terminals
== 0);
411 if(remaining_terminals
> 0)
413 memmove(s
->token_buffer
, s
->token_buffer
+ rtn_term_offset
,
414 remaining_terminals
* sizeof(*s
->token_buffer
));
416 RESIZE_DYNARRAY(s
->token_buffer
, remaining_terminals
);
418 /* Now that we have processed all terminals that we currently can, push
419 * an intfa frame to handle the next bytes */
420 return push_intfa_frame_for_gla_or_rtn(s
, start_offset
+len
);
424 * find_intfa_transition(): get the transition (if any) out of this state
427 struct intfa_transition
*find_intfa_transition(struct intfa_frame
*frame
, char ch
)
429 for(int i
= 0; i
< frame
->intfa_state
->num_transitions
; i
++)
431 struct intfa_transition
*t
= &frame
->intfa_state
->transitions
[i
];
432 if(ch
>= t
->ch_low
&& ch
<= t
->ch_high
)
442 * do_intfa_transition(): transitions an IntFA frame according to the given
443 * char, performing the appropriate GLA/RTN transitions if this puts the IntFA
447 * - the current stack frame is an IntFA frame
450 * - the current stack frame is an IntFA frame unless we have hit a
451 * hard EOF in which case it is an RTN frame. Note that it could be either
452 * same IntFA frame or a different one.
454 * Note: we currently implement longest-match, assuming that the first
455 * non-matching character is only one longer than the longest match.
457 struct intfa_frame
*do_intfa_transition(struct parse_state
*s
,
458 struct intfa_frame
*intfa_frame
,
461 struct intfa_transition
*t
= find_intfa_transition(intfa_frame
, ch
);
463 /* If this character did not have any transition, but the state we're coming
464 * from is final, then longest-match semantics say that we should return
465 * the last character's final state as the token. But if the state we're
466 * coming from is *not* final, it's just a parse error. */
469 char *terminal
= intfa_frame
->intfa_state
->final
;
471 intfa_frame
= process_terminal(s
, terminal
, intfa_frame
->start_offset
,
472 s
->offset
- intfa_frame
->start_offset
);
473 assert(intfa_frame
); // if this fails, it means that we hit a hard EOF
475 /* This must succeed this time or it is a parse error */
476 t
= find_intfa_transition(intfa_frame
, ch
);
480 intfa_frame
->intfa_state
= t
->dest_state
;
482 /* If the current state is final and there are no outgoing transitions,
483 * we *know* we don't have to wait any longer for the longest match.
484 * Transition the RTN or GLA now, for more on-line behavior. */
485 if(intfa_frame
->intfa_state
->final
&& (intfa_frame
->intfa_state
->num_transitions
== 0))
487 intfa_frame
= process_terminal(s
, intfa_frame
->intfa_state
->final
,
488 intfa_frame
->start_offset
,
489 s
->offset
- intfa_frame
->start_offset
+ 1);
495 enum parse_status
parse(struct parse_state
*s
, char *buf
, int buf_len
,
496 int *out_consumed_buf_len
, bool *out_eof_ok
)
498 struct intfa_frame
*intfa_frame
;
499 global_g
= s
->bound_grammar
->grammar
;
501 /* For the first parse, we need to descend from the starting frame
502 * until we hit an IntFA frame. */
506 descend_to_gla(s
, &entered_gla
, 0);
507 intfa_frame
= push_intfa_frame_for_gla_or_rtn(s
, 0);
511 struct parse_stack_frame
*frame
= &s
->parse_stack
[s
->parse_stack_len
-1];
512 assert(frame
->frame_type
== FRAME_TYPE_INTFA
);
513 intfa_frame
= &frame
->f
.intfa_frame
;
516 for(int i
= 0; i
< buf_len
; i
++)
518 intfa_frame
= do_intfa_transition(s
, intfa_frame
, buf
[i
]);
520 if(intfa_frame
== NULL
)
522 if (out_consumed_buf_len
) *out_consumed_buf_len
= i
;
523 if (out_eof_ok
) *out_eof_ok
= true;
524 return PARSE_STATUS_EOF
;
528 if(s
->parse_stack
[1].frame_type
!= FRAME_TYPE_RTN
&&
529 s
->parse_stack
[0].f
.rtn_frame
.rtn_state
->is_final
)
531 if (out_eof_ok
) *out_eof_ok
= true;
535 if (out_eof_ok
) *out_eof_ok
= false;
538 if (out_consumed_buf_len
) *out_consumed_buf_len
= buf_len
;
539 return PARSE_STATUS_OK
;
542 void reinit_parse_state(struct parse_state
*s
, struct bound_grammar
*bg
)
545 s
->bound_grammar
= bg
;
546 RESIZE_DYNARRAY(s
->parse_stack
, 0);
547 RESIZE_DYNARRAY(s
->token_buffer
, 0);
548 push_rtn_frame(s
, &bg
->grammar
->rtns
[0], 0);
551 void free_parse_state(struct parse_state
*s
)
553 FREE_DYNARRAY(s
->parse_stack
);
554 FREE_DYNARRAY(s
->token_buffer
);
557 void init_parse_state(struct parse_state
*s
, struct bound_grammar
*bg
)
560 s
->bound_grammar
= bg
;
561 INIT_DYNARRAY(s
->parse_stack
, 0, 16);
562 INIT_DYNARRAY(s
->token_buffer
, 0, 2);
563 push_rtn_frame(s
, &bg
->grammar
->rtns
[0], 0);
569 * c-file-style: "bsd"
571 * indent-tabs-mode: nil