2 /* Parser implementation */
4 /* For a description, see the comments at end of this file */
6 /* XXX To do: error recovery */
9 #include "pgenheaders.h"
18 extern int Py_DebugFlag
;
19 #define D(x) if (!Py_DebugFlag); else x
27 static void s_reset(stack
*);
32 s
->s_top
= &s
->s_base
[MAXSTACK
];
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
38 s_push(register stack
*s
, dfa
*d
, node
*parent
)
40 register stackentry
*top
;
41 if (s
->s_top
== s
->s_base
) {
42 fprintf(stderr
, "s_push: parser stack overflow\n");
47 top
->s_parent
= parent
;
55 s_pop(register stack
*s
)
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
64 #define s_pop(s) (s)->s_top++
72 PyParser_New(grammar
*g
, int start
)
77 PyGrammar_AddAccelerators(g
);
78 ps
= (parser_state
*)PyMem_MALLOC(sizeof(parser_state
));
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
85 ps
->p_tree
= PyNode_New(start
);
86 if (ps
->p_tree
== NULL
) {
90 s_reset(&ps
->p_stack
);
91 (void) s_push(&ps
->p_stack
, PyGrammar_FindDFA(g
, start
), ps
->p_tree
);
96 PyParser_Delete(parser_state
*ps
)
98 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps
->p_tree
);
105 /* PARSER STACK OPERATIONS */
108 shift(register stack
*s
, int type
, char *str
, int newstate
, int lineno
, int col_offset
)
112 err
= PyNode_AddChild(s
->s_top
->s_parent
, type
, str
, lineno
, col_offset
);
115 s
->s_top
->s_state
= newstate
;
120 push(register stack
*s
, int type
, dfa
*d
, int newstate
, int lineno
, int col_offset
)
124 n
= s
->s_top
->s_parent
;
126 err
= PyNode_AddChild(n
, type
, (char *)NULL
, lineno
, col_offset
);
129 s
->s_top
->s_state
= newstate
;
130 return s_push(s
, d
, CHILD(n
, NCH(n
)-1));
137 classify(parser_state
*ps
, int type
, char *str
)
139 grammar
*g
= ps
->p_grammar
;
140 register int n
= g
->g_ll
.ll_nlabels
;
143 register char *s
= str
;
144 register label
*l
= g
->g_ll
.ll_label
;
146 for (i
= n
; i
> 0; i
--, l
++) {
147 if (l
->lb_type
!= NAME
|| l
->lb_str
== NULL
||
148 l
->lb_str
[0] != s
[0] ||
149 strcmp(l
->lb_str
, s
) != 0)
151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
153 /* Leaving this in as an example */
154 if (!(ps
->p_flags
& CO_FUTURE_WITH_STATEMENT
)) {
155 if (s
[0] == 'w' && strcmp(s
, "with") == 0)
156 break; /* not a keyword yet */
157 else if (s
[0] == 'a' && strcmp(s
, "as") == 0)
158 break; /* not a keyword yet */
162 D(printf("It's a keyword\n"));
168 register label
*l
= g
->g_ll
.ll_label
;
170 for (i
= n
; i
> 0; i
--, l
++) {
171 if (l
->lb_type
== type
&& l
->lb_str
== NULL
) {
172 D(printf("It's a token we know\n"));
178 D(printf("Illegal token\n"));
182 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
184 /* Leaving this in as an example */
186 future_hack(parser_state
*ps
)
188 node
*n
= ps
->p_stack
.s_top
->s_parent
;
192 /* from __future__ import ..., must have at least 4 children */
197 if (STR(ch
) == NULL
|| strcmp(STR(ch
), "from") != 0)
200 if (NCH(ch
) == 1 && STR(CHILD(ch
, 0)) &&
201 strcmp(STR(CHILD(ch
, 0)), "__future__") != 0)
204 /* ch can be a star, a parenthesis or import_as_names */
205 if (TYPE(ch
) == STAR
)
207 if (TYPE(ch
) == LPAR
)
210 for (i
= 0; i
< NCH(ch
); i
+= 2) {
212 if (NCH(cch
) >= 1 && TYPE(CHILD(cch
, 0)) == NAME
) {
213 char *str_ch
= STR(CHILD(cch
, 0));
214 if (strcmp(str_ch
, FUTURE_WITH_STATEMENT
) == 0) {
215 ps
->p_flags
|= CO_FUTURE_WITH_STATEMENT
;
216 } else if (strcmp(str_ch
, FUTURE_PRINT_FUNCTION
) == 0) {
217 ps
->p_flags
|= CO_FUTURE_PRINT_FUNCTION
;
218 } else if (strcmp(str_ch
, FUTURE_UNICODE_LITERALS
) == 0) {
219 ps
->p_flags
|= CO_FUTURE_UNICODE_LITERALS
;
225 #endif /* future keyword */
228 PyParser_AddToken(register parser_state
*ps
, register int type
, char *str
,
229 int lineno
, int col_offset
, int *expected_ret
)
234 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames
[type
], str
));
236 /* Find out which label this token is */
237 ilabel
= classify(ps
, type
, str
);
241 /* Loop until the token is shifted or an error occurred */
243 /* Fetch the current dfa and state */
244 register dfa
*d
= ps
->p_stack
.s_top
->s_dfa
;
245 register state
*s
= &d
->d_state
[ps
->p_stack
.s_top
->s_state
];
247 D(printf(" DFA '%s', state %d:",
248 d
->d_name
, ps
->p_stack
.s_top
->s_state
));
250 /* Check accelerator */
251 if (s
->s_lower
<= ilabel
&& ilabel
< s
->s_upper
) {
252 register int x
= s
->s_accel
[ilabel
- s
->s_lower
];
255 /* Push non-terminal */
256 int nt
= (x
>> 8) + NT_OFFSET
;
257 int arrow
= x
& ((1<<7)-1);
258 dfa
*d1
= PyGrammar_FindDFA(
260 if ((err
= push(&ps
->p_stack
, nt
, d1
,
261 arrow
, lineno
, col_offset
)) > 0) {
262 D(printf(" MemError: push\n"));
265 D(printf(" Push ...\n"));
269 /* Shift the token */
270 if ((err
= shift(&ps
->p_stack
, type
, str
,
271 x
, lineno
, col_offset
)) > 0) {
272 D(printf(" MemError: shift.\n"));
275 D(printf(" Shift.\n"));
276 /* Pop while we are in an accept-only state */
277 while (s
= &d
->d_state
278 [ps
->p_stack
.s_top
->s_state
],
279 s
->s_accept
&& s
->s_narcs
== 1) {
280 D(printf(" DFA '%s', state %d: "
283 ps
->p_stack
.s_top
->s_state
));
284 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
286 if (d
->d_name
[0] == 'i' &&
293 if (s_empty(&ps
->p_stack
)) {
294 D(printf(" ACCEPT.\n"));
297 d
= ps
->p_stack
.s_top
->s_dfa
;
304 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
306 if (d
->d_name
[0] == 'i' &&
307 strcmp(d
->d_name
, "import_stmt") == 0)
311 /* Pop this dfa and try again */
313 D(printf(" Pop ...\n"));
314 if (s_empty(&ps
->p_stack
)) {
315 D(printf(" Error: bottom of stack.\n"));
321 /* Stuck, report syntax error */
322 D(printf(" Error.\n"));
324 if (s
->s_lower
== s
->s_upper
- 1) {
325 /* Only one possible expected token */
326 *expected_ret
= ps
->p_grammar
->
327 g_ll
.ll_label
[s
->s_lower
].lb_type
;
342 dumptree(grammar
*g
, node
*n
)
352 printf("%s", PyGrammar_LabelRepr(&l
));
353 if (ISNONTERMINAL(TYPE(n
))) {
355 for (i
= 0; i
< NCH(n
); i
++) {
358 dumptree(g
, CHILD(n
, i
));
366 showtree(grammar
*g
, node
*n
)
372 if (ISNONTERMINAL(TYPE(n
))) {
373 for (i
= 0; i
< NCH(n
); i
++)
374 showtree(g
, CHILD(n
, i
));
376 else if (ISTERMINAL(TYPE(n
))) {
377 printf("%s", _PyParser_TokenNames
[TYPE(n
)]);
378 if (TYPE(n
) == NUMBER
|| TYPE(n
) == NAME
)
379 printf("(%s)", STR(n
));
387 printtree(parser_state
*ps
)
390 printf("Parse tree:\n");
391 dumptree(ps
->p_grammar
, ps
->p_tree
);
394 showtree(ps
->p_grammar
, ps
->p_tree
);
397 printf("Listing:\n");
398 PyNode_ListTree(ps
->p_tree
);
402 #endif /* Py_DEBUG */
409 The parser's interface is different than usual: the function addtoken()
410 must be called for each token in the input. This makes it possible to
411 turn it into an incremental parsing system later. The parsing system
412 constructs a parse tree as it goes.
414 A parsing rule is represented as a Deterministic Finite-state Automaton
415 (DFA). A node in a DFA represents a state of the parser; an arc represents
416 a transition. Transitions are either labeled with terminal symbols or
417 with non-terminals. When the parser decides to follow an arc labeled
418 with a non-terminal, it is invoked recursively with the DFA representing
419 the parsing rule for that as its initial state; when that DFA accepts,
420 the parser that invoked it continues. The parse tree constructed by the
421 recursively called parser is inserted as a child in the current parse tree.
423 The DFA's can be constructed automatically from a more conventional
424 language description. An extended LL(1) grammar (ELL(1)) is suitable.
425 Certain restrictions make the parser's life easier: rules that can produce
426 the empty string should be outlawed (there are other ways to put loops
427 or optional parts in the language). To avoid the need to construct
428 FIRST sets, we can require that all but the last alternative of a rule
429 (really: arc going out of a DFA's state) must begin with a terminal
432 As an example, consider this grammar:
434 expr: term (OP term)*
435 term: CONSTANT | '(' expr ')'
437 The DFA corresponding to the rule for expr is:
439 ------->.---term-->.------->
444 The parse tree generated for the input a+b is:
446 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))