Make python.vim output more deterministic.
[python.git] / Parser / parser.c
blob04d5817d6252f8ac50a030baf3ea1a9023b0b8f5
2 /* Parser implementation */
4 /* For a description, see the comments at end of this file */
6 /* XXX To do: error recovery */
8 #include "Python.h"
9 #include "pgenheaders.h"
10 #include "token.h"
11 #include "grammar.h"
12 #include "node.h"
13 #include "parser.h"
14 #include "errcode.h"
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
25 /* STACK DATA TYPE */
27 static void s_reset(stack *);
29 static void
30 s_reset(stack *s)
32 s->s_top = &s->s_base[MAXSTACK];
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
37 static int
38 s_push(register stack *s, dfa *d, node *parent)
40 register stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
52 #ifdef Py_DEBUG
54 static void
55 s_pop(register stack *s)
57 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
62 #else /* !Py_DEBUG */
64 #define s_pop(s) (s)->s_top++
66 #endif
69 /* PARSER CREATION */
71 parser_state *
72 PyParser_New(grammar *g, int start)
74 parser_state *ps;
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83 ps->p_flags = 0;
84 #endif
85 ps->p_tree = PyNode_New(start);
86 if (ps->p_tree == NULL) {
87 PyMem_FREE(ps);
88 return NULL;
90 s_reset(&ps->p_stack);
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 return ps;
95 void
96 PyParser_Delete(parser_state *ps)
98 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps->p_tree);
101 PyMem_FREE(ps);
105 /* PARSER STACK OPERATIONS */
107 static int
108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
110 int err;
111 assert(!s_empty(s));
112 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113 if (err)
114 return err;
115 s->s_top->s_state = newstate;
116 return 0;
119 static int
120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
122 int err;
123 register node *n;
124 n = s->s_top->s_parent;
125 assert(!s_empty(s));
126 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
127 if (err)
128 return err;
129 s->s_top->s_state = newstate;
130 return s_push(s, d, CHILD(n, NCH(n)-1));
134 /* PARSER PROPER */
136 static int
137 classify(parser_state *ps, int type, char *str)
139 grammar *g = ps->p_grammar;
140 register int n = g->g_ll.ll_nlabels;
142 if (type == NAME) {
143 register char *s = str;
144 register label *l = g->g_ll.ll_label;
145 register int i;
146 for (i = n; i > 0; i--, l++) {
147 if (l->lb_type != NAME || l->lb_str == NULL ||
148 l->lb_str[0] != s[0] ||
149 strcmp(l->lb_str, s) != 0)
150 continue;
151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
152 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
153 if (s[0] == 'w' && strcmp(s, "with") == 0)
154 break; /* not a keyword yet */
155 else if (s[0] == 'a' && strcmp(s, "as") == 0)
156 break; /* not a keyword yet */
158 #endif
159 D(printf("It's a keyword\n"));
160 return n - i;
165 register label *l = g->g_ll.ll_label;
166 register int i;
167 for (i = n; i > 0; i--, l++) {
168 if (l->lb_type == type && l->lb_str == NULL) {
169 D(printf("It's a token we know\n"));
170 return n - i;
175 D(printf("Illegal token\n"));
176 return -1;
179 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
180 static void
181 future_hack(parser_state *ps)
183 node *n = ps->p_stack.s_top->s_parent;
184 node *ch;
185 int i;
187 /* from __future__ import ..., must have at least 4 children */
188 n = CHILD(n, 0);
189 if (NCH(n) < 4)
190 return;
191 ch = CHILD(n, 0);
192 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
193 return;
194 ch = CHILD(n, 1);
195 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
196 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
197 return;
198 for (i = 3; i < NCH(n); i += 2) {
199 /* XXX: assume we don't have parentheses in import:
200 from __future__ import (x, y, z)
202 ch = CHILD(n, i);
203 if (NCH(ch) == 1)
204 ch = CHILD(ch, 0);
205 if (NCH(ch) >= 1 && TYPE(CHILD(ch, 0)) == NAME &&
206 strcmp(STR(CHILD(ch, 0)), "with_statement") == 0) {
207 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
208 break;
212 #endif /* future keyword */
215 PyParser_AddToken(register parser_state *ps, register int type, char *str,
216 int lineno, int col_offset, int *expected_ret)
218 register int ilabel;
219 int err;
221 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
223 /* Find out which label this token is */
224 ilabel = classify(ps, type, str);
225 if (ilabel < 0)
226 return E_SYNTAX;
228 /* Loop until the token is shifted or an error occurred */
229 for (;;) {
230 /* Fetch the current dfa and state */
231 register dfa *d = ps->p_stack.s_top->s_dfa;
232 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
234 D(printf(" DFA '%s', state %d:",
235 d->d_name, ps->p_stack.s_top->s_state));
237 /* Check accelerator */
238 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
239 register int x = s->s_accel[ilabel - s->s_lower];
240 if (x != -1) {
241 if (x & (1<<7)) {
242 /* Push non-terminal */
243 int nt = (x >> 8) + NT_OFFSET;
244 int arrow = x & ((1<<7)-1);
245 dfa *d1 = PyGrammar_FindDFA(
246 ps->p_grammar, nt);
247 if ((err = push(&ps->p_stack, nt, d1,
248 arrow, lineno, col_offset)) > 0) {
249 D(printf(" MemError: push\n"));
250 return err;
252 D(printf(" Push ...\n"));
253 continue;
256 /* Shift the token */
257 if ((err = shift(&ps->p_stack, type, str,
258 x, lineno, col_offset)) > 0) {
259 D(printf(" MemError: shift.\n"));
260 return err;
262 D(printf(" Shift.\n"));
263 /* Pop while we are in an accept-only state */
264 while (s = &d->d_state
265 [ps->p_stack.s_top->s_state],
266 s->s_accept && s->s_narcs == 1) {
267 D(printf(" DFA '%s', state %d: "
268 "Direct pop.\n",
269 d->d_name,
270 ps->p_stack.s_top->s_state));
271 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
272 if (d->d_name[0] == 'i' &&
273 strcmp(d->d_name,
274 "import_stmt") == 0)
275 future_hack(ps);
276 #endif
277 s_pop(&ps->p_stack);
278 if (s_empty(&ps->p_stack)) {
279 D(printf(" ACCEPT.\n"));
280 return E_DONE;
282 d = ps->p_stack.s_top->s_dfa;
284 return E_OK;
288 if (s->s_accept) {
289 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
290 if (d->d_name[0] == 'i' &&
291 strcmp(d->d_name, "import_stmt") == 0)
292 future_hack(ps);
293 #endif
294 /* Pop this dfa and try again */
295 s_pop(&ps->p_stack);
296 D(printf(" Pop ...\n"));
297 if (s_empty(&ps->p_stack)) {
298 D(printf(" Error: bottom of stack.\n"));
299 return E_SYNTAX;
301 continue;
304 /* Stuck, report syntax error */
305 D(printf(" Error.\n"));
306 if (expected_ret) {
307 if (s->s_lower == s->s_upper - 1) {
308 /* Only one possible expected token */
309 *expected_ret = ps->p_grammar->
310 g_ll.ll_label[s->s_lower].lb_type;
312 else
313 *expected_ret = -1;
315 return E_SYNTAX;
320 #ifdef Py_DEBUG
322 /* DEBUG OUTPUT */
324 void
325 dumptree(grammar *g, node *n)
327 int i;
329 if (n == NULL)
330 printf("NIL");
331 else {
332 label l;
333 l.lb_type = TYPE(n);
334 l.lb_str = STR(n);
335 printf("%s", PyGrammar_LabelRepr(&l));
336 if (ISNONTERMINAL(TYPE(n))) {
337 printf("(");
338 for (i = 0; i < NCH(n); i++) {
339 if (i > 0)
340 printf(",");
341 dumptree(g, CHILD(n, i));
343 printf(")");
348 void
349 showtree(grammar *g, node *n)
351 int i;
353 if (n == NULL)
354 return;
355 if (ISNONTERMINAL(TYPE(n))) {
356 for (i = 0; i < NCH(n); i++)
357 showtree(g, CHILD(n, i));
359 else if (ISTERMINAL(TYPE(n))) {
360 printf("%s", _PyParser_TokenNames[TYPE(n)]);
361 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
362 printf("(%s)", STR(n));
363 printf(" ");
365 else
366 printf("? ");
369 void
370 printtree(parser_state *ps)
372 if (Py_DebugFlag) {
373 printf("Parse tree:\n");
374 dumptree(ps->p_grammar, ps->p_tree);
375 printf("\n");
376 printf("Tokens:\n");
377 showtree(ps->p_grammar, ps->p_tree);
378 printf("\n");
380 printf("Listing:\n");
381 PyNode_ListTree(ps->p_tree);
382 printf("\n");
385 #endif /* Py_DEBUG */
389 Description
390 -----------
392 The parser's interface is different than usual: the function addtoken()
393 must be called for each token in the input. This makes it possible to
394 turn it into an incremental parsing system later. The parsing system
395 constructs a parse tree as it goes.
397 A parsing rule is represented as a Deterministic Finite-state Automaton
398 (DFA). A node in a DFA represents a state of the parser; an arc represents
399 a transition. Transitions are either labeled with terminal symbols or
400 with non-terminals. When the parser decides to follow an arc labeled
401 with a non-terminal, it is invoked recursively with the DFA representing
402 the parsing rule for that as its initial state; when that DFA accepts,
403 the parser that invoked it continues. The parse tree constructed by the
404 recursively called parser is inserted as a child in the current parse tree.
406 The DFA's can be constructed automatically from a more conventional
407 language description. An extended LL(1) grammar (ELL(1)) is suitable.
408 Certain restrictions make the parser's life easier: rules that can produce
409 the empty string should be outlawed (there are other ways to put loops
410 or optional parts in the language). To avoid the need to construct
411 FIRST sets, we can require that all but the last alternative of a rule
412 (really: arc going out of a DFA's state) must begin with a terminal
413 symbol.
415 As an example, consider this grammar:
417 expr: term (OP term)*
418 term: CONSTANT | '(' expr ')'
420 The DFA corresponding to the rule for expr is:
422 ------->.---term-->.------->
425 \----OP----/
427 The parse tree generated for the input a+b is:
429 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))