Merged revisions 85328 via svnmerge from
[python/dscho.git] / Parser / parser.c
blobb505fe0ecd337ca40ef1afe3d88ad7da4576e390
2 /* Parser implementation */
4 /* For a description, see the comments at end of this file */
6 /* XXX To do: error recovery */
8 #include "Python.h"
9 #include "pgenheaders.h"
10 #include "token.h"
11 #include "grammar.h"
12 #include "node.h"
13 #include "parser.h"
14 #include "errcode.h"
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
25 /* STACK DATA TYPE */
27 static void s_reset(stack *);
29 static void
30 s_reset(stack *s)
32 s->s_top = &s->s_base[MAXSTACK];
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
37 static int
38 s_push(register stack *s, dfa *d, node *parent)
40 register stackentry *top;
41 if (s->s_top == s->s_base) {
42 fprintf(stderr, "s_push: parser stack overflow\n");
43 return E_NOMEM;
45 top = --s->s_top;
46 top->s_dfa = d;
47 top->s_parent = parent;
48 top->s_state = 0;
49 return 0;
52 #ifdef Py_DEBUG
54 static void
55 s_pop(register stack *s)
57 if (s_empty(s))
58 Py_FatalError("s_pop: parser stack underflow -- FATAL");
59 s->s_top++;
62 #else /* !Py_DEBUG */
64 #define s_pop(s) (s)->s_top++
66 #endif
69 /* PARSER CREATION */
71 parser_state *
72 PyParser_New(grammar *g, int start)
74 parser_state *ps;
76 if (!g->g_accel)
77 PyGrammar_AddAccelerators(g);
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79 if (ps == NULL)
80 return NULL;
81 ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83 ps->p_flags = 0;
84 #endif
85 ps->p_tree = PyNode_New(start);
86 if (ps->p_tree == NULL) {
87 PyMem_FREE(ps);
88 return NULL;
90 s_reset(&ps->p_stack);
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92 return ps;
95 void
96 PyParser_Delete(parser_state *ps)
98 /* NB If you want to save the parse tree,
99 you must set p_tree to NULL before calling delparser! */
100 PyNode_Free(ps->p_tree);
101 PyMem_FREE(ps);
105 /* PARSER STACK OPERATIONS */
107 static int
108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
110 int err;
111 assert(!s_empty(s));
112 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113 if (err)
114 return err;
115 s->s_top->s_state = newstate;
116 return 0;
119 static int
120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
122 int err;
123 register node *n;
124 n = s->s_top->s_parent;
125 assert(!s_empty(s));
126 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
127 if (err)
128 return err;
129 s->s_top->s_state = newstate;
130 return s_push(s, d, CHILD(n, NCH(n)-1));
134 /* PARSER PROPER */
136 static int
137 classify(parser_state *ps, int type, char *str)
139 grammar *g = ps->p_grammar;
140 register int n = g->g_ll.ll_nlabels;
142 if (type == NAME) {
143 register char *s = str;
144 register label *l = g->g_ll.ll_label;
145 register int i;
146 for (i = n; i > 0; i--, l++) {
147 if (l->lb_type != NAME || l->lb_str == NULL ||
148 l->lb_str[0] != s[0] ||
149 strcmp(l->lb_str, s) != 0)
150 continue;
151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
152 #if 0
153 /* Leaving this in as an example */
154 if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
155 if (s[0] == 'w' && strcmp(s, "with") == 0)
156 break; /* not a keyword yet */
157 else if (s[0] == 'a' && strcmp(s, "as") == 0)
158 break; /* not a keyword yet */
160 #endif
161 #endif
162 D(printf("It's a keyword\n"));
163 return n - i;
168 register label *l = g->g_ll.ll_label;
169 register int i;
170 for (i = n; i > 0; i--, l++) {
171 if (l->lb_type == type && l->lb_str == NULL) {
172 D(printf("It's a token we know\n"));
173 return n - i;
178 D(printf("Illegal token\n"));
179 return -1;
182 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
183 #if 0
184 /* Leaving this in as an example */
185 static void
186 future_hack(parser_state *ps)
188 node *n = ps->p_stack.s_top->s_parent;
189 node *ch, *cch;
190 int i;
192 /* from __future__ import ..., must have at least 4 children */
193 n = CHILD(n, 0);
194 if (NCH(n) < 4)
195 return;
196 ch = CHILD(n, 0);
197 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
198 return;
199 ch = CHILD(n, 1);
200 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
201 strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
202 return;
203 ch = CHILD(n, 3);
204 /* ch can be a star, a parenthesis or import_as_names */
205 if (TYPE(ch) == STAR)
206 return;
207 if (TYPE(ch) == LPAR)
208 ch = CHILD(n, 4);
210 for (i = 0; i < NCH(ch); i += 2) {
211 cch = CHILD(ch, i);
212 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
213 char *str_ch = STR(CHILD(cch, 0));
214 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
215 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
216 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
217 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
218 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
219 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
224 #endif
225 #endif /* future keyword */
228 PyParser_AddToken(register parser_state *ps, register int type, char *str,
229 int lineno, int col_offset, int *expected_ret)
231 register int ilabel;
232 int err;
234 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
236 /* Find out which label this token is */
237 ilabel = classify(ps, type, str);
238 if (ilabel < 0)
239 return E_SYNTAX;
241 /* Loop until the token is shifted or an error occurred */
242 for (;;) {
243 /* Fetch the current dfa and state */
244 register dfa *d = ps->p_stack.s_top->s_dfa;
245 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
247 D(printf(" DFA '%s', state %d:",
248 d->d_name, ps->p_stack.s_top->s_state));
250 /* Check accelerator */
251 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
252 register int x = s->s_accel[ilabel - s->s_lower];
253 if (x != -1) {
254 if (x & (1<<7)) {
255 /* Push non-terminal */
256 int nt = (x >> 8) + NT_OFFSET;
257 int arrow = x & ((1<<7)-1);
258 dfa *d1 = PyGrammar_FindDFA(
259 ps->p_grammar, nt);
260 if ((err = push(&ps->p_stack, nt, d1,
261 arrow, lineno, col_offset)) > 0) {
262 D(printf(" MemError: push\n"));
263 return err;
265 D(printf(" Push ...\n"));
266 continue;
269 /* Shift the token */
270 if ((err = shift(&ps->p_stack, type, str,
271 x, lineno, col_offset)) > 0) {
272 D(printf(" MemError: shift.\n"));
273 return err;
275 D(printf(" Shift.\n"));
276 /* Pop while we are in an accept-only state */
277 while (s = &d->d_state
278 [ps->p_stack.s_top->s_state],
279 s->s_accept && s->s_narcs == 1) {
280 D(printf(" DFA '%s', state %d: "
281 "Direct pop.\n",
282 d->d_name,
283 ps->p_stack.s_top->s_state));
284 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
285 #if 0
286 if (d->d_name[0] == 'i' &&
287 strcmp(d->d_name,
288 "import_stmt") == 0)
289 future_hack(ps);
290 #endif
291 #endif
292 s_pop(&ps->p_stack);
293 if (s_empty(&ps->p_stack)) {
294 D(printf(" ACCEPT.\n"));
295 return E_DONE;
297 d = ps->p_stack.s_top->s_dfa;
299 return E_OK;
303 if (s->s_accept) {
304 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
305 #if 0
306 if (d->d_name[0] == 'i' &&
307 strcmp(d->d_name, "import_stmt") == 0)
308 future_hack(ps);
309 #endif
310 #endif
311 /* Pop this dfa and try again */
312 s_pop(&ps->p_stack);
313 D(printf(" Pop ...\n"));
314 if (s_empty(&ps->p_stack)) {
315 D(printf(" Error: bottom of stack.\n"));
316 return E_SYNTAX;
318 continue;
321 /* Stuck, report syntax error */
322 D(printf(" Error.\n"));
323 if (expected_ret) {
324 if (s->s_lower == s->s_upper - 1) {
325 /* Only one possible expected token */
326 *expected_ret = ps->p_grammar->
327 g_ll.ll_label[s->s_lower].lb_type;
329 else
330 *expected_ret = -1;
332 return E_SYNTAX;
337 #ifdef Py_DEBUG
339 /* DEBUG OUTPUT */
341 void
342 dumptree(grammar *g, node *n)
344 int i;
346 if (n == NULL)
347 printf("NIL");
348 else {
349 label l;
350 l.lb_type = TYPE(n);
351 l.lb_str = STR(n);
352 printf("%s", PyGrammar_LabelRepr(&l));
353 if (ISNONTERMINAL(TYPE(n))) {
354 printf("(");
355 for (i = 0; i < NCH(n); i++) {
356 if (i > 0)
357 printf(",");
358 dumptree(g, CHILD(n, i));
360 printf(")");
365 void
366 showtree(grammar *g, node *n)
368 int i;
370 if (n == NULL)
371 return;
372 if (ISNONTERMINAL(TYPE(n))) {
373 for (i = 0; i < NCH(n); i++)
374 showtree(g, CHILD(n, i));
376 else if (ISTERMINAL(TYPE(n))) {
377 printf("%s", _PyParser_TokenNames[TYPE(n)]);
378 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
379 printf("(%s)", STR(n));
380 printf(" ");
382 else
383 printf("? ");
386 void
387 printtree(parser_state *ps)
389 if (Py_DebugFlag) {
390 printf("Parse tree:\n");
391 dumptree(ps->p_grammar, ps->p_tree);
392 printf("\n");
393 printf("Tokens:\n");
394 showtree(ps->p_grammar, ps->p_tree);
395 printf("\n");
397 printf("Listing:\n");
398 PyNode_ListTree(ps->p_tree);
399 printf("\n");
402 #endif /* Py_DEBUG */
406 Description
407 -----------
409 The parser's interface is different than usual: the function addtoken()
410 must be called for each token in the input. This makes it possible to
411 turn it into an incremental parsing system later. The parsing system
412 constructs a parse tree as it goes.
414 A parsing rule is represented as a Deterministic Finite-state Automaton
415 (DFA). A node in a DFA represents a state of the parser; an arc represents
416 a transition. Transitions are either labeled with terminal symbols or
417 with non-terminals. When the parser decides to follow an arc labeled
418 with a non-terminal, it is invoked recursively with the DFA representing
419 the parsing rule for that as its initial state; when that DFA accepts,
420 the parser that invoked it continues. The parse tree constructed by the
421 recursively called parser is inserted as a child in the current parse tree.
423 The DFA's can be constructed automatically from a more conventional
424 language description. An extended LL(1) grammar (ELL(1)) is suitable.
425 Certain restrictions make the parser's life easier: rules that can produce
426 the empty string should be outlawed (there are other ways to put loops
427 or optional parts in the language). To avoid the need to construct
428 FIRST sets, we can require that all but the last alternative of a rule
429 (really: arc going out of a DFA's state) must begin with a terminal
430 symbol.
432 As an example, consider this grammar:
434 expr: term (OP term)*
435 term: CONSTANT | '(' expr ')'
437 The DFA corresponding to the rule for expr is:
439 ------->.---term-->.------->
442 \----OP----/
444 The parse tree generated for the input a+b is:
446 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))