Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
  17 #ifdef Py_DEBUG
  18 extern int Py_DebugFlag;
  19 #define D(x) if (!Py_DebugFlag); else x
  20 #else
  21 #define D(x)
  22 #endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32     s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40     register stackentry *top;
  41     if (s->s_top == s->s_base) {
  42         fprintf(stderr, "s_push: parser stack overflow\n");
  43         return E_NOMEM;
  44     }
  45     top = --s->s_top;
  46     top->s_dfa = d;
  47     top->s_parent = parent;
  48     top->s_state = 0;
  49     return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57     if (s_empty(s))
  58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59     s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74     parser_state *ps;
  75
  76     if (!g->g_accel)
  77         PyGrammar_AddAccelerators(g);
  78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
  79     if (ps == NULL)
  80         return NULL;
  81     ps->p_grammar = g;
  82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
  83     ps->p_flags = 0;
  84 #endif
  85     ps->p_tree = PyNode_New(start);
  86     if (ps->p_tree == NULL) {
  87         PyMem_FREE(ps);
  88         return NULL;
  89     }
  90     s_reset(&ps->p_stack);
  91     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92     return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98     /* NB If you want to save the parse tree,
  99        you must set p_tree to NULL before calling delparser! */
 100     PyNode_Free(ps->p_tree);
 101     PyMem_FREE(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 109 {
 110     int err;
 111     assert(!s_empty(s));
 112     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
 113     if (err)
 114         return err;
 115     s->s_top->s_state = newstate;
 116     return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
 121 {
 122     int err;
 123     register node *n;
 124     n = s->s_top->s_parent;
 125     assert(!s_empty(s));
 126     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
 127     if (err)
 128         return err;
 129     s->s_top->s_state = newstate;
 130     return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139     grammar *g = ps->p_grammar;
 140     register int n = g->g_ll.ll_nlabels;
 141
 142     if (type == NAME) {
 143         register char *s = str;
 144         register label *l = g->g_ll.ll_label;
 145         register int i;
 146         for (i = n; i > 0; i--, l++) {
 147             if (l->lb_type != NAME || l->lb_str == NULL ||
 148                 l->lb_str[0] != s[0] ||
 149                 strcmp(l->lb_str, s) != 0)
 150                 continue;
 151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 152 #if 0
 153             /* Leaving this in as an example */
 154             if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
 155                 if (s[0] == 'w' && strcmp(s, "with") == 0)
 156                     break; /* not a keyword yet */
 157                 else if (s[0] == 'a' && strcmp(s, "as") == 0)
 158                     break; /* not a keyword yet */
 159             }
 160 #endif
 161 #endif
 162             D(printf("It's a keyword\n"));
 163             return n - i;
 164         }
 165     }
 166
 167     {
 168         register label *l = g->g_ll.ll_label;
 169         register int i;
 170         for (i = n; i > 0; i--, l++) {
 171             if (l->lb_type == type && l->lb_str == NULL) {
 172                 D(printf("It's a token we know\n"));
 173                 return n - i;
 174             }
 175         }
 176     }
 177
 178     D(printf("Illegal token\n"));
 179     return -1;
 180 }
 181
 182 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 183 #if 0
 184 /* Leaving this in as an example */
 185 static void
 186 future_hack(parser_state *ps)
 187 {
 188     node *n = ps->p_stack.s_top->s_parent;
 189     node *ch, *cch;
 190     int i;
 191
 192     /* from __future__ import ..., must have at least 4 children */
 193     n = CHILD(n, 0);
 194     if (NCH(n) < 4)
 195         return;
 196     ch = CHILD(n, 0);
 197     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
 198         return;
 199     ch = CHILD(n, 1);
 200     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
 201         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 202         return;
 203     ch = CHILD(n, 3);
 204     /* ch can be a star, a parenthesis or import_as_names */
 205     if (TYPE(ch) == STAR)
 206         return;
 207     if (TYPE(ch) == LPAR)
 208         ch = CHILD(n, 4);
 209
 210     for (i = 0; i < NCH(ch); i += 2) {
 211         cch = CHILD(ch, i);
 212         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
 213             char *str_ch = STR(CHILD(cch, 0));
 214             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
 215                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
 216             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
 217                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
 218             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
 219                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
 220             }
 221         }
 222     }
 223 }
 224 #endif
 225 #endif /* future keyword */
 226
 227 int
 228 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 229                   int lineno, int col_offset, int *expected_ret)
 230 {
 231     register int ilabel;
 232     int err;
 233
 234     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 235
 236     /* Find out which label this token is */
 237     ilabel = classify(ps, type, str);
 238     if (ilabel < 0)
 239         return E_SYNTAX;
 240
 241     /* Loop until the token is shifted or an error occurred */
 242     for (;;) {
 243         /* Fetch the current dfa and state */
 244         register dfa *d = ps->p_stack.s_top->s_dfa;
 245         register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 246
 247         D(printf(" DFA '%s', state %d:",
 248             d->d_name, ps->p_stack.s_top->s_state));
 249
 250         /* Check accelerator */
 251         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 252             register int x = s->s_accel[ilabel - s->s_lower];
 253             if (x != -1) {
 254                 if (x & (1<<7)) {
 255                     /* Push non-terminal */
 256                     int nt = (x >> 8) + NT_OFFSET;
 257                     int arrow = x & ((1<<7)-1);
 258                     dfa *d1 = PyGrammar_FindDFA(
 259                         ps->p_grammar, nt);
 260                     if ((err = push(&ps->p_stack, nt, d1,
 261                         arrow, lineno, col_offset)) > 0) {
 262                         D(printf(" MemError: push\n"));
 263                         return err;
 264                     }
 265                     D(printf(" Push ...\n"));
 266                     continue;
 267                 }
 268
 269                 /* Shift the token */
 270                 if ((err = shift(&ps->p_stack, type, str,
 271                                 x, lineno, col_offset)) > 0) {
 272                     D(printf(" MemError: shift.\n"));
 273                     return err;
 274                 }
 275                 D(printf(" Shift.\n"));
 276                 /* Pop while we are in an accept-only state */
 277                 while (s = &d->d_state
 278                                 [ps->p_stack.s_top->s_state],
 279                     s->s_accept && s->s_narcs == 1) {
 280                     D(printf("  DFA '%s', state %d: "
 281                              "Direct pop.\n",
 282                              d->d_name,
 283                              ps->p_stack.s_top->s_state));
 284 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 285 #if 0
 286                     if (d->d_name[0] == 'i' &&
 287                         strcmp(d->d_name,
 288                            "import_stmt") == 0)
 289                         future_hack(ps);
 290 #endif
 291 #endif
 292                     s_pop(&ps->p_stack);
 293                     if (s_empty(&ps->p_stack)) {
 294                         D(printf("  ACCEPT.\n"));
 295                         return E_DONE;
 296                     }
 297                     d = ps->p_stack.s_top->s_dfa;
 298                 }
 299                 return E_OK;
 300             }
 301         }
 302
 303         if (s->s_accept) {
 304 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 305 #if 0
 306             if (d->d_name[0] == 'i' &&
 307                 strcmp(d->d_name, "import_stmt") == 0)
 308                 future_hack(ps);
 309 #endif
 310 #endif
 311             /* Pop this dfa and try again */
 312             s_pop(&ps->p_stack);
 313             D(printf(" Pop ...\n"));
 314             if (s_empty(&ps->p_stack)) {
 315                 D(printf(" Error: bottom of stack.\n"));
 316                 return E_SYNTAX;
 317             }
 318             continue;
 319         }
 320
 321         /* Stuck, report syntax error */
 322         D(printf(" Error.\n"));
 323         if (expected_ret) {
 324             if (s->s_lower == s->s_upper - 1) {
 325                 /* Only one possible expected token */
 326                 *expected_ret = ps->p_grammar->
 327                     g_ll.ll_label[s->s_lower].lb_type;
 328             }
 329             else
 330                 *expected_ret = -1;
 331         }
 332         return E_SYNTAX;
 333     }
 334 }
 335
 336
 337 #ifdef Py_DEBUG
 338
 339 /* DEBUG OUTPUT */
 340
 341 void
 342 dumptree(grammar *g, node *n)
 343 {
 344     int i;
 345
 346     if (n == NULL)
 347         printf("NIL");
 348     else {
 349         label l;
 350         l.lb_type = TYPE(n);
 351         l.lb_str = STR(n);
 352         printf("%s", PyGrammar_LabelRepr(&l));
 353         if (ISNONTERMINAL(TYPE(n))) {
 354             printf("(");
 355             for (i = 0; i < NCH(n); i++) {
 356                 if (i > 0)
 357                     printf(",");
 358                 dumptree(g, CHILD(n, i));
 359             }
 360             printf(")");
 361         }
 362     }
 363 }
 364
 365 void
 366 showtree(grammar *g, node *n)
 367 {
 368     int i;
 369
 370     if (n == NULL)
 371         return;
 372     if (ISNONTERMINAL(TYPE(n))) {
 373         for (i = 0; i < NCH(n); i++)
 374             showtree(g, CHILD(n, i));
 375     }
 376     else if (ISTERMINAL(TYPE(n))) {
 377         printf("%s", _PyParser_TokenNames[TYPE(n)]);
 378         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 379             printf("(%s)", STR(n));
 380         printf(" ");
 381     }
 382     else
 383         printf("? ");
 384 }
 385
 386 void
 387 printtree(parser_state *ps)
 388 {
 389     if (Py_DebugFlag) {
 390         printf("Parse tree:\n");
 391         dumptree(ps->p_grammar, ps->p_tree);
 392         printf("\n");
 393         printf("Tokens:\n");
 394         showtree(ps->p_grammar, ps->p_tree);
 395         printf("\n");
 396     }
 397     printf("Listing:\n");
 398     PyNode_ListTree(ps->p_tree);
 399     printf("\n");
 400 }
 401
 402 #endif /* Py_DEBUG */
 403
 404 /*
 405
 406 Description
 407 -----------
 408
 409 The parser's interface is different than usual: the function addtoken()
 410 must be called for each token in the input.  This makes it possible to
 411 turn it into an incremental parsing system later.  The parsing system
 412 constructs a parse tree as it goes.
 413
 414 A parsing rule is represented as a Deterministic Finite-state Automaton
 415 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 416 a transition.  Transitions are either labeled with terminal symbols or
 417 with non-terminals.  When the parser decides to follow an arc labeled
 418 with a non-terminal, it is invoked recursively with the DFA representing
 419 the parsing rule for that as its initial state; when that DFA accepts,
 420 the parser that invoked it continues.  The parse tree constructed by the
 421 recursively called parser is inserted as a child in the current parse tree.
 422
 423 The DFA's can be constructed automatically from a more conventional
 424 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 425 Certain restrictions make the parser's life easier: rules that can produce
 426 the empty string should be outlawed (there are other ways to put loops
 427 or optional parts in the language).  To avoid the need to construct
 428 FIRST sets, we can require that all but the last alternative of a rule
 429 (really: arc going out of a DFA's state) must begin with a terminal
 430 symbol.
 431
 432 As an example, consider this grammar:
 433
 434 expr:   term (OP term)*
 435 term:   CONSTANT | '(' expr ')'
 436
 437 The DFA corresponding to the rule for expr is:
 438
 439 ------->.---term-->.------->
 440     ^          |
 441     |          |
 442     \----OP----/
 443
 444 The parse tree generated for the input a+b is:
 445
 446 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
 447
 448 */