Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
  17 #ifdef Py_DEBUG
  18 extern int Py_DebugFlag;
  19 #define D(x) if (!Py_DebugFlag); else x
  20 #else
  21 #define D(x)
  22 #endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32         s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40         register stackentry *top;
  41         if (s->s_top == s->s_base) {
  42                 fprintf(stderr, "s_push: parser stack overflow\n");
  43                 return E_NOMEM;
  44         }
  45         top = --s->s_top;
  46         top->s_dfa = d;
  47         top->s_parent = parent;
  48         top->s_state = 0;
  49         return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57         if (s_empty(s))
  58                 Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59         s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74         parser_state *ps;
  75
  76         if (!g->g_accel)
  77                 PyGrammar_AddAccelerators(g);
  78         ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
  79         if (ps == NULL)
  80                 return NULL;
  81         ps->p_grammar = g;
  82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
  83         ps->p_flags = 0;
  84 #endif
  85         ps->p_tree = PyNode_New(start);
  86         if (ps->p_tree == NULL) {
  87                 PyMem_FREE(ps);
  88                 return NULL;
  89         }
  90         s_reset(&ps->p_stack);
  91         (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92         return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98         /* NB If you want to save the parse tree,
  99            you must set p_tree to NULL before calling delparser! */
 100         PyNode_Free(ps->p_tree);
 101         PyMem_FREE(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 109 {
 110         int err;
 111         assert(!s_empty(s));
 112         err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
 113         if (err)
 114                 return err;
 115         s->s_top->s_state = newstate;
 116         return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
 121 {
 122         int err;
 123         register node *n;
 124         n = s->s_top->s_parent;
 125         assert(!s_empty(s));
 126         err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
 127         if (err)
 128                 return err;
 129         s->s_top->s_state = newstate;
 130         return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139         grammar *g = ps->p_grammar;
 140         register int n = g->g_ll.ll_nlabels;
 141
 142         if (type == NAME) {
 143                 register char *s = str;
 144                 register label *l = g->g_ll.ll_label;
 145                 register int i;
 146                 for (i = n; i > 0; i--, l++) {
 147                         if (l->lb_type != NAME || l->lb_str == NULL ||
 148                             l->lb_str[0] != s[0] ||
 149                             strcmp(l->lb_str, s) != 0)
 150                                 continue;
 151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 152                         if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
 153                                 if (s[0] == 'w' && strcmp(s, "with") == 0)
 154                                         break; /* not a keyword yet */
 155                                 else if (s[0] == 'a' && strcmp(s, "as") == 0)
 156                                         break; /* not a keyword yet */
 157                         }
 158 #endif
 159                         D(printf("It's a keyword\n"));
 160                         return n - i;
 161                 }
 162         }
 163
 164         {
 165                 register label *l = g->g_ll.ll_label;
 166                 register int i;
 167                 for (i = n; i > 0; i--, l++) {
 168                         if (l->lb_type == type && l->lb_str == NULL) {
 169                                 D(printf("It's a token we know\n"));
 170                                 return n - i;
 171                         }
 172                 }
 173         }
 174
 175         D(printf("Illegal token\n"));
 176         return -1;
 177 }
 178
 179 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 180 static void
 181 future_hack(parser_state *ps)
 182 {
 183         node *n = ps->p_stack.s_top->s_parent;
 184         node *ch;
 185         int i;
 186
 187         /* from __future__ import ..., must have at least 4 children */
 188         n = CHILD(n, 0);
 189         if (NCH(n) < 4)
 190                 return;
 191         ch = CHILD(n, 0);
 192         if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
 193                 return;
 194         ch = CHILD(n, 1);
 195         if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
 196             strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 197                 return;
 198         for (i = 3; i < NCH(n); i += 2) {
 199                 /* XXX: assume we don't have parentheses in import:
 200                         from __future__ import (x, y, z)
 201                 */
 202                 ch = CHILD(n, i);
 203                 if (NCH(ch) == 1)
 204                         ch = CHILD(ch, 0);
 205                 if (NCH(ch) >= 1 && TYPE(CHILD(ch, 0)) == NAME &&
 206                     strcmp(STR(CHILD(ch, 0)), "with_statement") == 0) {
 207                         ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
 208                         break;
 209                 }
 210         }
 211 }
 212 #endif /* future keyword */
 213
 214 int
 215 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 216                   int lineno, int col_offset, int *expected_ret)
 217 {
 218         register int ilabel;
 219         int err;
 220
 221         D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 222
 223         /* Find out which label this token is */
 224         ilabel = classify(ps, type, str);
 225         if (ilabel < 0)
 226                 return E_SYNTAX;
 227
 228         /* Loop until the token is shifted or an error occurred */
 229         for (;;) {
 230                 /* Fetch the current dfa and state */
 231                 register dfa *d = ps->p_stack.s_top->s_dfa;
 232                 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 233
 234                 D(printf(" DFA '%s', state %d:",
 235                         d->d_name, ps->p_stack.s_top->s_state));
 236
 237                 /* Check accelerator */
 238                 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 239                         register int x = s->s_accel[ilabel - s->s_lower];
 240                         if (x != -1) {
 241                                 if (x & (1<<7)) {
 242                                         /* Push non-terminal */
 243                                         int nt = (x >> 8) + NT_OFFSET;
 244                                         int arrow = x & ((1<<7)-1);
 245                                         dfa *d1 = PyGrammar_FindDFA(
 246                                                 ps->p_grammar, nt);
 247                                         if ((err = push(&ps->p_stack, nt, d1,
 248                                                 arrow, lineno, col_offset)) > 0) {
 249                                                 D(printf(" MemError: push\n"));
 250                                                 return err;
 251                                         }
 252                                         D(printf(" Push ...\n"));
 253                                         continue;
 254                                 }
 255
 256                                 /* Shift the token */
 257                                 if ((err = shift(&ps->p_stack, type, str,
 258                                                 x, lineno, col_offset)) > 0) {
 259                                         D(printf(" MemError: shift.\n"));
 260                                         return err;
 261                                 }
 262                                 D(printf(" Shift.\n"));
 263                                 /* Pop while we are in an accept-only state */
 264                                 while (s = &d->d_state
 265                                                 [ps->p_stack.s_top->s_state],
 266                                         s->s_accept && s->s_narcs == 1) {
 267                                         D(printf("  DFA '%s', state %d: "
 268                                                  "Direct pop.\n",
 269                                                  d->d_name,
 270                                                  ps->p_stack.s_top->s_state));
 271 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 272                                         if (d->d_name[0] == 'i' &&
 273                                             strcmp(d->d_name,
 274                                                    "import_stmt") == 0)
 275                                                 future_hack(ps);
 276 #endif
 277                                         s_pop(&ps->p_stack);
 278                                         if (s_empty(&ps->p_stack)) {
 279                                                 D(printf("  ACCEPT.\n"));
 280                                                 return E_DONE;
 281                                         }
 282                                         d = ps->p_stack.s_top->s_dfa;
 283                                 }
 284                                 return E_OK;
 285                         }
 286                 }
 287
 288                 if (s->s_accept) {
 289 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 290                         if (d->d_name[0] == 'i' &&
 291                             strcmp(d->d_name, "import_stmt") == 0)
 292                                 future_hack(ps);
 293 #endif
 294                         /* Pop this dfa and try again */
 295                         s_pop(&ps->p_stack);
 296                         D(printf(" Pop ...\n"));
 297                         if (s_empty(&ps->p_stack)) {
 298                                 D(printf(" Error: bottom of stack.\n"));
 299                                 return E_SYNTAX;
 300                         }
 301                         continue;
 302                 }
 303
 304                 /* Stuck, report syntax error */
 305                 D(printf(" Error.\n"));
 306                 if (expected_ret) {
 307                         if (s->s_lower == s->s_upper - 1) {
 308                                 /* Only one possible expected token */
 309                                 *expected_ret = ps->p_grammar->
 310                                     g_ll.ll_label[s->s_lower].lb_type;
 311                         }
 312                         else
 313                                 *expected_ret = -1;
 314                 }
 315                 return E_SYNTAX;
 316         }
 317 }
 318
 319
 320 #ifdef Py_DEBUG
 321
 322 /* DEBUG OUTPUT */
 323
 324 void
 325 dumptree(grammar *g, node *n)
 326 {
 327         int i;
 328
 329         if (n == NULL)
 330                 printf("NIL");
 331         else {
 332                 label l;
 333                 l.lb_type = TYPE(n);
 334                 l.lb_str = STR(n);
 335                 printf("%s", PyGrammar_LabelRepr(&l));
 336                 if (ISNONTERMINAL(TYPE(n))) {
 337                         printf("(");
 338                         for (i = 0; i < NCH(n); i++) {
 339                                 if (i > 0)
 340                                         printf(",");
 341                                 dumptree(g, CHILD(n, i));
 342                         }
 343                         printf(")");
 344                 }
 345         }
 346 }
 347
 348 void
 349 showtree(grammar *g, node *n)
 350 {
 351         int i;
 352
 353         if (n == NULL)
 354                 return;
 355         if (ISNONTERMINAL(TYPE(n))) {
 356                 for (i = 0; i < NCH(n); i++)
 357                         showtree(g, CHILD(n, i));
 358         }
 359         else if (ISTERMINAL(TYPE(n))) {
 360                 printf("%s", _PyParser_TokenNames[TYPE(n)]);
 361                 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 362                         printf("(%s)", STR(n));
 363                 printf(" ");
 364         }
 365         else
 366                 printf("? ");
 367 }
 368
 369 void
 370 printtree(parser_state *ps)
 371 {
 372         if (Py_DebugFlag) {
 373                 printf("Parse tree:\n");
 374                 dumptree(ps->p_grammar, ps->p_tree);
 375                 printf("\n");
 376                 printf("Tokens:\n");
 377                 showtree(ps->p_grammar, ps->p_tree);
 378                 printf("\n");
 379         }
 380         printf("Listing:\n");
 381         PyNode_ListTree(ps->p_tree);
 382         printf("\n");
 383 }
 384
 385 #endif /* Py_DEBUG */
 386
 387 /*
 388
 389 Description
 390 -----------
 391
 392 The parser's interface is different than usual: the function addtoken()
 393 must be called for each token in the input.  This makes it possible to
 394 turn it into an incremental parsing system later.  The parsing system
 395 constructs a parse tree as it goes.
 396
 397 A parsing rule is represented as a Deterministic Finite-state Automaton
 398 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 399 a transition.  Transitions are either labeled with terminal symbols or
 400 with non-terminals.  When the parser decides to follow an arc labeled
 401 with a non-terminal, it is invoked recursively with the DFA representing
 402 the parsing rule for that as its initial state; when that DFA accepts,
 403 the parser that invoked it continues.  The parse tree constructed by the
 404 recursively called parser is inserted as a child in the current parse tree.
 405
 406 The DFA's can be constructed automatically from a more conventional
 407 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 408 Certain restrictions make the parser's life easier: rules that can produce
 409 the empty string should be outlawed (there are other ways to put loops
 410 or optional parts in the language).  To avoid the need to construct
 411 FIRST sets, we can require that all but the last alternative of a rule
 412 (really: arc going out of a DFA's state) must begin with a terminal
 413 symbol.
 414
 415 As an example, consider this grammar:
 416
 417 expr:   term (OP term)*
 418 term:   CONSTANT | '(' expr ')'
 419
 420 The DFA corresponding to the rule for expr is:
 421
 422 ------->.---term-->.------->
 423         ^          |
 424         |          |
 425         \----OP----/
 426
 427 The parse tree generated for the input a+b is:
 428
 429 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
 430
 431 */