Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
  17 #ifdef Py_DEBUG
  18 extern int Py_DebugFlag;
  19 #define D(x) if (!Py_DebugFlag); else x
  20 #else
  21 #define D(x)
  22 #endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32         s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40         register stackentry *top;
  41         if (s->s_top == s->s_base) {
  42                 fprintf(stderr, "s_push: parser stack overflow\n");
  43                 return E_NOMEM;
  44         }
  45         top = --s->s_top;
  46         top->s_dfa = d;
  47         top->s_parent = parent;
  48         top->s_state = 0;
  49         return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57         if (s_empty(s))
  58                 Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59         s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74         parser_state *ps;
  75
  76         if (!g->g_accel)
  77                 PyGrammar_AddAccelerators(g);
  78         ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
  79         if (ps == NULL)
  80                 return NULL;
  81         ps->p_grammar = g;
  82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
  83         ps->p_flags = 0;
  84 #endif
  85         ps->p_tree = PyNode_New(start);
  86         if (ps->p_tree == NULL) {
  87                 PyMem_FREE(ps);
  88                 return NULL;
  89         }
  90         s_reset(&ps->p_stack);
  91         (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92         return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98         /* NB If you want to save the parse tree,
  99            you must set p_tree to NULL before calling delparser! */
 100         PyNode_Free(ps->p_tree);
 101         PyMem_FREE(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 109 {
 110         int err;
 111         assert(!s_empty(s));
 112         err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
 113         if (err)
 114                 return err;
 115         s->s_top->s_state = newstate;
 116         return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
 121 {
 122         int err;
 123         register node *n;
 124         n = s->s_top->s_parent;
 125         assert(!s_empty(s));
 126         err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
 127         if (err)
 128                 return err;
 129         s->s_top->s_state = newstate;
 130         return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139         grammar *g = ps->p_grammar;
 140         register int n = g->g_ll.ll_nlabels;
 141
 142         if (type == NAME) {
 143                 register char *s = str;
 144                 register label *l = g->g_ll.ll_label;
 145                 register int i;
 146                 for (i = n; i > 0; i--, l++) {
 147                         if (l->lb_type != NAME || l->lb_str == NULL ||
 148                             l->lb_str[0] != s[0] ||
 149                             strcmp(l->lb_str, s) != 0)
 150                                 continue;
 151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 152                         if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
 153                                 if (s[0] == 'w' && strcmp(s, "with") == 0)
 154                                         break; /* not a keyword yet */
 155                                 else if (s[0] == 'a' && strcmp(s, "as") == 0)
 156                                         break; /* not a keyword yet */
 157                         }
 158 #endif
 159                         D(printf("It's a keyword\n"));
 160                         return n - i;
 161                 }
 162         }
 163
 164         {
 165                 register label *l = g->g_ll.ll_label;
 166                 register int i;
 167                 for (i = n; i > 0; i--, l++) {
 168                         if (l->lb_type == type && l->lb_str == NULL) {
 169                                 D(printf("It's a token we know\n"));
 170                                 return n - i;
 171                         }
 172                 }
 173         }
 174
 175         D(printf("Illegal token\n"));
 176         return -1;
 177 }
 178
 179 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 180 static void
 181 future_hack(parser_state *ps)
 182 {
 183         node *n = ps->p_stack.s_top->s_parent;
 184         node *ch, *cch;
 185         int i;
 186
 187         /* from __future__ import ..., must have at least 4 children */
 188         n = CHILD(n, 0);
 189         if (NCH(n) < 4)
 190                 return;
 191         ch = CHILD(n, 0);
 192         if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
 193                 return;
 194         ch = CHILD(n, 1);
 195         if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
 196             strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 197                 return;
 198         ch = CHILD(n, 3);
 199         /* ch can be a star, a parenthesis or import_as_names */
 200         if (TYPE(ch) == STAR)
 201                 return;
 202         if (TYPE(ch) == LPAR)
 203                 ch = CHILD(n, 4);
 204
 205         for (i = 0; i < NCH(ch); i += 2) {
 206                 cch = CHILD(ch, i);
 207                 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME &&
 208                     strcmp(STR(CHILD(cch, 0)), "with_statement") == 0) {
 209                         ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
 210                         break;
 211                 }
 212         }
 213 }
 214 #endif /* future keyword */
 215
 216 int
 217 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 218                   int lineno, int col_offset, int *expected_ret)
 219 {
 220         register int ilabel;
 221         int err;
 222
 223         D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 224
 225         /* Find out which label this token is */
 226         ilabel = classify(ps, type, str);
 227         if (ilabel < 0)
 228                 return E_SYNTAX;
 229
 230         /* Loop until the token is shifted or an error occurred */
 231         for (;;) {
 232                 /* Fetch the current dfa and state */
 233                 register dfa *d = ps->p_stack.s_top->s_dfa;
 234                 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 235
 236                 D(printf(" DFA '%s', state %d:",
 237                         d->d_name, ps->p_stack.s_top->s_state));
 238
 239                 /* Check accelerator */
 240                 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 241                         register int x = s->s_accel[ilabel - s->s_lower];
 242                         if (x != -1) {
 243                                 if (x & (1<<7)) {
 244                                         /* Push non-terminal */
 245                                         int nt = (x >> 8) + NT_OFFSET;
 246                                         int arrow = x & ((1<<7)-1);
 247                                         dfa *d1 = PyGrammar_FindDFA(
 248                                                 ps->p_grammar, nt);
 249                                         if ((err = push(&ps->p_stack, nt, d1,
 250                                                 arrow, lineno, col_offset)) > 0) {
 251                                                 D(printf(" MemError: push\n"));
 252                                                 return err;
 253                                         }
 254                                         D(printf(" Push ...\n"));
 255                                         continue;
 256                                 }
 257
 258                                 /* Shift the token */
 259                                 if ((err = shift(&ps->p_stack, type, str,
 260                                                 x, lineno, col_offset)) > 0) {
 261                                         D(printf(" MemError: shift.\n"));
 262                                         return err;
 263                                 }
 264                                 D(printf(" Shift.\n"));
 265                                 /* Pop while we are in an accept-only state */
 266                                 while (s = &d->d_state
 267                                                 [ps->p_stack.s_top->s_state],
 268                                         s->s_accept && s->s_narcs == 1) {
 269                                         D(printf("  DFA '%s', state %d: "
 270                                                  "Direct pop.\n",
 271                                                  d->d_name,
 272                                                  ps->p_stack.s_top->s_state));
 273 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 274                                         if (d->d_name[0] == 'i' &&
 275                                             strcmp(d->d_name,
 276                                                    "import_stmt") == 0)
 277                                                 future_hack(ps);
 278 #endif
 279                                         s_pop(&ps->p_stack);
 280                                         if (s_empty(&ps->p_stack)) {
 281                                                 D(printf("  ACCEPT.\n"));
 282                                                 return E_DONE;
 283                                         }
 284                                         d = ps->p_stack.s_top->s_dfa;
 285                                 }
 286                                 return E_OK;
 287                         }
 288                 }
 289
 290                 if (s->s_accept) {
 291 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 292                         if (d->d_name[0] == 'i' &&
 293                             strcmp(d->d_name, "import_stmt") == 0)
 294                                 future_hack(ps);
 295 #endif
 296                         /* Pop this dfa and try again */
 297                         s_pop(&ps->p_stack);
 298                         D(printf(" Pop ...\n"));
 299                         if (s_empty(&ps->p_stack)) {
 300                                 D(printf(" Error: bottom of stack.\n"));
 301                                 return E_SYNTAX;
 302                         }
 303                         continue;
 304                 }
 305
 306                 /* Stuck, report syntax error */
 307                 D(printf(" Error.\n"));
 308                 if (expected_ret) {
 309                         if (s->s_lower == s->s_upper - 1) {
 310                                 /* Only one possible expected token */
 311                                 *expected_ret = ps->p_grammar->
 312                                     g_ll.ll_label[s->s_lower].lb_type;
 313                         }
 314                         else
 315                                 *expected_ret = -1;
 316                 }
 317                 return E_SYNTAX;
 318         }
 319 }
 320
 321
 322 #ifdef Py_DEBUG
 323
 324 /* DEBUG OUTPUT */
 325
 326 void
 327 dumptree(grammar *g, node *n)
 328 {
 329         int i;
 330
 331         if (n == NULL)
 332                 printf("NIL");
 333         else {
 334                 label l;
 335                 l.lb_type = TYPE(n);
 336                 l.lb_str = STR(n);
 337                 printf("%s", PyGrammar_LabelRepr(&l));
 338                 if (ISNONTERMINAL(TYPE(n))) {
 339                         printf("(");
 340                         for (i = 0; i < NCH(n); i++) {
 341                                 if (i > 0)
 342                                         printf(",");
 343                                 dumptree(g, CHILD(n, i));
 344                         }
 345                         printf(")");
 346                 }
 347         }
 348 }
 349
 350 void
 351 showtree(grammar *g, node *n)
 352 {
 353         int i;
 354
 355         if (n == NULL)
 356                 return;
 357         if (ISNONTERMINAL(TYPE(n))) {
 358                 for (i = 0; i < NCH(n); i++)
 359                         showtree(g, CHILD(n, i));
 360         }
 361         else if (ISTERMINAL(TYPE(n))) {
 362                 printf("%s", _PyParser_TokenNames[TYPE(n)]);
 363                 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 364                         printf("(%s)", STR(n));
 365                 printf(" ");
 366         }
 367         else
 368                 printf("? ");
 369 }
 370
 371 void
 372 printtree(parser_state *ps)
 373 {
 374         if (Py_DebugFlag) {
 375                 printf("Parse tree:\n");
 376                 dumptree(ps->p_grammar, ps->p_tree);
 377                 printf("\n");
 378                 printf("Tokens:\n");
 379                 showtree(ps->p_grammar, ps->p_tree);
 380                 printf("\n");
 381         }
 382         printf("Listing:\n");
 383         PyNode_ListTree(ps->p_tree);
 384         printf("\n");
 385 }
 386
 387 #endif /* Py_DEBUG */
 388
 389 /*
 390
 391 Description
 392 -----------
 393
  394 The parser's interface is different than usual: the function
  395 PyParser_AddToken() must be called for each token in the input.  This
  396 makes it possible to turn it into an incremental parsing system later.
  397 The parsing system constructs a parse tree as it goes.
 398
 399 A parsing rule is represented as a Deterministic Finite-state Automaton
 400 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 401 a transition.  Transitions are either labeled with terminal symbols or
 402 with non-terminals.  When the parser decides to follow an arc labeled
 403 with a non-terminal, it is invoked recursively with the DFA representing
 404 the parsing rule for that as its initial state; when that DFA accepts,
 405 the parser that invoked it continues.  The parse tree constructed by the
 406 recursively called parser is inserted as a child in the current parse tree.
 407
 408 The DFA's can be constructed automatically from a more conventional
 409 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 410 Certain restrictions make the parser's life easier: rules that can produce
 411 the empty string should be outlawed (there are other ways to put loops
 412 or optional parts in the language).  To avoid the need to construct
 413 FIRST sets, we can require that all but the last alternative of a rule
 414 (really: arc going out of a DFA's state) must begin with a terminal
 415 symbol.
 416
 417 As an example, consider this grammar:
 418
 419 expr:   term (OP term)*
 420 term:   CONSTANT | '(' expr ')'
 421
 422 The DFA corresponding to the rule for expr is:
 423
 424 ------->.---term-->.------->
 425         ^          |
 426         |          |
 427         \----OP----/
 428
 429 The parse tree generated for the input a+b is:
 430
 431 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
 432
 433 */