Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
  17 #ifdef Py_DEBUG
  18 extern int Py_DebugFlag;
  19 #define D(x) if (!Py_DebugFlag); else x
  20 #else
  21 #define D(x)
  22 #endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32         s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40         register stackentry *top;
  41         if (s->s_top == s->s_base) {
  42                 fprintf(stderr, "s_push: parser stack overflow\n");
  43                 return E_NOMEM;
  44         }
  45         top = --s->s_top;
  46         top->s_dfa = d;
  47         top->s_parent = parent;
  48         top->s_state = 0;
  49         return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57         if (s_empty(s))
  58                 Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59         s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74         parser_state *ps;
  75
  76         if (!g->g_accel)
  77                 PyGrammar_AddAccelerators(g);
  78         ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
  79         if (ps == NULL)
  80                 return NULL;
  81         ps->p_grammar = g;
  82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
  83         ps->p_flags = 0;
  84 #endif
  85         ps->p_tree = PyNode_New(start);
  86         if (ps->p_tree == NULL) {
  87                 PyMem_FREE(ps);
  88                 return NULL;
  89         }
  90         s_reset(&ps->p_stack);
  91         (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92         return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98         /* NB If you want to save the parse tree,
  99            you must set p_tree to NULL before calling delparser! */
 100         PyNode_Free(ps->p_tree);
 101         PyMem_FREE(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 109 {
 110         int err;
 111         assert(!s_empty(s));
 112         err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
 113         if (err)
 114                 return err;
 115         s->s_top->s_state = newstate;
 116         return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
 121 {
 122         int err;
 123         register node *n;
 124         n = s->s_top->s_parent;
 125         assert(!s_empty(s));
 126         err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
 127         if (err)
 128                 return err;
 129         s->s_top->s_state = newstate;
 130         return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139         grammar *g = ps->p_grammar;
 140         register int n = g->g_ll.ll_nlabels;
 141
 142         if (type == NAME) {
 143                 register char *s = str;
 144                 register label *l = g->g_ll.ll_label;
 145                 register int i;
 146                 for (i = n; i > 0; i--, l++) {
 147                         if (l->lb_type != NAME || l->lb_str == NULL ||
 148                             l->lb_str[0] != s[0] ||
 149                             strcmp(l->lb_str, s) != 0)
 150                                 continue;
 151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 152                         if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
 153                             s[0] == 'p' && strcmp(s, "print") == 0) {
 154                                 break; /* no longer a keyword */
 155                         }
 156 #endif
 157                         D(printf("It's a keyword\n"));
 158                         return n - i;
 159                 }
 160         }
 161
 162         {
 163                 register label *l = g->g_ll.ll_label;
 164                 register int i;
 165                 for (i = n; i > 0; i--, l++) {
 166                         if (l->lb_type == type && l->lb_str == NULL) {
 167                                 D(printf("It's a token we know\n"));
 168                                 return n - i;
 169                         }
 170                 }
 171         }
 172
 173         D(printf("Illegal token\n"));
 174         return -1;
 175 }
 176
 177 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 178 static void
 179 future_hack(parser_state *ps)
 180 {
 181         node *n = ps->p_stack.s_top->s_parent;
 182         node *ch, *cch;
 183         int i;
 184
 185         /* from __future__ import ..., must have at least 4 children */
 186         n = CHILD(n, 0);
 187         if (NCH(n) < 4)
 188                 return;
 189         ch = CHILD(n, 0);
 190         if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
 191                 return;
 192         ch = CHILD(n, 1);
 193         if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
 194             strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 195                 return;
 196         ch = CHILD(n, 3);
 197         /* ch can be a star, a parenthesis or import_as_names */
 198         if (TYPE(ch) == STAR)
 199                 return;
 200         if (TYPE(ch) == LPAR)
 201                 ch = CHILD(n, 4);
 202
 203         for (i = 0; i < NCH(ch); i += 2) {
 204                 cch = CHILD(ch, i);
 205                 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
 206                         char *str_ch = STR(CHILD(cch, 0));
 207                         if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
 208                                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
 209                                 break;
 210                         } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
 211                                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
 212                                 break;
 213                         } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
 214                                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
 215                                 break;
 216                         }
 217                 }
 218         }
 219 }
 220 #endif /* future keyword */
 221
 222 int
 223 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 224                   int lineno, int col_offset, int *expected_ret)
 225 {
 226         register int ilabel;
 227         int err;
 228
 229         D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 230
 231         /* Find out which label this token is */
 232         ilabel = classify(ps, type, str);
 233         if (ilabel < 0)
 234                 return E_SYNTAX;
 235
 236         /* Loop until the token is shifted or an error occurred */
 237         for (;;) {
 238                 /* Fetch the current dfa and state */
 239                 register dfa *d = ps->p_stack.s_top->s_dfa;
 240                 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 241
 242                 D(printf(" DFA '%s', state %d:",
 243                         d->d_name, ps->p_stack.s_top->s_state));
 244
 245                 /* Check accelerator */
 246                 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 247                         register int x = s->s_accel[ilabel - s->s_lower];
 248                         if (x != -1) {
 249                                 if (x & (1<<7)) {
 250                                         /* Push non-terminal */
 251                                         int nt = (x >> 8) + NT_OFFSET;
 252                                         int arrow = x & ((1<<7)-1);
 253                                         dfa *d1 = PyGrammar_FindDFA(
 254                                                 ps->p_grammar, nt);
 255                                         if ((err = push(&ps->p_stack, nt, d1,
 256                                                 arrow, lineno, col_offset)) > 0) {
 257                                                 D(printf(" MemError: push\n"));
 258                                                 return err;
 259                                         }
 260                                         D(printf(" Push ...\n"));
 261                                         continue;
 262                                 }
 263
 264                                 /* Shift the token */
 265                                 if ((err = shift(&ps->p_stack, type, str,
 266                                                 x, lineno, col_offset)) > 0) {
 267                                         D(printf(" MemError: shift.\n"));
 268                                         return err;
 269                                 }
 270                                 D(printf(" Shift.\n"));
 271                                 /* Pop while we are in an accept-only state */
 272                                 while (s = &d->d_state
 273                                                 [ps->p_stack.s_top->s_state],
 274                                         s->s_accept && s->s_narcs == 1) {
 275                                         D(printf("  DFA '%s', state %d: "
 276                                                  "Direct pop.\n",
 277                                                  d->d_name,
 278                                                  ps->p_stack.s_top->s_state));
 279 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 280                                         if (d->d_name[0] == 'i' &&
 281                                             strcmp(d->d_name,
 282                                                    "import_stmt") == 0)
 283                                                 future_hack(ps);
 284 #endif
 285                                         s_pop(&ps->p_stack);
 286                                         if (s_empty(&ps->p_stack)) {
 287                                                 D(printf("  ACCEPT.\n"));
 288                                                 return E_DONE;
 289                                         }
 290                                         d = ps->p_stack.s_top->s_dfa;
 291                                 }
 292                                 return E_OK;
 293                         }
 294                 }
 295
 296                 if (s->s_accept) {
 297 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 298                         if (d->d_name[0] == 'i' &&
 299                             strcmp(d->d_name, "import_stmt") == 0)
 300                                 future_hack(ps);
 301 #endif
 302                         /* Pop this dfa and try again */
 303                         s_pop(&ps->p_stack);
 304                         D(printf(" Pop ...\n"));
 305                         if (s_empty(&ps->p_stack)) {
 306                                 D(printf(" Error: bottom of stack.\n"));
 307                                 return E_SYNTAX;
 308                         }
 309                         continue;
 310                 }
 311
 312                 /* Stuck, report syntax error */
 313                 D(printf(" Error.\n"));
 314                 if (expected_ret) {
 315                         if (s->s_lower == s->s_upper - 1) {
 316                                 /* Only one possible expected token */
 317                                 *expected_ret = ps->p_grammar->
 318                                     g_ll.ll_label[s->s_lower].lb_type;
 319                         }
 320                         else
 321                                 *expected_ret = -1;
 322                 }
 323                 return E_SYNTAX;
 324         }
 325 }
 326
 327
 328 #ifdef Py_DEBUG
 329
 330 /* DEBUG OUTPUT */
 331
 332 void
 333 dumptree(grammar *g, node *n)
 334 {
 335         int i;
 336
 337         if (n == NULL)
 338                 printf("NIL");
 339         else {
 340                 label l;
 341                 l.lb_type = TYPE(n);
 342                 l.lb_str = STR(n);
 343                 printf("%s", PyGrammar_LabelRepr(&l));
 344                 if (ISNONTERMINAL(TYPE(n))) {
 345                         printf("(");
 346                         for (i = 0; i < NCH(n); i++) {
 347                                 if (i > 0)
 348                                         printf(",");
 349                                 dumptree(g, CHILD(n, i));
 350                         }
 351                         printf(")");
 352                 }
 353         }
 354 }
 355
 356 void
 357 showtree(grammar *g, node *n)
 358 {
 359         int i;
 360
 361         if (n == NULL)
 362                 return;
 363         if (ISNONTERMINAL(TYPE(n))) {
 364                 for (i = 0; i < NCH(n); i++)
 365                         showtree(g, CHILD(n, i));
 366         }
 367         else if (ISTERMINAL(TYPE(n))) {
 368                 printf("%s", _PyParser_TokenNames[TYPE(n)]);
 369                 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 370                         printf("(%s)", STR(n));
 371                 printf(" ");
 372         }
 373         else
 374                 printf("? ");
 375 }
 376
 377 void
 378 printtree(parser_state *ps)
 379 {
 380         if (Py_DebugFlag) {
 381                 printf("Parse tree:\n");
 382                 dumptree(ps->p_grammar, ps->p_tree);
 383                 printf("\n");
 384                 printf("Tokens:\n");
 385                 showtree(ps->p_grammar, ps->p_tree);
 386                 printf("\n");
 387         }
 388         printf("Listing:\n");
 389         PyNode_ListTree(ps->p_tree);
 390         printf("\n");
 391 }
 392
 393 #endif /* Py_DEBUG */
 394
 395 /*
 396
 397 Description
 398 -----------
 399
The parser's interface differs from the usual one: the function
PyParser_AddToken() must be called once for each token in the input.  This
makes it possible to turn it into an incremental parsing system later.  The
parsing system constructs a parse tree as it goes.
 404
 405 A parsing rule is represented as a Deterministic Finite-state Automaton
 406 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 407 a transition.  Transitions are either labeled with terminal symbols or
 408 with non-terminals.  When the parser decides to follow an arc labeled
 409 with a non-terminal, it is invoked recursively with the DFA representing
 410 the parsing rule for that as its initial state; when that DFA accepts,
 411 the parser that invoked it continues.  The parse tree constructed by the
 412 recursively called parser is inserted as a child in the current parse tree.
 413
 414 The DFA's can be constructed automatically from a more conventional
 415 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 416 Certain restrictions make the parser's life easier: rules that can produce
 417 the empty string should be outlawed (there are other ways to put loops
 418 or optional parts in the language).  To avoid the need to construct
 419 FIRST sets, we can require that all but the last alternative of a rule
 420 (really: arc going out of a DFA's state) must begin with a terminal
 421 symbol.
 422
 423 As an example, consider this grammar:
 424
 425 expr:   term (OP term)*
 426 term:   CONSTANT | '(' expr ')'
 427
 428 The DFA corresponding to the rule for expr is:
 429
 430 ------->.---term-->.------->
 431         ^          |
 432         |          |
 433         \----OP----/
 434
The parse tree generated for the input a+b (a and b being CONSTANT tokens) is:

(expr: (term: (CONSTANT: a)), (OP: +), (term: (CONSTANT: b)))
 438
 439 */