Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
  17 #ifdef Py_DEBUG
  18 extern int Py_DebugFlag;
  19 #define D(x) if (!Py_DebugFlag); else x
  20 #else
  21 #define D(x)
  22 #endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32         s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40         register stackentry *top;
  41         if (s->s_top == s->s_base) {
  42                 fprintf(stderr, "s_push: parser stack overflow\n");
  43                 return E_NOMEM;
  44         }
  45         top = --s->s_top;
  46         top->s_dfa = d;
  47         top->s_parent = parent;
  48         top->s_state = 0;
  49         return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57         if (s_empty(s))
  58                 Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59         s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74         parser_state *ps;
  75
  76         if (!g->g_accel)
  77                 PyGrammar_AddAccelerators(g);
  78         ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
  79         if (ps == NULL)
  80                 return NULL;
  81         ps->p_grammar = g;
  82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
  83         ps->p_flags = 0;
  84 #endif
  85         ps->p_tree = PyNode_New(start);
  86         if (ps->p_tree == NULL) {
  87                 PyMem_FREE(ps);
  88                 return NULL;
  89         }
  90         s_reset(&ps->p_stack);
  91         (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92         return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98         /* NB If you want to save the parse tree,
  99            you must set p_tree to NULL before calling delparser! */
 100         PyNode_Free(ps->p_tree);
 101         PyMem_FREE(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
 109 {
 110         int err;
 111         assert(!s_empty(s));
 112         err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
 113         if (err)
 114                 return err;
 115         s->s_top->s_state = newstate;
 116         return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
 121 {
 122         int err;
 123         register node *n;
 124         n = s->s_top->s_parent;
 125         assert(!s_empty(s));
 126         err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
 127         if (err)
 128                 return err;
 129         s->s_top->s_state = newstate;
 130         return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139         grammar *g = ps->p_grammar;
 140         register int n = g->g_ll.ll_nlabels;
 141
 142         if (type == NAME) {
 143                 register char *s = str;
 144                 register label *l = g->g_ll.ll_label;
 145                 register int i;
 146                 for (i = n; i > 0; i--, l++) {
 147                         if (l->lb_type != NAME || l->lb_str == NULL ||
 148                             l->lb_str[0] != s[0] ||
 149                             strcmp(l->lb_str, s) != 0)
 150                                 continue;
 151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 152                         if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
 153                             s[0] == 'p' && strcmp(s, "print") == 0) {
 154                                 break; /* no longer a keyword */
 155                         }
 156 #endif
 157                         D(printf("It's a keyword\n"));
 158                         return n - i;
 159                 }
 160         }
 161
 162         {
 163                 register label *l = g->g_ll.ll_label;
 164                 register int i;
 165                 for (i = n; i > 0; i--, l++) {
 166                         if (l->lb_type == type && l->lb_str == NULL) {
 167                                 D(printf("It's a token we know\n"));
 168                                 return n - i;
 169                         }
 170                 }
 171         }
 172
 173         D(printf("Illegal token\n"));
 174         return -1;
 175 }
 176
 177 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 178 static void
 179 future_hack(parser_state *ps)
 180 {
 181         node *n = ps->p_stack.s_top->s_parent;
 182         node *ch, *cch;
 183         int i;
 184
 185         /* from __future__ import ..., must have at least 4 children */
 186         n = CHILD(n, 0);
 187         if (NCH(n) < 4)
 188                 return;
 189         ch = CHILD(n, 0);
 190         if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
 191                 return;
 192         ch = CHILD(n, 1);
 193         if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
 194             strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 195                 return;
 196         ch = CHILD(n, 3);
 197         /* ch can be a star, a parenthesis or import_as_names */
 198         if (TYPE(ch) == STAR)
 199                 return;
 200         if (TYPE(ch) == LPAR)
 201                 ch = CHILD(n, 4);
 202
 203         for (i = 0; i < NCH(ch); i += 2) {
 204                 cch = CHILD(ch, i);
 205                 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
 206                         char *str_ch = STR(CHILD(cch, 0));
 207                         if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
 208                                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
 209                         } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
 210                                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
 211                         } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
 212                                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
 213                         }
 214                 }
 215         }
 216 }
 217 #endif /* future keyword */
 218
 219 int
 220 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 221                   int lineno, int col_offset, int *expected_ret)
 222 {
 223         register int ilabel;
 224         int err;
 225
 226         D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 227
 228         /* Find out which label this token is */
 229         ilabel = classify(ps, type, str);
 230         if (ilabel < 0)
 231                 return E_SYNTAX;
 232
 233         /* Loop until the token is shifted or an error occurred */
 234         for (;;) {
 235                 /* Fetch the current dfa and state */
 236                 register dfa *d = ps->p_stack.s_top->s_dfa;
 237                 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 238
 239                 D(printf(" DFA '%s', state %d:",
 240                         d->d_name, ps->p_stack.s_top->s_state));
 241
 242                 /* Check accelerator */
 243                 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 244                         register int x = s->s_accel[ilabel - s->s_lower];
 245                         if (x != -1) {
 246                                 if (x & (1<<7)) {
 247                                         /* Push non-terminal */
 248                                         int nt = (x >> 8) + NT_OFFSET;
 249                                         int arrow = x & ((1<<7)-1);
 250                                         dfa *d1 = PyGrammar_FindDFA(
 251                                                 ps->p_grammar, nt);
 252                                         if ((err = push(&ps->p_stack, nt, d1,
 253                                                 arrow, lineno, col_offset)) > 0) {
 254                                                 D(printf(" MemError: push\n"));
 255                                                 return err;
 256                                         }
 257                                         D(printf(" Push ...\n"));
 258                                         continue;
 259                                 }
 260
 261                                 /* Shift the token */
 262                                 if ((err = shift(&ps->p_stack, type, str,
 263                                                 x, lineno, col_offset)) > 0) {
 264                                         D(printf(" MemError: shift.\n"));
 265                                         return err;
 266                                 }
 267                                 D(printf(" Shift.\n"));
 268                                 /* Pop while we are in an accept-only state */
 269                                 while (s = &d->d_state
 270                                                 [ps->p_stack.s_top->s_state],
 271                                         s->s_accept && s->s_narcs == 1) {
 272                                         D(printf("  DFA '%s', state %d: "
 273                                                  "Direct pop.\n",
 274                                                  d->d_name,
 275                                                  ps->p_stack.s_top->s_state));
 276 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 277                                         if (d->d_name[0] == 'i' &&
 278                                             strcmp(d->d_name,
 279                                                    "import_stmt") == 0)
 280                                                 future_hack(ps);
 281 #endif
 282                                         s_pop(&ps->p_stack);
 283                                         if (s_empty(&ps->p_stack)) {
 284                                                 D(printf("  ACCEPT.\n"));
 285                                                 return E_DONE;
 286                                         }
 287                                         d = ps->p_stack.s_top->s_dfa;
 288                                 }
 289                                 return E_OK;
 290                         }
 291                 }
 292
 293                 if (s->s_accept) {
 294 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
 295                         if (d->d_name[0] == 'i' &&
 296                             strcmp(d->d_name, "import_stmt") == 0)
 297                                 future_hack(ps);
 298 #endif
 299                         /* Pop this dfa and try again */
 300                         s_pop(&ps->p_stack);
 301                         D(printf(" Pop ...\n"));
 302                         if (s_empty(&ps->p_stack)) {
 303                                 D(printf(" Error: bottom of stack.\n"));
 304                                 return E_SYNTAX;
 305                         }
 306                         continue;
 307                 }
 308
 309                 /* Stuck, report syntax error */
 310                 D(printf(" Error.\n"));
 311                 if (expected_ret) {
 312                         if (s->s_lower == s->s_upper - 1) {
 313                                 /* Only one possible expected token */
 314                                 *expected_ret = ps->p_grammar->
 315                                     g_ll.ll_label[s->s_lower].lb_type;
 316                         }
 317                         else
 318                                 *expected_ret = -1;
 319                 }
 320                 return E_SYNTAX;
 321         }
 322 }
 323
 324
 325 #ifdef Py_DEBUG
 326
 327 /* DEBUG OUTPUT */
 328
 329 void
 330 dumptree(grammar *g, node *n)
 331 {
 332         int i;
 333
 334         if (n == NULL)
 335                 printf("NIL");
 336         else {
 337                 label l;
 338                 l.lb_type = TYPE(n);
 339                 l.lb_str = STR(n);
 340                 printf("%s", PyGrammar_LabelRepr(&l));
 341                 if (ISNONTERMINAL(TYPE(n))) {
 342                         printf("(");
 343                         for (i = 0; i < NCH(n); i++) {
 344                                 if (i > 0)
 345                                         printf(",");
 346                                 dumptree(g, CHILD(n, i));
 347                         }
 348                         printf(")");
 349                 }
 350         }
 351 }
 352
 353 void
 354 showtree(grammar *g, node *n)
 355 {
 356         int i;
 357
 358         if (n == NULL)
 359                 return;
 360         if (ISNONTERMINAL(TYPE(n))) {
 361                 for (i = 0; i < NCH(n); i++)
 362                         showtree(g, CHILD(n, i));
 363         }
 364         else if (ISTERMINAL(TYPE(n))) {
 365                 printf("%s", _PyParser_TokenNames[TYPE(n)]);
 366                 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 367                         printf("(%s)", STR(n));
 368                 printf(" ");
 369         }
 370         else
 371                 printf("? ");
 372 }
 373
 374 void
 375 printtree(parser_state *ps)
 376 {
 377         if (Py_DebugFlag) {
 378                 printf("Parse tree:\n");
 379                 dumptree(ps->p_grammar, ps->p_tree);
 380                 printf("\n");
 381                 printf("Tokens:\n");
 382                 showtree(ps->p_grammar, ps->p_tree);
 383                 printf("\n");
 384         }
 385         printf("Listing:\n");
 386         PyNode_ListTree(ps->p_tree);
 387         printf("\n");
 388 }
 389
 390 #endif /* Py_DEBUG */
 391
 392 /*
 393
 394 Description
 395 -----------
 396
 397 The parser's interface is different than usual: the function addtoken()
 398 must be called for each token in the input.  This makes it possible to
 399 turn it into an incremental parsing system later.  The parsing system
 400 constructs a parse tree as it goes.
 401
 402 A parsing rule is represented as a Deterministic Finite-state Automaton
 403 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 404 a transition.  Transitions are either labeled with terminal symbols or
 405 with non-terminals.  When the parser decides to follow an arc labeled
 406 with a non-terminal, it is invoked recursively with the DFA representing
 407 the parsing rule for that as its initial state; when that DFA accepts,
 408 the parser that invoked it continues.  The parse tree constructed by the
 409 recursively called parser is inserted as a child in the current parse tree.
 410
 411 The DFA's can be constructed automatically from a more conventional
 412 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 413 Certain restrictions make the parser's life easier: rules that can produce
 414 the empty string should be outlawed (there are other ways to put loops
 415 or optional parts in the language).  To avoid the need to construct
 416 FIRST sets, we can require that all but the last alternative of a rule
 417 (really: arc going out of a DFA's state) must begin with a terminal
 418 symbol.
 419
 420 As an example, consider this grammar:
 421
 422 expr:   term (OP term)*
 423 term:   CONSTANT | '(' expr ')'
 424
 425 The DFA corresponding to the rule for expr is:
 426
 427 ------->.---term-->.------->
 428         ^          |
 429         |          |
 430         \----OP----/
 431
 432 The parse tree generated for the input a+b is:
 433
 434 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
 435
 436 */