Parser/parser.c

   1
   2 /* Parser implementation */
   3
   4 /* For a description, see the comments at end of this file */
   5
   6 /* XXX To do: error recovery */
   7
   8 #include "Python.h"
   9 #include "pgenheaders.h"
  10 #include "token.h"
  11 #include "grammar.h"
  12 #include "node.h"
  13 #include "parser.h"
  14 #include "errcode.h"
  15
  16
/* Debug tracing helper.  In Py_DEBUG builds D(x) executes statement x only
   when Py_DebugFlag is non-zero; in all other builds D(x) expands to
   nothing, so the traced statement is not compiled at all. */
#ifdef Py_DEBUG
extern int Py_DebugFlag;
/* Spelled "if (!flag); else x" so the macro's own 'if' already owns an
   'else'; an 'else' written after D(...) by the caller cannot bind to it. */
#define D(x) if (!Py_DebugFlag); else x
#else
#define D(x)
#endif
  23
  24
  25 /* STACK DATA TYPE */
  26
  27 static void s_reset(stack *);
  28
  29 static void
  30 s_reset(stack *s)
  31 {
  32         s->s_top = &s->s_base[MAXSTACK];
  33 }
  34
  35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
  36
  37 static int
  38 s_push(register stack *s, dfa *d, node *parent)
  39 {
  40         register stackentry *top;
  41         if (s->s_top == s->s_base) {
  42                 fprintf(stderr, "s_push: parser stack overflow\n");
  43                 return E_NOMEM;
  44         }
  45         top = --s->s_top;
  46         top->s_dfa = d;
  47         top->s_parent = parent;
  48         top->s_state = 0;
  49         return 0;
  50 }
  51
  52 #ifdef Py_DEBUG
  53
  54 static void
  55 s_pop(register stack *s)
  56 {
  57         if (s_empty(s))
  58                 Py_FatalError("s_pop: parser stack underflow -- FATAL");
  59         s->s_top++;
  60 }
  61
  62 #else /* !Py_DEBUG */
  63
  64 #define s_pop(s) (s)->s_top++
  65
  66 #endif
  67
  68
  69 /* PARSER CREATION */
  70
  71 parser_state *
  72 PyParser_New(grammar *g, int start)
  73 {
  74         parser_state *ps;
  75
  76         if (!g->g_accel)
  77                 PyGrammar_AddAccelerators(g);
  78         ps = PyMem_NEW(parser_state, 1);
  79         if (ps == NULL)
  80                 return NULL;
  81         ps->p_grammar = g;
  82 #if 0 /* future keyword */
  83         ps->p_generators = 0;
  84 #endif
  85         ps->p_tree = PyNode_New(start);
  86         if (ps->p_tree == NULL) {
  87                 PyMem_DEL(ps);
  88                 return NULL;
  89         }
  90         s_reset(&ps->p_stack);
  91         (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
  92         return ps;
  93 }
  94
  95 void
  96 PyParser_Delete(parser_state *ps)
  97 {
  98         /* NB If you want to save the parse tree,
  99            you must set p_tree to NULL before calling delparser! */
 100         PyNode_Free(ps->p_tree);
 101         PyMem_DEL(ps);
 102 }
 103
 104
 105 /* PARSER STACK OPERATIONS */
 106
 107 static int
 108 shift(register stack *s, int type, char *str, int newstate, int lineno)
 109 {
 110         int err;
 111         assert(!s_empty(s));
 112         err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno);
 113         if (err)
 114                 return err;
 115         s->s_top->s_state = newstate;
 116         return 0;
 117 }
 118
 119 static int
 120 push(register stack *s, int type, dfa *d, int newstate, int lineno)
 121 {
 122         int err;
 123         register node *n;
 124         n = s->s_top->s_parent;
 125         assert(!s_empty(s));
 126         err = PyNode_AddChild(n, type, (char *)NULL, lineno);
 127         if (err)
 128                 return err;
 129         s->s_top->s_state = newstate;
 130         return s_push(s, d, CHILD(n, NCH(n)-1));
 131 }
 132
 133
 134 /* PARSER PROPER */
 135
 136 static int
 137 classify(parser_state *ps, int type, char *str)
 138 {
 139         grammar *g = ps->p_grammar;
 140         register int n = g->g_ll.ll_nlabels;
 141
 142         if (type == NAME) {
 143                 register char *s = str;
 144                 register label *l = g->g_ll.ll_label;
 145                 register int i;
 146                 for (i = n; i > 0; i--, l++) {
 147                         if (l->lb_type == NAME && l->lb_str != NULL &&
 148                                         l->lb_str[0] == s[0] &&
 149                                         strcmp(l->lb_str, s) == 0) {
 150 #if 0 /* future keyword */
 151                                 if (!ps->p_generators &&
 152                                     s[0] == 'y' &&
 153                                     strcmp(s, "yield") == 0)
 154                                         break; /* not a keyword */
 155 #endif
 156                                 D(printf("It's a keyword\n"));
 157                                 return n - i;
 158                         }
 159                 }
 160         }
 161
 162         {
 163                 register label *l = g->g_ll.ll_label;
 164                 register int i;
 165                 for (i = n; i > 0; i--, l++) {
 166                         if (l->lb_type == type && l->lb_str == NULL) {
 167                                 D(printf("It's a token we know\n"));
 168                                 return n - i;
 169                         }
 170                 }
 171         }
 172
 173         D(printf("Illegal token\n"));
 174         return -1;
 175 }
 176
 177 #if 0 /* future keyword */
 178 static void
 179 future_hack(parser_state *ps)
 180 {
 181         node *n = ps->p_stack.s_top->s_parent;
 182         node *ch;
 183         int i;
 184
 185         if (strcmp(STR(CHILD(n, 0)), "from") != 0)
 186                 return;
 187         ch = CHILD(n, 1);
 188         if (strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
 189                 return;
 190         for (i = 3; i < NCH(n); i += 2) {
 191                 ch = CHILD(n, i);
 192                 if (NCH(ch) >= 1 && TYPE(CHILD(ch, 0)) == NAME &&
 193                     strcmp(STR(CHILD(ch, 0)), "generators") == 0) {
 194                         ps->p_generators = 1;
 195                         break;
 196                 }
 197         }
 198 }
 199 #endif /* future keyword */
 200
 201 int
 202 PyParser_AddToken(register parser_state *ps, register int type, char *str,
 203                   int lineno, int *expected_ret)
 204 {
 205         register int ilabel;
 206         int err;
 207
 208         D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
 209
 210         /* Find out which label this token is */
 211         ilabel = classify(ps, type, str);
 212         if (ilabel < 0)
 213                 return E_SYNTAX;
 214
 215         /* Loop until the token is shifted or an error occurred */
 216         for (;;) {
 217                 /* Fetch the current dfa and state */
 218                 register dfa *d = ps->p_stack.s_top->s_dfa;
 219                 register state *s = &d->d_state[ps->p_stack.s_top->s_state];
 220
 221                 D(printf(" DFA '%s', state %d:",
 222                         d->d_name, ps->p_stack.s_top->s_state));
 223
 224                 /* Check accelerator */
 225                 if (s->s_lower <= ilabel && ilabel < s->s_upper) {
 226                         register int x = s->s_accel[ilabel - s->s_lower];
 227                         if (x != -1) {
 228                                 if (x & (1<<7)) {
 229                                         /* Push non-terminal */
 230                                         int nt = (x >> 8) + NT_OFFSET;
 231                                         int arrow = x & ((1<<7)-1);
 232                                         dfa *d1 = PyGrammar_FindDFA(
 233                                                 ps->p_grammar, nt);
 234                                         if ((err = push(&ps->p_stack, nt, d1,
 235                                                 arrow, lineno)) > 0) {
 236                                                 D(printf(" MemError: push\n"));
 237                                                 return err;
 238                                         }
 239                                         D(printf(" Push ...\n"));
 240                                         continue;
 241                                 }
 242
 243                                 /* Shift the token */
 244                                 if ((err = shift(&ps->p_stack, type, str,
 245                                                 x, lineno)) > 0) {
 246                                         D(printf(" MemError: shift.\n"));
 247                                         return err;
 248                                 }
 249                                 D(printf(" Shift.\n"));
 250                                 /* Pop while we are in an accept-only state */
 251                                 while (s = &d->d_state
 252                                                 [ps->p_stack.s_top->s_state],
 253                                         s->s_accept && s->s_narcs == 1) {
 254                                         D(printf("  DFA '%s', state %d: "
 255                                                  "Direct pop.\n",
 256                                                  d->d_name,
 257                                                  ps->p_stack.s_top->s_state));
 258 #if 0 /* future keyword */
 259                                         if (d->d_name[0] == 'i' &&
 260                                             strcmp(d->d_name,
 261                                                    "import_stmt") == 0)
 262                                                 future_hack(ps);
 263 #endif
 264                                         s_pop(&ps->p_stack);
 265                                         if (s_empty(&ps->p_stack)) {
 266                                                 D(printf("  ACCEPT.\n"));
 267                                                 return E_DONE;
 268                                         }
 269                                         d = ps->p_stack.s_top->s_dfa;
 270                                 }
 271                                 return E_OK;
 272                         }
 273                 }
 274
 275                 if (s->s_accept) {
 276 #if 0 /* future keyword */
 277                         if (d->d_name[0] == 'i' &&
 278                             strcmp(d->d_name, "import_stmt") == 0)
 279                                 future_hack(ps);
 280 #endif
 281                         /* Pop this dfa and try again */
 282                         s_pop(&ps->p_stack);
 283                         D(printf(" Pop ...\n"));
 284                         if (s_empty(&ps->p_stack)) {
 285                                 D(printf(" Error: bottom of stack.\n"));
 286                                 return E_SYNTAX;
 287                         }
 288                         continue;
 289                 }
 290
 291                 /* Stuck, report syntax error */
 292                 D(printf(" Error.\n"));
 293                 if (expected_ret) {
 294                         if (s->s_lower == s->s_upper - 1) {
 295                                 /* Only one possible expected token */
 296                                 *expected_ret = ps->p_grammar->
 297                                     g_ll.ll_label[s->s_lower].lb_type;
 298                         }
 299                         else
 300                                 *expected_ret = -1;
 301                 }
 302                 return E_SYNTAX;
 303         }
 304 }
 305
 306
 307 #ifdef Py_DEBUG
 308
 309 /* DEBUG OUTPUT */
 310
 311 void
 312 dumptree(grammar *g, node *n)
 313 {
 314         int i;
 315
 316         if (n == NULL)
 317                 printf("NIL");
 318         else {
 319                 label l;
 320                 l.lb_type = TYPE(n);
 321                 l.lb_str = STR(n);
 322                 printf("%s", PyGrammar_LabelRepr(&l));
 323                 if (ISNONTERMINAL(TYPE(n))) {
 324                         printf("(");
 325                         for (i = 0; i < NCH(n); i++) {
 326                                 if (i > 0)
 327                                         printf(",");
 328                                 dumptree(g, CHILD(n, i));
 329                         }
 330                         printf(")");
 331                 }
 332         }
 333 }
 334
 335 void
 336 showtree(grammar *g, node *n)
 337 {
 338         int i;
 339
 340         if (n == NULL)
 341                 return;
 342         if (ISNONTERMINAL(TYPE(n))) {
 343                 for (i = 0; i < NCH(n); i++)
 344                         showtree(g, CHILD(n, i));
 345         }
 346         else if (ISTERMINAL(TYPE(n))) {
 347                 printf("%s", _PyParser_TokenNames[TYPE(n)]);
 348                 if (TYPE(n) == NUMBER || TYPE(n) == NAME)
 349                         printf("(%s)", STR(n));
 350                 printf(" ");
 351         }
 352         else
 353                 printf("? ");
 354 }
 355
 356 void
 357 printtree(parser_state *ps)
 358 {
 359         if (Py_DebugFlag) {
 360                 printf("Parse tree:\n");
 361                 dumptree(ps->p_grammar, ps->p_tree);
 362                 printf("\n");
 363                 printf("Tokens:\n");
 364                 showtree(ps->p_grammar, ps->p_tree);
 365                 printf("\n");
 366         }
 367         printf("Listing:\n");
 368         PyNode_ListTree(ps->p_tree);
 369         printf("\n");
 370 }
 371
 372 #endif /* Py_DEBUG */
 373
 374 /*
 375
 376 Description
 377 -----------
 378
The parser's interface is different from the usual one: the function
PyParser_AddToken() must be called for each token in the input.  This
makes it possible to turn it into an incremental parsing system later.
The parsing system constructs a parse tree as it goes.
 383
 384 A parsing rule is represented as a Deterministic Finite-state Automaton
 385 (DFA).  A node in a DFA represents a state of the parser; an arc represents
 386 a transition.  Transitions are either labeled with terminal symbols or
with non-terminals.  When the parser decides to follow an arc labeled
with a non-terminal, it is invoked recursively with the DFA representing
the parsing rule for that non-terminal, starting in that DFA's initial
state; when that DFA accepts, the parser that invoked it continues.  The
parse tree constructed by the recursively called parser is inserted as a
child in the current parse tree.
 392
 393 The DFA's can be constructed automatically from a more conventional
 394 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
 395 Certain restrictions make the parser's life easier: rules that can produce
 396 the empty string should be outlawed (there are other ways to put loops
 397 or optional parts in the language).  To avoid the need to construct
 398 FIRST sets, we can require that all but the last alternative of a rule
 399 (really: arc going out of a DFA's state) must begin with a terminal
 400 symbol.
 401
 402 As an example, consider this grammar:
 403
 404 expr:   term (OP term)*
 405 term:   CONSTANT | '(' expr ')'
 406
 407 The DFA corresponding to the rule for expr is:
 408
 409 ------->.---term-->.------->
 410         ^          |
 411         |          |
 412         \----OP----/
 413
 414 The parse tree generated for the input a+b is:
 415
 416 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
 417
 418 */