3 /* For a description, see the comments at end of this file */
6 #include "pgenheaders.h"
10 #include "metagrammar.h"
13 extern int Py_DebugFlag
;
14 extern int Py_IgnoreEnvironmentFlag
; /* needed by Py_GETENV */
17 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
19 typedef struct _nfaarc
{
24 typedef struct _nfastate
{
34 int nf_start
, nf_finish
;
38 static void compile_rhs(labellist
*ll
,
39 nfa
*nf
, node
*n
, int *pa
, int *pb
);
40 static void compile_alt(labellist
*ll
,
41 nfa
*nf
, node
*n
, int *pa
, int *pb
);
42 static void compile_item(labellist
*ll
,
43 nfa
*nf
, node
*n
, int *pa
, int *pb
);
44 static void compile_atom(labellist
*ll
,
45 nfa
*nf
, node
*n
, int *pa
, int *pb
);
52 nf
->nf_state
= (nfastate
*)PyObject_REALLOC(nf
->nf_state
,
53 sizeof(nfastate
) * (nf
->nf_nstates
+ 1));
54 if (nf
->nf_state
== NULL
)
55 Py_FatalError("out of mem");
56 st
= &nf
->nf_state
[nf
->nf_nstates
++];
59 return st
- nf
->nf_state
;
63 addnfaarc(nfa
*nf
, int from
, int to
, int lbl
)
68 st
= &nf
->nf_state
[from
];
69 st
->st_arc
= (nfaarc
*)PyObject_REALLOC(st
->st_arc
,
70 sizeof(nfaarc
) * (st
->st_narcs
+ 1));
71 if (st
->st_arc
== NULL
)
72 Py_FatalError("out of mem");
73 ar
= &st
->st_arc
[st
->st_narcs
++];
82 static int type
= NT_OFFSET
; /* All types will be disjunct */
84 nf
= (nfa
*)PyObject_MALLOC(sizeof(nfa
));
86 Py_FatalError("no mem for new nfa");
88 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
91 nf
->nf_start
= nf
->nf_finish
= -1;
95 typedef struct _nfagrammar
{
102 static void compile_rule(nfagrammar
*gr
, node
*n
);
109 gr
= (nfagrammar
*)PyObject_MALLOC(sizeof(nfagrammar
));
111 Py_FatalError("no mem for new nfa grammar");
114 gr
->gr_ll
.ll_nlabels
= 0;
115 gr
->gr_ll
.ll_label
= NULL
;
116 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
121 addnfa(nfagrammar
*gr
, char *name
)
126 gr
->gr_nfa
= (nfa
**)PyObject_REALLOC(gr
->gr_nfa
,
127 sizeof(nfa
*) * (gr
->gr_nnfas
+ 1));
128 if (gr
->gr_nfa
== NULL
)
129 Py_FatalError("out of mem");
130 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
131 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
137 static char REQNFMT
[] = "metacompile: less than %d children\n";
139 #define REQN(i, count) \
141 fprintf(stderr, REQNFMT, count); \
142 Py_FatalError("REQN"); \
146 #define REQN(i, count) /* empty */
156 printf("Compiling (meta-) parse tree into NFA grammar\n");
157 gr
= newnfagrammar();
159 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
161 for (; --i
>= 0; n
++) {
162 if (n
->n_type
!= NEWLINE
)
169 compile_rule(nfagrammar
*gr
, node
*n
)
174 REQN(n
->n_nchildren
, 4);
177 nf
= addnfa(gr
, n
->n_str
);
182 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
188 compile_rhs(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
198 compile_alt(ll
, nf
, n
, pa
, pb
);
204 *pa
= addnfastate(nf
);
205 *pb
= addnfastate(nf
);
206 addnfaarc(nf
, *pa
, a
, EMPTY
);
207 addnfaarc(nf
, b
, *pb
, EMPTY
);
208 for (; --i
>= 0; n
++) {
214 compile_alt(ll
, nf
, n
, &a
, &b
);
215 addnfaarc(nf
, *pa
, a
, EMPTY
);
216 addnfaarc(nf
, b
, *pb
, EMPTY
);
221 compile_alt(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
231 compile_item(ll
, nf
, n
, pa
, pb
);
234 for (; --i
>= 0; n
++) {
236 compile_item(ll
, nf
, n
, &a
, &b
);
237 addnfaarc(nf
, *pb
, a
, EMPTY
);
243 compile_item(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
252 if (n
->n_type
== LSQB
) {
256 *pa
= addnfastate(nf
);
257 *pb
= addnfastate(nf
);
258 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
259 compile_rhs(ll
, nf
, n
, &a
, &b
);
260 addnfaarc(nf
, *pa
, a
, EMPTY
);
261 addnfaarc(nf
, b
, *pb
, EMPTY
);
267 compile_atom(ll
, nf
, n
, pa
, pb
);
271 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
272 if (n
->n_type
== STAR
)
280 compile_atom(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
288 if (n
->n_type
== LPAR
) {
292 compile_rhs(ll
, nf
, n
, pa
, pb
);
296 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
297 *pa
= addnfastate(nf
);
298 *pb
= addnfastate(nf
);
299 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
306 dumpstate(labellist
*ll
, nfa
*nf
, int istate
)
313 istate
== nf
->nf_start
? '*' : ' ',
315 istate
== nf
->nf_finish
? '.' : ' ');
316 st
= &nf
->nf_state
[istate
];
318 for (i
= 0; i
< st
->st_narcs
; i
++) {
321 printf("-> %2d %s", ar
->ar_arrow
,
322 PyGrammar_LabelRepr(&ll
->ll_label
[ar
->ar_label
]));
329 dumpnfa(labellist
*ll
, nfa
*nf
)
333 printf("NFA '%s' has %d states; start %d, finish %d\n",
334 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
335 for (i
= 0; i
< nf
->nf_nstates
; i
++)
336 dumpstate(ll
, nf
, i
);
340 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
343 addclosure(bitset ss
, nfa
*nf
, int istate
)
345 if (addbit(ss
, istate
)) {
346 nfastate
*st
= &nf
->nf_state
[istate
];
347 nfaarc
*ar
= st
->st_arc
;
350 for (i
= st
->st_narcs
; --i
>= 0; ) {
351 if (ar
->ar_label
== EMPTY
)
352 addclosure(ss
, nf
, ar
->ar_arrow
);
358 typedef struct _ss_arc
{
364 typedef struct _ss_state
{
367 struct _ss_arc
*ss_arc
;
373 typedef struct _ss_dfa
{
379 static void printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
380 labellist
*ll
, char *msg
);
381 static void simplify(int xx_nstates
, ss_state
*xx_state
);
382 static void convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
);
385 makedfa(nfagrammar
*gr
, nfa
*nf
, dfa
*d
)
387 int nbits
= nf
->nf_nstates
;
390 ss_state
*xx_state
, *yy
;
392 int istate
, jstate
, iarc
, jarc
, ibit
;
396 ss
= newbitset(nbits
);
397 addclosure(ss
, nf
, nf
->nf_start
);
398 xx_state
= (ss_state
*)PyObject_MALLOC(sizeof(ss_state
));
399 if (xx_state
== NULL
)
400 Py_FatalError("no mem for xx_state in makedfa");
407 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
409 printf("Error: nonterminal '%s' may produce empty.\n",
412 /* This algorithm is from a book written before
413 the invention of structured programming... */
415 /* For each unmarked state... */
416 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
418 yy
= &xx_state
[istate
];
420 /* For all its states... */
421 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
422 if (!testbit(ss
, ibit
))
424 st
= &nf
->nf_state
[ibit
];
425 /* For all non-empty arcs from this state... */
426 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
427 ar
= &st
->st_arc
[iarc
];
428 if (ar
->ar_label
== EMPTY
)
430 /* Look up in list of arcs from this state */
431 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
432 zz
= &yy
->ss_arc
[jarc
];
433 if (ar
->ar_label
== zz
->sa_label
)
436 /* Add new arc for this state */
437 size
= sizeof(ss_arc
) * (yy
->ss_narcs
+ 1);
438 yy
->ss_arc
= (ss_arc
*)PyObject_REALLOC(
440 if (yy
->ss_arc
== NULL
)
441 Py_FatalError("out of mem");
442 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
443 zz
->sa_label
= ar
->ar_label
;
444 zz
->sa_bitset
= newbitset(nbits
);
447 /* Add destination */
448 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
451 /* Now look up all the arrow states */
452 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
453 zz
= &xx_state
[istate
].ss_arc
[jarc
];
454 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
455 if (samebitset(zz
->sa_bitset
,
456 xx_state
[jstate
].ss_ss
, nbits
)) {
457 zz
->sa_arrow
= jstate
;
461 size
= sizeof(ss_state
) * (xx_nstates
+ 1);
462 xx_state
= (ss_state
*)PyObject_REALLOC(xx_state
,
464 if (xx_state
== NULL
)
465 Py_FatalError("out of mem");
466 zz
->sa_arrow
= xx_nstates
;
467 yy
= &xx_state
[xx_nstates
++];
468 yy
->ss_ss
= zz
->sa_bitset
;
472 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
478 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
479 "before minimizing");
481 simplify(xx_nstates
, xx_state
);
484 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
487 convert(d
, xx_nstates
, xx_state
);
490 PyObject_FREE(xx_state
);
494 printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
495 labellist
*ll
, char *msg
)
501 printf("Subset DFA %s\n", msg
);
502 for (i
= 0; i
< xx_nstates
; i
++) {
506 printf(" Subset %d", i
);
510 for (ibit
= 0; ibit
< nbits
; ibit
++) {
511 if (testbit(yy
->ss_ss
, ibit
))
515 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
516 zz
= &yy
->ss_arc
[iarc
];
517 printf(" Arc to state %d, label %s\n",
520 &ll
->ll_label
[zz
->sa_label
]));
526 /* PART THREE -- SIMPLIFY DFA */
528 /* Simplify the DFA by repeatedly eliminating states that are
529 equivalent to another oner. This is NOT Algorithm 3.3 from
530 [Aho&Ullman 77]. It does not always finds the minimal DFA,
531 but it does usually make a much smaller one... (For an example
532 of sub-optimal behavior, try S: x a b+ | y a b+.)
536 samestate(ss_state
*s1
, ss_state
*s2
)
540 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
542 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
543 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
544 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
551 renamestates(int xx_nstates
, ss_state
*xx_state
, int from
, int to
)
556 printf("Rename state %d to %d.\n", from
, to
);
557 for (i
= 0; i
< xx_nstates
; i
++) {
558 if (xx_state
[i
].ss_deleted
)
560 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
561 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
562 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
568 simplify(int xx_nstates
, ss_state
*xx_state
)
575 for (i
= 1; i
< xx_nstates
; i
++) {
576 if (xx_state
[i
].ss_deleted
)
578 for (j
= 0; j
< i
; j
++) {
579 if (xx_state
[j
].ss_deleted
)
581 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
582 xx_state
[i
].ss_deleted
++;
583 renamestates(xx_nstates
, xx_state
,
594 /* PART FOUR -- GENERATE PARSING TABLES */
596 /* Convert the DFA into a grammar that can be used by our parser */
599 convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
)
605 for (i
= 0; i
< xx_nstates
; i
++) {
609 yy
->ss_rename
= addstate(d
);
612 for (i
= 0; i
< xx_nstates
; i
++) {
616 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
618 addarc(d
, yy
->ss_rename
,
619 xx_state
[zz
->sa_arrow
].ss_rename
,
623 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
630 /* PART FIVE -- GLUE IT ALL TOGETHER */
633 maketables(nfagrammar
*gr
)
640 if (gr
->gr_nnfas
== 0)
642 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
643 /* XXX first rule must be start rule */
646 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
649 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
650 dumpnfa(&gr
->gr_ll
, nf
);
651 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
653 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
654 makedfa(gr
, gr
->gr_nfa
[i
], d
);
685 Input is a grammar in extended BNF (using * for repetition, + for
686 at-least-once repetition, [] for optional parts, | for alternatives and
687 () for grouping). This has already been parsed and turned into a parse
690 Each rule is considered as a regular expression in its own right.
691 It is turned into a Non-deterministic Finite Automaton (NFA), which
692 is then turned into a Deterministic Finite Automaton (DFA), which is then
693 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
694 or similar compiler books (this technique is more often used for lexical
697 The DFA's are used by the parser as parsing tables in a special way
698 that's probably unique. Before they are usable, the FIRST sets of all
699 non-terminals are computed.
705 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977