3 /* For a description, see the comments at end of this file */
6 #include "pgenheaders.h"
10 #include "metagrammar.h"
13 extern int Py_DebugFlag
;
14 extern int Py_IgnoreEnvironmentFlag
; /* needed by Py_GETENV */
17 /* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
19 typedef struct _nfaarc
{
24 typedef struct _nfastate
{
34 int nf_start
, nf_finish
;
38 static void compile_rhs(labellist
*ll
,
39 nfa
*nf
, node
*n
, int *pa
, int *pb
);
40 static void compile_alt(labellist
*ll
,
41 nfa
*nf
, node
*n
, int *pa
, int *pb
);
42 static void compile_item(labellist
*ll
,
43 nfa
*nf
, node
*n
, int *pa
, int *pb
);
44 static void compile_atom(labellist
*ll
,
45 nfa
*nf
, node
*n
, int *pa
, int *pb
);
52 PyMem_RESIZE(nf
->nf_state
, nfastate
, nf
->nf_nstates
+ 1);
53 if (nf
->nf_state
== NULL
)
54 Py_FatalError("out of mem");
55 st
= &nf
->nf_state
[nf
->nf_nstates
++];
58 return st
- nf
->nf_state
;
62 addnfaarc(nfa
*nf
, int from
, int to
, int lbl
)
67 st
= &nf
->nf_state
[from
];
68 PyMem_RESIZE(st
->st_arc
, nfaarc
, st
->st_narcs
+ 1);
69 if (st
->st_arc
== NULL
)
70 Py_FatalError("out of mem");
71 ar
= &st
->st_arc
[st
->st_narcs
++];
80 static int type
= NT_OFFSET
; /* All types will be disjunct */
82 nf
= PyMem_NEW(nfa
, 1);
84 Py_FatalError("no mem for new nfa");
86 nf
->nf_name
= name
; /* XXX strdup(name) ??? */
89 nf
->nf_start
= nf
->nf_finish
= -1;
93 typedef struct _nfagrammar
{
100 static void compile_rule(nfagrammar
*gr
, node
*n
);
107 gr
= PyMem_NEW(nfagrammar
, 1);
109 Py_FatalError("no mem for new nfa grammar");
112 gr
->gr_ll
.ll_nlabels
= 0;
113 gr
->gr_ll
.ll_label
= NULL
;
114 addlabel(&gr
->gr_ll
, ENDMARKER
, "EMPTY");
119 addnfa(nfagrammar
*gr
, char *name
)
124 PyMem_RESIZE(gr
->gr_nfa
, nfa
*, gr
->gr_nnfas
+ 1);
125 if (gr
->gr_nfa
== NULL
)
126 Py_FatalError("out of mem");
127 gr
->gr_nfa
[gr
->gr_nnfas
++] = nf
;
128 addlabel(&gr
->gr_ll
, NAME
, nf
->nf_name
);
134 static char REQNFMT
[] = "metacompile: less than %d children\n";
136 #define REQN(i, count) \
138 fprintf(stderr, REQNFMT, count); \
139 Py_FatalError("REQN"); \
143 #define REQN(i, count) /* empty */
153 printf("Compiling (meta-) parse tree into NFA grammar\n");
154 gr
= newnfagrammar();
156 i
= n
->n_nchildren
- 1; /* Last child is ENDMARKER */
158 for (; --i
>= 0; n
++) {
159 if (n
->n_type
!= NEWLINE
)
166 compile_rule(nfagrammar
*gr
, node
*n
)
171 REQN(n
->n_nchildren
, 4);
174 nf
= addnfa(gr
, n
->n_str
);
179 compile_rhs(&gr
->gr_ll
, nf
, n
, &nf
->nf_start
, &nf
->nf_finish
);
185 compile_rhs(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
195 compile_alt(ll
, nf
, n
, pa
, pb
);
201 *pa
= addnfastate(nf
);
202 *pb
= addnfastate(nf
);
203 addnfaarc(nf
, *pa
, a
, EMPTY
);
204 addnfaarc(nf
, b
, *pb
, EMPTY
);
205 for (; --i
>= 0; n
++) {
211 compile_alt(ll
, nf
, n
, &a
, &b
);
212 addnfaarc(nf
, *pa
, a
, EMPTY
);
213 addnfaarc(nf
, b
, *pb
, EMPTY
);
218 compile_alt(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
228 compile_item(ll
, nf
, n
, pa
, pb
);
231 for (; --i
>= 0; n
++) {
233 compile_item(ll
, nf
, n
, &a
, &b
);
234 addnfaarc(nf
, *pb
, a
, EMPTY
);
240 compile_item(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
249 if (n
->n_type
== LSQB
) {
253 *pa
= addnfastate(nf
);
254 *pb
= addnfastate(nf
);
255 addnfaarc(nf
, *pa
, *pb
, EMPTY
);
256 compile_rhs(ll
, nf
, n
, &a
, &b
);
257 addnfaarc(nf
, *pa
, a
, EMPTY
);
258 addnfaarc(nf
, b
, *pb
, EMPTY
);
264 compile_atom(ll
, nf
, n
, pa
, pb
);
268 addnfaarc(nf
, *pb
, *pa
, EMPTY
);
269 if (n
->n_type
== STAR
)
277 compile_atom(labellist
*ll
, nfa
*nf
, node
*n
, int *pa
, int *pb
)
285 if (n
->n_type
== LPAR
) {
289 compile_rhs(ll
, nf
, n
, pa
, pb
);
293 else if (n
->n_type
== NAME
|| n
->n_type
== STRING
) {
294 *pa
= addnfastate(nf
);
295 *pb
= addnfastate(nf
);
296 addnfaarc(nf
, *pa
, *pb
, addlabel(ll
, n
->n_type
, n
->n_str
));
303 dumpstate(labellist
*ll
, nfa
*nf
, int istate
)
310 istate
== nf
->nf_start
? '*' : ' ',
312 istate
== nf
->nf_finish
? '.' : ' ');
313 st
= &nf
->nf_state
[istate
];
315 for (i
= 0; i
< st
->st_narcs
; i
++) {
318 printf("-> %2d %s", ar
->ar_arrow
,
319 PyGrammar_LabelRepr(&ll
->ll_label
[ar
->ar_label
]));
326 dumpnfa(labellist
*ll
, nfa
*nf
)
330 printf("NFA '%s' has %d states; start %d, finish %d\n",
331 nf
->nf_name
, nf
->nf_nstates
, nf
->nf_start
, nf
->nf_finish
);
332 for (i
= 0; i
< nf
->nf_nstates
; i
++)
333 dumpstate(ll
, nf
, i
);
337 /* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
340 addclosure(bitset ss
, nfa
*nf
, int istate
)
342 if (addbit(ss
, istate
)) {
343 nfastate
*st
= &nf
->nf_state
[istate
];
344 nfaarc
*ar
= st
->st_arc
;
347 for (i
= st
->st_narcs
; --i
>= 0; ) {
348 if (ar
->ar_label
== EMPTY
)
349 addclosure(ss
, nf
, ar
->ar_arrow
);
355 typedef struct _ss_arc
{
361 typedef struct _ss_state
{
370 typedef struct _ss_dfa
{
376 static void printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
377 labellist
*ll
, char *msg
);
378 static void simplify(int xx_nstates
, ss_state
*xx_state
);
379 static void convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
);
382 makedfa(nfagrammar
*gr
, nfa
*nf
, dfa
*d
)
384 int nbits
= nf
->nf_nstates
;
387 ss_state
*xx_state
, *yy
;
389 int istate
, jstate
, iarc
, jarc
, ibit
;
393 ss
= newbitset(nbits
);
394 addclosure(ss
, nf
, nf
->nf_start
);
395 xx_state
= PyMem_NEW(ss_state
, 1);
396 if (xx_state
== NULL
)
397 Py_FatalError("no mem for xx_state in makedfa");
404 yy
->ss_finish
= testbit(ss
, nf
->nf_finish
);
406 printf("Error: nonterminal '%s' may produce empty.\n",
409 /* This algorithm is from a book written before
410 the invention of structured programming... */
412 /* For each unmarked state... */
413 for (istate
= 0; istate
< xx_nstates
; ++istate
) {
414 yy
= &xx_state
[istate
];
416 /* For all its states... */
417 for (ibit
= 0; ibit
< nf
->nf_nstates
; ++ibit
) {
418 if (!testbit(ss
, ibit
))
420 st
= &nf
->nf_state
[ibit
];
421 /* For all non-empty arcs from this state... */
422 for (iarc
= 0; iarc
< st
->st_narcs
; iarc
++) {
423 ar
= &st
->st_arc
[iarc
];
424 if (ar
->ar_label
== EMPTY
)
426 /* Look up in list of arcs from this state */
427 for (jarc
= 0; jarc
< yy
->ss_narcs
; ++jarc
) {
428 zz
= &yy
->ss_arc
[jarc
];
429 if (ar
->ar_label
== zz
->sa_label
)
432 /* Add new arc for this state */
433 PyMem_RESIZE(yy
->ss_arc
, ss_arc
,
435 if (yy
->ss_arc
== NULL
)
436 Py_FatalError("out of mem");
437 zz
= &yy
->ss_arc
[yy
->ss_narcs
++];
438 zz
->sa_label
= ar
->ar_label
;
439 zz
->sa_bitset
= newbitset(nbits
);
442 /* Add destination */
443 addclosure(zz
->sa_bitset
, nf
, ar
->ar_arrow
);
446 /* Now look up all the arrow states */
447 for (jarc
= 0; jarc
< xx_state
[istate
].ss_narcs
; jarc
++) {
448 zz
= &xx_state
[istate
].ss_arc
[jarc
];
449 for (jstate
= 0; jstate
< xx_nstates
; jstate
++) {
450 if (samebitset(zz
->sa_bitset
,
451 xx_state
[jstate
].ss_ss
, nbits
)) {
452 zz
->sa_arrow
= jstate
;
456 PyMem_RESIZE(xx_state
, ss_state
, xx_nstates
+ 1);
457 if (xx_state
== NULL
)
458 Py_FatalError("out of mem");
459 zz
->sa_arrow
= xx_nstates
;
460 yy
= &xx_state
[xx_nstates
++];
461 yy
->ss_ss
= zz
->sa_bitset
;
465 yy
->ss_finish
= testbit(yy
->ss_ss
, nf
->nf_finish
);
471 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
472 "before minimizing");
474 simplify(xx_nstates
, xx_state
);
477 printssdfa(xx_nstates
, xx_state
, nbits
, &gr
->gr_ll
,
480 convert(d
, xx_nstates
, xx_state
);
486 printssdfa(int xx_nstates
, ss_state
*xx_state
, int nbits
,
487 labellist
*ll
, char *msg
)
493 printf("Subset DFA %s\n", msg
);
494 for (i
= 0; i
< xx_nstates
; i
++) {
498 printf(" Subset %d", i
);
502 for (ibit
= 0; ibit
< nbits
; ibit
++) {
503 if (testbit(yy
->ss_ss
, ibit
))
507 for (iarc
= 0; iarc
< yy
->ss_narcs
; iarc
++) {
508 zz
= &yy
->ss_arc
[iarc
];
509 printf(" Arc to state %d, label %s\n",
512 &ll
->ll_label
[zz
->sa_label
]));
518 /* PART THREE -- SIMPLIFY DFA */
520 /* Simplify the DFA by repeatedly eliminating states that are
521 equivalent to another oner. This is NOT Algorithm 3.3 from
522 [Aho&Ullman 77]. It does not always finds the minimal DFA,
523 but it does usually make a much smaller one... (For an example
524 of sub-optimal behavior, try S: x a b+ | y a b+.)
528 samestate(ss_state
*s1
, ss_state
*s2
)
532 if (s1
->ss_narcs
!= s2
->ss_narcs
|| s1
->ss_finish
!= s2
->ss_finish
)
534 for (i
= 0; i
< s1
->ss_narcs
; i
++) {
535 if (s1
->ss_arc
[i
].sa_arrow
!= s2
->ss_arc
[i
].sa_arrow
||
536 s1
->ss_arc
[i
].sa_label
!= s2
->ss_arc
[i
].sa_label
)
543 renamestates(int xx_nstates
, ss_state
*xx_state
, int from
, int to
)
548 printf("Rename state %d to %d.\n", from
, to
);
549 for (i
= 0; i
< xx_nstates
; i
++) {
550 if (xx_state
[i
].ss_deleted
)
552 for (j
= 0; j
< xx_state
[i
].ss_narcs
; j
++) {
553 if (xx_state
[i
].ss_arc
[j
].sa_arrow
== from
)
554 xx_state
[i
].ss_arc
[j
].sa_arrow
= to
;
560 simplify(int xx_nstates
, ss_state
*xx_state
)
567 for (i
= 1; i
< xx_nstates
; i
++) {
568 if (xx_state
[i
].ss_deleted
)
570 for (j
= 0; j
< i
; j
++) {
571 if (xx_state
[j
].ss_deleted
)
573 if (samestate(&xx_state
[i
], &xx_state
[j
])) {
574 xx_state
[i
].ss_deleted
++;
575 renamestates(xx_nstates
, xx_state
,
586 /* PART FOUR -- GENERATE PARSING TABLES */
588 /* Convert the DFA into a grammar that can be used by our parser */
591 convert(dfa
*d
, int xx_nstates
, ss_state
*xx_state
)
597 for (i
= 0; i
< xx_nstates
; i
++) {
601 yy
->ss_rename
= addstate(d
);
604 for (i
= 0; i
< xx_nstates
; i
++) {
608 for (j
= 0; j
< yy
->ss_narcs
; j
++) {
610 addarc(d
, yy
->ss_rename
,
611 xx_state
[zz
->sa_arrow
].ss_rename
,
615 addarc(d
, yy
->ss_rename
, yy
->ss_rename
, 0);
622 /* PART FIVE -- GLUE IT ALL TOGETHER */
625 maketables(nfagrammar
*gr
)
632 if (gr
->gr_nnfas
== 0)
634 g
= newgrammar(gr
->gr_nfa
[0]->nf_type
);
635 /* XXX first rule must be start rule */
638 for (i
= 0; i
< gr
->gr_nnfas
; i
++) {
641 printf("Dump of NFA for '%s' ...\n", nf
->nf_name
);
642 dumpnfa(&gr
->gr_ll
, nf
);
643 printf("Making DFA for '%s' ...\n", nf
->nf_name
);
645 d
= adddfa(g
, nf
->nf_type
, nf
->nf_name
);
646 makedfa(gr
, gr
->gr_nfa
[i
], d
);
676 Input is a grammar in extended BNF (using * for repetition, + for
677 at-least-once repetition, [] for optional parts, | for alternatives and
678 () for grouping). This has already been parsed and turned into a parse
681 Each rule is considered as a regular expression in its own right.
682 It is turned into a Non-deterministic Finite Automaton (NFA), which
683 is then turned into a Deterministic Finite Automaton (DFA), which is then
684 optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
685 or similar compiler books (this technique is more often used for lexical
688 The DFA's are used by the parser as parsing tables in a special way
689 that's probably unique. Before they are usable, the FIRST sets of all
690 non-terminals are computed.
696 Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977