1 # Copyright (c) 1998-2002 John Aycock
3 # Permission is hereby granted, free of charge, to any person obtaining
4 # a copy of this software and associated documentation files (the
5 # "Software"), to deal in the Software without restriction, including
6 # without limitation the rights to use, copy, modify, merge, publish,
7 # distribute, sublicense, and/or sell copies of the Software, and to
8 # permit persons to whom the Software is furnished to do so, subject to
9 # the following conditions:
11 # The above copyright notice and this permission notice shall be
12 # included in all copies or substantial portions of the Software.
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 __version__
= 'SPARK-0.7 (pre-alpha-5)'
26 # Compatability with older pythons.
27 def output(string
='', end
='\n'):
28 sys
.stdout
.write(string
+ end
)
38 def _namelist(instance
):
39 namelist
, namedict
, classlist
= [], {}, [instance
.__class
__]
43 for name
in c
.__dict
__.keys():
44 if name
not in namedict
:
50 def __init__(self
, flags
=0):
51 pattern
= self
.reflect()
52 self
.re
= re
.compile(pattern
, re
.VERBOSE|flags
)
55 for name
, number
in self
.re
.groupindex
.items():
56 self
.index2func
[number
-1] = getattr(self
, 't_' + name
)
58 def makeRE(self
, name
):
59 doc
= getattr(self
, name
).__doc
__
60 rv
= '(?P<%s>%s)' % (name
[2:], doc
)
65 for name
in _namelist(self
):
66 if name
[:2] == 't_' and name
!= 't_default':
67 rv
.append(self
.makeRE(name
))
69 rv
.append(self
.makeRE('t_default'))
72 def error(self
, s
, pos
):
73 output("Lexical error at position %s" % pos
)
76 def tokenize(self
, s
):
80 m
= self
.re
.match(s
, pos
)
85 for i
in range(len(groups
)):
86 if groups
[i
] and i
in self
.index2func
:
87 self
.index2func
[i
](groups
[i
])
90 def t_default(self
, s
):
92 output("Specification error: unmatched input")
96 # Extracted from GenericParser and made global so that [un]picking works.
99 def __init__(self
, stateno
, items
):
100 self
.T
, self
.complete
, self
.items
= [], [], items
101 self
.stateno
= stateno
105 # An Earley parser, as per J. Earley, "An Efficient Context-Free
106 # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
107 # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
108 # Carnegie-Mellon University, August 1968. New formulation of
109 # the parser according to J. Aycock, "Practical Earley Parsing
110 # and the SPARK Toolkit", Ph.D. thesis, University of Victoria,
111 # 2001, and J. Aycock and R. N. Horspool, "Practical Earley
112 # Parsing", unpublished paper, 2001.
115 def __init__(self
, start
):
121 self
.ruleschanged
= 1
128 # When pickling, take the time to generate the full state machine;
129 # some information is then extraneous, too. Unfortunately we
130 # can't save the rule2func map.
132 def __getstate__(self
):
133 if self
.ruleschanged
:
135 # XXX - duplicated from parse()
141 self
.ruleschanged
= 0
142 self
.edges
, self
.cores
= {}, {}
143 self
.states
= { 0: self
.makeState0() }
144 self
.makeState(0, self
._BOF
)
146 # XXX - should find a better way to do this..
151 for k
, v
in self
.edges
.items():
154 if state
in self
.states
:
155 self
.goto(state
, sym
)
157 rv
= self
.__dict
__.copy()
158 for s
in self
.states
.values():
165 def __setstate__(self
, D
):
170 start
= D
['rules'][self
._START
][0][1][1] # Blech.
172 D
['rule2func'] = self
.rule2func
173 D
['makeSet'] = self
.makeSet_fast
177 # A hook for GenericASTBuilder and GenericASTMatcher. Mess
178 # thee not with this; nor shall thee toucheth the _preprocess
179 # argument to addRule.
181 def preprocess(self
, rule
, func
): return rule
, func
183 def addRule(self
, doc
, func
, _preprocess
=1):
188 for i
in range(len(rules
)):
189 if rules
[i
] == '::=':
191 index
.append(len(rules
))
193 for i
in range(len(index
)-1):
194 lhs
= rules
[index
[i
]]
195 rhs
= rules
[index
[i
]+2:index
[i
+1]]
196 rule
= (lhs
, tuple(rhs
))
199 rule
, fn
= self
.preprocess(rule
, func
)
201 if lhs
in self
.rules
:
202 self
.rules
[lhs
].append(rule
)
204 self
.rules
[lhs
] = [ rule
]
205 self
.rule2func
[rule
] = fn
206 self
.rule2name
[rule
] = func
.__name
__[2:]
207 self
.ruleschanged
= 1
209 def collectRules(self
):
210 for name
in _namelist(self
):
212 func
= getattr(self
, name
)
214 self
.addRule(doc
, func
)
216 def augment(self
, start
):
217 rule
= '%s ::= %s %s' % (self
._START
, self
._BOF
, start
)
218 self
.addRule(rule
, lambda args
: args
[1], 0)
220 def computeNull(self
):
224 for rulelist
in self
.rules
.values():
226 self
.nullable
[lhs
] = 0
227 for rule
in rulelist
:
230 self
.nullable
[lhs
] = 1
233 # We only need to consider rules which
234 # consist entirely of nonterminal symbols.
235 # This should be a savings on typical
239 if sym
not in self
.rules
:
247 if self
.nullable
[lhs
]:
250 if not self
.nullable
[sym
]:
253 self
.nullable
[lhs
] = 1
256 def makeState0(self
):
258 for rule
in self
.newrules
[self
._START
]:
259 s0
.items
.append((rule
, 0))
262 def finalState(self
, tokens
):
266 if len(self
.newrules
[self
._START
]) == 2 and len(tokens
) == 0:
268 start
= self
.rules
[self
._START
][0][1][1]
269 return self
.goto(1, start
)
271 def makeNewRules(self
):
273 for rulelist
in self
.rules
.values():
274 for rule
in rulelist
:
275 worklist
.append((rule
, 0, 1, rule
))
277 for rule
, i
, candidate
, oldrule
in worklist
:
282 if sym
not in self
.rules
or \
283 not self
.nullable
[sym
]:
289 newrhs
[i
] = self
._NULLABLE
+sym
290 newrule
= (lhs
, tuple(newrhs
))
291 worklist
.append((newrule
, i
+1,
297 lhs
= self
._NULLABLE
+lhs
299 if lhs
in self
.newrules
:
300 self
.newrules
[lhs
].append(rule
)
302 self
.newrules
[lhs
] = [ rule
]
303 self
.new2old
[rule
] = oldrule
305 def typestring(self
, token
):
308 def error(self
, token
):
309 output("Syntax error at or near `%s' token" % token
)
312 def parse(self
, tokens
):
313 sets
= [ [(1,0), (2,0)] ]
316 if self
.ruleschanged
:
321 self
.ruleschanged
= 0
322 self
.edges
, self
.cores
= {}, {}
323 self
.states
= { 0: self
.makeState0() }
324 self
.makeState(0, self
._BOF
)
326 for i
in range(len(tokens
)):
331 self
.makeSet(tokens
[i
], sets
, i
)
334 self
.makeSet(None, sets
, len(tokens
))
336 #_dump(tokens, sets, self.states)
338 finalitem
= (self
.finalState(tokens
), 0)
339 if finalitem
not in sets
[-2]:
341 self
.error(tokens
[i
-1])
345 return self
.buildTree(self
._START
, finalitem
,
348 def isnullable(self
, sym
):
350 # For symbols in G_e only. If we weren't supporting 1.5,
351 # could just use sym.startswith().
353 return self
._NULLABLE
== sym
[0:len(self
._NULLABLE
)]
355 def skip(self
, hs
, pos
=0):
358 if not self
.isnullable(hs
[1][pos
]):
363 def makeState(self
, state
, sym
):
364 assert sym
is not None
366 # Compute \epsilon-kernel state's core and see if
370 for rule
, pos
in self
.states
[state
].items
:
372 if rhs
[pos
:pos
+1] == (sym
,):
373 kitems
.append((rule
, self
.skip(rule
, pos
+1)))
378 if tcore
in self
.cores
:
379 return self
.cores
[tcore
]
381 # Nope, doesn't exist. Compute it and the associated
382 # \epsilon-nonkernel state together; we'll need it right away.
384 k
= self
.cores
[tcore
] = len(self
.states
)
385 K
, NK
= _State(k
, kitems
), _State(k
+1, [])
390 rules
= self
.newrules
393 for item
in worklist
:
397 X
.complete
.append(rule
)
401 key
= (X
.stateno
, nextSym
)
402 if nextSym
not in rules
:
408 if nextSym
not in predicted
:
409 predicted
[nextSym
] = 1
410 for prule
in rules
[nextSym
]:
411 ppos
= self
.skip(prule
)
415 # Problem: we know K needs generating, but we
416 # don't yet know about NK. Can't commit anything
417 # regarding NK to self.edges until we're sure. Should
418 # we delay committing on both K and NK to avoid this
419 # hacky code? This creates other problems..
428 # Check for \epsilon-nonkernel's core. Unfortunately we
429 # need to know the entire set of predicted nonterminals
430 # to do this without accidentally duplicating states.
432 core
= sorted(predicted
.keys())
434 if tcore
in self
.cores
:
435 self
.edges
[(k
, None)] = self
.cores
[tcore
]
438 nk
= self
.cores
[tcore
] = self
.edges
[(k
, None)] = NK
.stateno
439 self
.edges
.update(edges
)
443 def goto(self
, state
, sym
):
445 if key
not in self
.edges
:
447 # No transitions from state on sym.
454 # Target state isn't generated yet. Remedy this.
456 rv
= self
.makeState(state
, sym
)
460 def gotoT(self
, state
, t
):
461 return [self
.goto(state
, t
)]
463 def gotoST(self
, state
, st
):
465 for t
in self
.states
[state
].T
:
467 rv
.append(self
.goto(state
, t
))
470 def add(self
, set, item
, i
=None, predecessor
=None, causal
=None):
471 if predecessor
is None:
479 self
.links
[key
].append((predecessor
, causal
))
481 def makeSet(self
, token
, sets
, i
):
482 cur
, next
= sets
[i
], sets
[i
+1]
484 ttype
= token
is not None and self
.typestring(token
) or None
485 if ttype
is not None:
486 fn
, arg
= self
.gotoT
, ttype
488 fn
, arg
= self
.gotoST
, token
496 self
.add(next
, (k
, parent
), i
+1, ptr
)
497 nk
= self
.goto(k
, None)
499 self
.add(next
, (nk
, i
+1))
504 for rule
in self
.states
[state
].complete
:
506 for pitem
in sets
[parent
]:
507 pstate
, pparent
= pitem
508 k
= self
.goto(pstate
, lhs
)
510 why
= (item
, i
, rule
)
511 pptr
= (pitem
, parent
)
512 self
.add(cur
, (k
, pparent
),
514 nk
= self
.goto(k
, None)
516 self
.add(cur
, (nk
, i
))
518 def makeSet_fast(self
, token
, sets
, i
):
520 # Call *only* when the entire state machine has been built!
521 # It relies on self.edges being filled in completely, and
522 # then duplicates and inlines code to boost speed at the
523 # cost of extreme ugliness.
525 cur
, next
= sets
[i
], sets
[i
+1]
526 ttype
= token
is not None and self
.typestring(token
) or None
531 if ttype
is not None:
532 k
= self
.edges
.get((state
, ttype
), None)
534 #self.add(next, (k, parent), i+1, ptr)
541 self
.links
[key
].append((ptr
, None))
543 #nk = self.goto(k, None)
544 nk
= self
.edges
.get((k
, None), None)
546 #self.add(next, (nk, i+1))
553 add
= self
.gotoST(state
, token
)
556 self
.add(next
, (k
, parent
), i
+1, ptr
)
557 #nk = self.goto(k, None)
558 nk
= self
.edges
.get((k
, None), None)
560 self
.add(next
, (nk
, i
+1))
565 for rule
in self
.states
[state
].complete
:
567 for pitem
in sets
[parent
]:
568 pstate
, pparent
= pitem
569 #k = self.goto(pstate, lhs)
570 k
= self
.edges
.get((pstate
, lhs
), None)
572 why
= (item
, i
, rule
)
573 pptr
= (pitem
, parent
)
574 #self.add(cur, (k, pparent),
582 self
.links
[key
].append((pptr
, why
))
584 #nk = self.goto(k, None)
585 nk
= self
.edges
.get((k
, None), None)
587 #self.add(cur, (nk, i))
594 def predecessor(self
, key
, causal
):
595 for p
, c
in self
.links
[key
]:
600 def causal(self
, key
):
601 links
= self
.links
[key
]
610 return rule2cause
[self
.ambiguity(choices
)]
612 def deriveEpsilon(self
, nt
):
613 if len(self
.newrules
[nt
]) > 1:
614 rule
= self
.ambiguity(self
.newrules
[nt
])
616 rule
= self
.newrules
[nt
][0]
620 attr
= [None] * len(rhs
)
622 for i
in range(len(rhs
)-1, -1, -1):
623 attr
[i
] = self
.deriveEpsilon(rhs
[i
])
624 return self
.rule2func
[self
.new2old
[rule
]](attr
)
626 def buildTree(self
, nt
, item
, tokens
, k
):
630 for rule
in self
.states
[state
].complete
:
635 rule
= self
.ambiguity(choices
)
639 attr
= [None] * len(rhs
)
641 for i
in range(len(rhs
)-1, -1, -1):
643 if sym
not in self
.newrules
:
645 attr
[i
] = tokens
[k
-1]
647 item
, k
= self
.predecessor(key
, None)
648 #elif self.isnullable(sym):
649 elif self
._NULLABLE
== sym
[0:len(self
._NULLABLE
)]:
650 attr
[i
] = self
.deriveEpsilon(sym
)
653 why
= self
.causal(key
)
654 attr
[i
] = self
.buildTree(sym
, why
[0],
656 item
, k
= self
.predecessor(key
, why
)
657 return self
.rule2func
[self
.new2old
[rule
]](attr
)
659 def ambiguity(self
, rules
):
661 # XXX - problem here and in collectRules() if the same rule
662 # appears in >1 method. Also undefined results if rules
663 # causing the ambiguity appear in the same method.
667 for i
in range(len(rules
)):
668 lhs
, rhs
= rule
= rules
[i
]
669 name
= self
.rule2name
[self
.new2old
[rule
]]
670 sortlist
.append((len(rhs
), name
))
673 list = [b
for a
, b
in sortlist
]
674 return rules
[name2index
[self
.resolve(list)]]
676 def resolve(self
, list):
678 # Resolve ambiguity in favor of the shortest RHS.
679 # Since we walk the tree from the top down, this
680 # should effectively resolve in favor of a "shift".
685 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree
686 # for a given input. The extra argument is a class (not an instance!)
687 # which supports the "__setslice__" and "__len__" methods.
689 # XXX - silently overrides any user code in methods.
692 class GenericASTBuilder(GenericParser
):
693 def __init__(self
, AST
, start
):
694 GenericParser
.__init
__(self
, start
)
697 def preprocess(self
, rule
, func
):
698 rebind
= lambda lhs
, self
=self
: \
699 lambda args
, lhs
=lhs
, self
=self
: \
700 self
.buildASTNode(args
, lhs
)
702 return rule
, rebind(lhs
)
704 def buildASTNode(self
, args
, lhs
):
707 if isinstance(arg
, self
.AST
):
710 children
.append(self
.terminal(arg
))
711 return self
.nonterminal(lhs
, children
)
713 def terminal(self
, token
): return token
715 def nonterminal(self
, type, args
):
717 rv
[:len(args
)] = args
721 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For
722 # each node it attempts to invoke the method n_<node type>, falling
723 # back onto the default() method if the n_* can't be found. The preorder
724 # traversal also looks for an exit hook named n_<node type>_exit (no default
725 # routine is called if it's not found). To prematurely halt traversal
726 # of a subtree, call the prune() method -- this only makes sense for a
727 # preorder traversal. Node type is determined via the typestring() method.
730 class GenericASTTraversalPruningException
:
733 class GenericASTTraversal
:
734 def __init__(self
, ast
):
737 def typestring(self
, node
):
741 raise GenericASTTraversalPruningException
743 def preorder(self
, node
=None):
748 name
= 'n_' + self
.typestring(node
)
749 if hasattr(self
, name
):
750 func
= getattr(self
, name
)
754 except GenericASTTraversalPruningException
:
760 name
= name
+ '_exit'
761 if hasattr(self
, name
):
762 func
= getattr(self
, name
)
765 def postorder(self
, node
=None):
772 name
= 'n_' + self
.typestring(node
)
773 if hasattr(self
, name
):
774 func
= getattr(self
, name
)
780 def default(self
, node
):
784 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
787 # XXX - makes assumptions about how GenericParser walks the parse tree.
790 class GenericASTMatcher(GenericParser
):
791 def __init__(self
, start
, ast
):
792 GenericParser
.__init
__(self
, start
)
795 def preprocess(self
, rule
, func
):
796 rebind
= lambda func
, self
=self
: \
797 lambda args
, func
=func
, self
=self
: \
798 self
.foundMatch(args
, func
)
803 return (lhs
, tuple(rhslist
)), rebind(func
)
805 def foundMatch(self
, args
, func
):
809 def match_r(self
, node
):
810 self
.input.insert(0, node
)
815 self
.input.insert(0, '(')
816 children
= children
+ 1
820 self
.input.insert(0, ')')
822 def match(self
, ast
=None):
828 self
.parse(self
.input)
830 def resolve(self
, list):
832 # Resolve ambiguity in favor of the longest RHS.
836 def _dump(tokens
, sets
, states
):
837 for i
in range(len(sets
)):
841 for (lhs
, rhs
), pos
in states
[item
[0]].items
:
842 output('\t\t', lhs
, '::=', end
='')
843 output(' '.join(rhs
[:pos
]), end
='')
845 output(' '.join(rhs
[pos
:]))
848 output('token %s' % str(tokens
[i
]))