1 # Copyright (c) 1998-2002 John Aycock
3 # Permission is hereby granted, free of charge, to any person obtaining
4 # a copy of this software and associated documentation files (the
5 # "Software"), to deal in the Software without restriction, including
6 # without limitation the rights to use, copy, modify, merge, publish,
7 # distribute, sublicense, and/or sell copies of the Software, and to
8 # permit persons to whom the Software is furnished to do so, subject to
9 # the following conditions:
11 # The above copyright notice and this permission notice shall be
12 # included in all copies or substantial portions of the Software.
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Public version string for the SPARK toolkit.
__version__ = 'SPARK-0.7 (pre-alpha-5)'
27 def _namelist(instance
):
28 namelist
, namedict
, classlist
= [], {}, [instance
.__class
__]
32 for name
in c
.__dict
__.keys():
33 if not namedict
.has_key(name
):
39 def __init__(self
, flags
=0):
40 pattern
= self
.reflect()
41 self
.re
= re
.compile(pattern
, re
.VERBOSE|flags
)
44 for name
, number
in self
.re
.groupindex
.items():
45 self
.index2func
[number
-1] = getattr(self
, 't_' + name
)
47 def makeRE(self
, name
):
48 doc
= getattr(self
, name
).__doc
__
49 rv
= '(?P<%s>%s)' % (name
[2:], doc
)
54 for name
in _namelist(self
):
55 if name
[:2] == 't_' and name
!= 't_default':
56 rv
.append(self
.makeRE(name
))
58 rv
.append(self
.makeRE('t_default'))
59 return string
.join(rv
, '|')
61 def error(self
, s
, pos
):
62 print "Lexical error at position %s" % pos
65 def tokenize(self
, s
):
69 m
= self
.re
.match(s
, pos
)
74 for i
in range(len(groups
)):
75 if groups
[i
] and self
.index2func
.has_key(i
):
76 self
.index2func
[i
](groups
[i
])
79 def t_default(self
, s
):
81 print "Specification error: unmatched input"
85 # Extracted from GenericParser and made global so that [un]pickling works.
def __init__(self, stateno, items):
    """Create a parser state numbered *stateno* holding the dotted *items*.

    T collects the terminal symbols seen on transitions out of this
    state; complete collects the rules finished in this state.  Both
    start empty and are filled in by the state-machine builder.
    """
    self.T = []
    self.complete = []
    self.items = items
    self.stateno = stateno
94 # An Earley parser, as per J. Earley, "An Efficient Context-Free
95 # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
96 # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
97 # Carnegie-Mellon University, August 1968. New formulation of
98 # the parser according to J. Aycock, "Practical Earley Parsing
99 # and the SPARK Toolkit", Ph.D. thesis, University of Victoria,
100 # 2001, and J. Aycock and R. N. Horspool, "Practical Earley
101 # Parsing", unpublished paper, 2001.
104 def __init__(self
, start
):
110 self
.ruleschanged
= 1
117 # When pickling, take the time to generate the full state machine;
118 # some information is then extraneous, too. Unfortunately we
119 # can't save the rule2func map.
121 def __getstate__(self
):
122 if self
.ruleschanged
:
124 # XXX - duplicated from parse()
130 self
.ruleschanged
= 0
131 self
.edges
, self
.cores
= {}, {}
132 self
.states
= { 0: self
.makeState0() }
133 self
.makeState(0, self
._BOF
)
135 # XXX - should find a better way to do this..
140 for k
, v
in self
.edges
.items():
143 if self
.states
.has_key(state
):
144 self
.goto(state
, sym
)
146 rv
= self
.__dict
__.copy()
147 for s
in self
.states
.values():
154 def __setstate__(self
, D
):
159 start
= D
['rules'][self
._START
][0][1][1] # Blech.
161 D
['rule2func'] = self
.rule2func
162 D
['makeSet'] = self
.makeSet_fast
166 # A hook for GenericASTBuilder and GenericASTMatcher. Mess
167 # thee not with this; nor shall thee toucheth the _preprocess
168 # argument to addRule.
def preprocess(self, rule, func):
    """Default rule hook: hand back (rule, func) untouched.

    Subclasses (GenericASTBuilder, GenericASTMatcher) override this
    to rewrite rules and rebind their action functions.
    """
    return rule, func
172 def addRule(self
, doc
, func
, _preprocess
=1):
174 rules
= string
.split(doc
)
177 for i
in range(len(rules
)):
178 if rules
[i
] == '::=':
180 index
.append(len(rules
))
182 for i
in range(len(index
)-1):
183 lhs
= rules
[index
[i
]]
184 rhs
= rules
[index
[i
]+2:index
[i
+1]]
185 rule
= (lhs
, tuple(rhs
))
188 rule
, fn
= self
.preprocess(rule
, func
)
190 if self
.rules
.has_key(lhs
):
191 self
.rules
[lhs
].append(rule
)
193 self
.rules
[lhs
] = [ rule
]
194 self
.rule2func
[rule
] = fn
195 self
.rule2name
[rule
] = func
.__name
__[2:]
196 self
.ruleschanged
= 1
198 def collectRules(self
):
199 for name
in _namelist(self
):
201 func
= getattr(self
, name
)
203 self
.addRule(doc
, func
)
def augment(self, start):
    """Augment the grammar with a wrapper rule START ::= BOF <start>.

    The action for the wrapper simply passes through the attribute of
    the user's start symbol (args[1]); the rule is registered without
    preprocessing (third argument 0).
    """
    wrapper = '%s ::= %s %s' % (self._START, self._BOF, start)
    passthrough = lambda args: args[1]
    self.addRule(wrapper, passthrough, 0)
209 def computeNull(self
):
213 for rulelist
in self
.rules
.values():
215 self
.nullable
[lhs
] = 0
216 for rule
in rulelist
:
219 self
.nullable
[lhs
] = 1
222 # We only need to consider rules which
223 # consist entirely of nonterminal symbols.
224 # This should be a savings on typical
228 if not self
.rules
.has_key(sym
):
236 if self
.nullable
[lhs
]:
239 if not self
.nullable
[sym
]:
242 self
.nullable
[lhs
] = 1
245 def makeState0(self
):
247 for rule
in self
.newrules
[self
._START
]:
248 s0
.items
.append((rule
, 0))
251 def finalState(self
, tokens
):
255 if len(self
.newrules
[self
._START
]) == 2 and len(tokens
) == 0:
257 start
= self
.rules
[self
._START
][0][1][1]
258 return self
.goto(1, start
)
260 def makeNewRules(self
):
262 for rulelist
in self
.rules
.values():
263 for rule
in rulelist
:
264 worklist
.append((rule
, 0, 1, rule
))
266 for rule
, i
, candidate
, oldrule
in worklist
:
271 if not self
.rules
.has_key(sym
) or \
272 not self
.nullable
[sym
]:
278 newrhs
[i
] = self
._NULLABLE
+sym
279 newrule
= (lhs
, tuple(newrhs
))
280 worklist
.append((newrule
, i
+1,
286 lhs
= self
._NULLABLE
+lhs
288 if self
.newrules
.has_key(lhs
):
289 self
.newrules
[lhs
].append(rule
)
291 self
.newrules
[lhs
] = [ rule
]
292 self
.new2old
[rule
] = oldrule
294 def typestring(self
, token
):
297 def error(self
, token
):
298 print "Syntax error at or near `%s' token" % token
301 def parse(self
, tokens
):
302 sets
= [ [(1,0), (2,0)] ]
305 if self
.ruleschanged
:
310 self
.ruleschanged
= 0
311 self
.edges
, self
.cores
= {}, {}
312 self
.states
= { 0: self
.makeState0() }
313 self
.makeState(0, self
._BOF
)
315 for i
in xrange(len(tokens
)):
320 self
.makeSet(tokens
[i
], sets
, i
)
323 self
.makeSet(None, sets
, len(tokens
))
325 #_dump(tokens, sets, self.states)
327 finalitem
= (self
.finalState(tokens
), 0)
328 if finalitem
not in sets
[-2]:
330 self
.error(tokens
[i
-1])
334 return self
.buildTree(self
._START
, finalitem
,
def isnullable(self, sym):
    """Return whether *sym* is a nullable-marked symbol of G_e.

    Nullable symbols carry the _NULLABLE prefix; this is a manual
    prefix test (sym.startswith() would do, but the original code
    kept Python 1.5 compatibility, so the slice form is preserved).
    """
    marker = self._NULLABLE
    return marker == sym[:len(marker)]
344 def skip(self
, (lhs
, rhs
), pos
=0):
347 if not self
.isnullable(rhs
[pos
]):
352 def makeState(self
, state
, sym
):
353 assert sym
is not None
355 # Compute \epsilon-kernel state's core and see if
359 for rule
, pos
in self
.states
[state
].items
:
361 if rhs
[pos
:pos
+1] == (sym
,):
362 kitems
.append((rule
, self
.skip(rule
, pos
+1)))
367 if self
.cores
.has_key(tcore
):
368 return self
.cores
[tcore
]
370 # Nope, doesn't exist. Compute it and the associated
371 # \epsilon-nonkernel state together; we'll need it right away.
373 k
= self
.cores
[tcore
] = len(self
.states
)
374 K
, NK
= _State(k
, kitems
), _State(k
+1, [])
379 rules
= self
.newrules
382 for item
in worklist
:
386 X
.complete
.append(rule
)
390 key
= (X
.stateno
, nextSym
)
391 if not rules
.has_key(nextSym
):
392 if not edges
.has_key(key
):
397 if not predicted
.has_key(nextSym
):
398 predicted
[nextSym
] = 1
399 for prule
in rules
[nextSym
]:
400 ppos
= self
.skip(prule
)
404 # Problem: we know K needs generating, but we
405 # don't yet know about NK. Can't commit anything
406 # regarding NK to self.edges until we're sure. Should
407 # we delay committing on both K and NK to avoid this
408 # hacky code? This creates other problems..
417 # Check for \epsilon-nonkernel's core. Unfortunately we
418 # need to know the entire set of predicted nonterminals
419 # to do this without accidentally duplicating states.
421 core
= predicted
.keys()
424 if self
.cores
.has_key(tcore
):
425 self
.edges
[(k
, None)] = self
.cores
[tcore
]
428 nk
= self
.cores
[tcore
] = self
.edges
[(k
, None)] = NK
.stateno
429 self
.edges
.update(edges
)
433 def goto(self
, state
, sym
):
435 if not self
.edges
.has_key(key
):
437 # No transitions from state on sym.
444 # Target state isn't generated yet. Remedy this.
446 rv
= self
.makeState(state
, sym
)
def gotoT(self, state, t):
    """Transition on terminal *t*: a single goto() target, as a list.

    Returned as a one-element list so callers can treat terminal and
    nonterminal transitions uniformly.
    """
    target = self.goto(state, t)
    return [target]
453 def gotoST(self
, state
, st
):
455 for t
in self
.states
[state
].T
:
457 rv
.append(self
.goto(state
, t
))
460 def add(self
, set, item
, i
=None, predecessor
=None, causal
=None):
461 if predecessor
is None:
469 self
.links
[key
].append((predecessor
, causal
))
471 def makeSet(self
, token
, sets
, i
):
472 cur
, next
= sets
[i
], sets
[i
+1]
474 ttype
= token
is not None and self
.typestring(token
) or None
475 if ttype
is not None:
476 fn
, arg
= self
.gotoT
, ttype
478 fn
, arg
= self
.gotoST
, token
486 self
.add(next
, (k
, parent
), i
+1, ptr
)
487 nk
= self
.goto(k
, None)
489 self
.add(next
, (nk
, i
+1))
494 for rule
in self
.states
[state
].complete
:
496 for pitem
in sets
[parent
]:
497 pstate
, pparent
= pitem
498 k
= self
.goto(pstate
, lhs
)
500 why
= (item
, i
, rule
)
501 pptr
= (pitem
, parent
)
502 self
.add(cur
, (k
, pparent
),
504 nk
= self
.goto(k
, None)
506 self
.add(cur
, (nk
, i
))
508 def makeSet_fast(self
, token
, sets
, i
):
510 # Call *only* when the entire state machine has been built!
511 # It relies on self.edges being filled in completely, and
512 # then duplicates and inlines code to boost speed at the
513 # cost of extreme ugliness.
515 cur
, next
= sets
[i
], sets
[i
+1]
516 ttype
= token
is not None and self
.typestring(token
) or None
521 if ttype
is not None:
522 k
= self
.edges
.get((state
, ttype
), None)
524 #self.add(next, (k, parent), i+1, ptr)
531 self
.links
[key
].append((ptr
, None))
533 #nk = self.goto(k, None)
534 nk
= self
.edges
.get((k
, None), None)
536 #self.add(next, (nk, i+1))
543 add
= self
.gotoST(state
, token
)
546 self
.add(next
, (k
, parent
), i
+1, ptr
)
547 #nk = self.goto(k, None)
548 nk
= self
.edges
.get((k
, None), None)
550 self
.add(next
, (nk
, i
+1))
555 for rule
in self
.states
[state
].complete
:
557 for pitem
in sets
[parent
]:
558 pstate
, pparent
= pitem
559 #k = self.goto(pstate, lhs)
560 k
= self
.edges
.get((pstate
, lhs
), None)
562 why
= (item
, i
, rule
)
563 pptr
= (pitem
, parent
)
564 #self.add(cur, (k, pparent),
572 self
.links
[key
].append((pptr
, why
))
574 #nk = self.goto(k, None)
575 nk
= self
.edges
.get((k
, None), None)
577 #self.add(cur, (nk, i))
584 def predecessor(self
, key
, causal
):
585 for p
, c
in self
.links
[key
]:
590 def causal(self
, key
):
591 links
= self
.links
[key
]
600 return rule2cause
[self
.ambiguity(choices
)]
602 def deriveEpsilon(self
, nt
):
603 if len(self
.newrules
[nt
]) > 1:
604 rule
= self
.ambiguity(self
.newrules
[nt
])
606 rule
= self
.newrules
[nt
][0]
610 attr
= [None] * len(rhs
)
612 for i
in range(len(rhs
)-1, -1, -1):
613 attr
[i
] = self
.deriveEpsilon(rhs
[i
])
614 return self
.rule2func
[self
.new2old
[rule
]](attr
)
616 def buildTree(self
, nt
, item
, tokens
, k
):
620 for rule
in self
.states
[state
].complete
:
625 rule
= self
.ambiguity(choices
)
629 attr
= [None] * len(rhs
)
631 for i
in range(len(rhs
)-1, -1, -1):
633 if not self
.newrules
.has_key(sym
):
635 attr
[i
] = tokens
[k
-1]
637 item
, k
= self
.predecessor(key
, None)
638 #elif self.isnullable(sym):
639 elif self
._NULLABLE
== sym
[0:len(self
._NULLABLE
)]:
640 attr
[i
] = self
.deriveEpsilon(sym
)
643 why
= self
.causal(key
)
644 attr
[i
] = self
.buildTree(sym
, why
[0],
646 item
, k
= self
.predecessor(key
, why
)
647 return self
.rule2func
[self
.new2old
[rule
]](attr
)
649 def ambiguity(self
, rules
):
651 # XXX - problem here and in collectRules() if the same rule
652 # appears in >1 method. Also undefined results if rules
653 # causing the ambiguity appear in the same method.
657 for i
in range(len(rules
)):
658 lhs
, rhs
= rule
= rules
[i
]
659 name
= self
.rule2name
[self
.new2old
[rule
]]
660 sortlist
.append((len(rhs
), name
))
663 list = map(lambda (a
,b
): b
, sortlist
)
664 return rules
[name2index
[self
.resolve(list)]]
666 def resolve(self
, list):
668 # Resolve ambiguity in favor of the shortest RHS.
669 # Since we walk the tree from the top down, this
670 # should effectively resolve in favor of a "shift".
675 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree
676 # for a given input. The extra argument is a class (not an instance!)
677 # which supports the "__setslice__" and "__len__" methods.
679 # XXX - silently overrides any user code in methods.
682 class GenericASTBuilder(GenericParser
):
683 def __init__(self
, AST
, start
):
684 GenericParser
.__init
__(self
, start
)
687 def preprocess(self
, rule
, func
):
688 rebind
= lambda lhs
, self
=self
: \
689 lambda args
, lhs
=lhs
, self
=self
: \
690 self
.buildASTNode(args
, lhs
)
692 return rule
, rebind(lhs
)
694 def buildASTNode(self
, args
, lhs
):
697 if isinstance(arg
, self
.AST
):
700 children
.append(self
.terminal(arg
))
701 return self
.nonterminal(lhs
, children
)
def terminal(self, token):
    """Default terminal hook: pass the token through unchanged."""
    return token
705 def nonterminal(self
, type, args
):
707 rv
[:len(args
)] = args
711 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For
712 # each node it attempts to invoke the method n_<node type>, falling
713 # back onto the default() method if the n_* can't be found. The preorder
714 # traversal also looks for an exit hook named n_<node type>_exit (no default
715 # routine is called if it's not found). To prematurely halt traversal
716 # of a subtree, call the prune() method -- this only makes sense for a
717 # preorder traversal. Node type is determined via the typestring() method.
720 class GenericASTTraversalPruningException
:
723 class GenericASTTraversal
:
724 def __init__(self
, ast
):
727 def typestring(self
, node
):
731 raise GenericASTTraversalPruningException
733 def preorder(self
, node
=None):
738 name
= 'n_' + self
.typestring(node
)
739 if hasattr(self
, name
):
740 func
= getattr(self
, name
)
744 except GenericASTTraversalPruningException
:
750 name
= name
+ '_exit'
751 if hasattr(self
, name
):
752 func
= getattr(self
, name
)
755 def postorder(self
, node
=None):
762 name
= 'n_' + self
.typestring(node
)
763 if hasattr(self
, name
):
764 func
= getattr(self
, name
)
770 def default(self
, node
):
774 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
777 # XXX - makes assumptions about how GenericParser walks the parse tree.
780 class GenericASTMatcher(GenericParser
):
781 def __init__(self
, start
, ast
):
782 GenericParser
.__init
__(self
, start
)
785 def preprocess(self
, rule
, func
):
786 rebind
= lambda func
, self
=self
: \
787 lambda args
, func
=func
, self
=self
: \
788 self
.foundMatch(args
, func
)
793 return (lhs
, tuple(rhslist
)), rebind(func
)
795 def foundMatch(self
, args
, func
):
799 def match_r(self
, node
):
800 self
.input.insert(0, node
)
805 self
.input.insert(0, '(')
806 children
= children
+ 1
810 self
.input.insert(0, ')')
812 def match(self
, ast
=None):
818 self
.parse(self
.input)
820 def resolve(self
, list):
822 # Resolve ambiguity in favor of the longest RHS.
826 def _dump(tokens
, sets
, states
):
827 for i
in range(len(sets
)):
831 for (lhs
, rhs
), pos
in states
[item
[0]].items
:
832 print '\t\t', lhs
, '::=',
833 print string
.join(rhs
[:pos
]),
835 print string
.join(rhs
[pos
:])
838 print 'token', str(tokens
[i
])