1 # Copyright (c) 1998-2002 John Aycock
3 # Permission is hereby granted, free of charge, to any person obtaining
4 # a copy of this software and associated documentation files (the
5 # "Software"), to deal in the Software without restriction, including
6 # without limitation the rights to use, copy, modify, merge, publish,
7 # distribute, sublicense, and/or sell copies of the Software, and to
8 # permit persons to whom the Software is furnished to do so, subject to
9 # the following conditions:
11 # The above copyright notice and this permission notice shall be
12 # included in all copies or substantial portions of the Software.
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 __version__
= 'SPARK-0.7 (pre-alpha-5)'
28 def _namelist(instance
):
29 namelist
, namedict
, classlist
= [], {}, [instance
.__class
__]
33 for name
in c
.__dict
__.keys():
34 if not namedict
.has_key(name
):
40 def __init__(self
, flags
=0):
41 pattern
= self
.reflect()
42 self
.re
= re
.compile(pattern
, re
.VERBOSE|flags
)
45 for name
, number
in self
.re
.groupindex
.items():
46 self
.index2func
[number
-1] = getattr(self
, 't_' + name
)
48 def makeRE(self
, name
):
49 doc
= getattr(self
, name
).__doc
__
50 rv
= '(?P<%s>%s)' % (name
[2:], doc
)
55 for name
in _namelist(self
):
56 if name
[:2] == 't_' and name
!= 't_default':
57 rv
.append(self
.makeRE(name
))
59 rv
.append(self
.makeRE('t_default'))
60 return string
.join(rv
, '|')
62 def error(self
, s
, pos
):
63 print "Lexical error at position %s" % pos
66 def tokenize(self
, s
):
70 m
= self
.re
.match(s
, pos
)
75 for i
in range(len(groups
)):
76 if groups
[i
] and self
.index2func
.has_key(i
):
77 self
.index2func
[i
](groups
[i
])
80 def t_default(self
, s
):
82 print "Specification error: unmatched input"
86 # Extracted from GenericParser and made global so that [un]picking works.
def __init__(self, stateno, items):
    """Create a parser state: its number and its dotted (rule, pos) items.

    T (terminal transitions) and complete (finished rules) start empty;
    they are populated later when the state machine is built.
    """
    self.stateno = stateno
    self.items = items
    self.T = []
    self.complete = []
95 # An Earley parser, as per J. Earley, "An Efficient Context-Free
96 # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
97 # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
98 # Carnegie-Mellon University, August 1968. New formulation of
99 # the parser according to J. Aycock, "Practical Earley Parsing
100 # and the SPARK Toolkit", Ph.D. thesis, University of Victoria,
101 # 2001, and J. Aycock and R. N. Horspool, "Practical Earley
102 # Parsing", unpublished paper, 2001.
105 def __init__(self
, start
):
111 self
.ruleschanged
= 1
118 # When pickling, take the time to generate the full state machine;
119 # some information is then extraneous, too. Unfortunately we
120 # can't save the rule2func map.
122 def __getstate__(self
):
123 if self
.ruleschanged
:
125 # XXX - duplicated from parse()
131 self
.ruleschanged
= 0
132 self
.edges
, self
.cores
= {}, {}
133 self
.states
= { 0: self
.makeState0() }
134 self
.makeState(0, self
._BOF
)
136 # XXX - should find a better way to do this..
141 for k
, v
in self
.edges
.items():
144 if self
.states
.has_key(state
):
145 self
.goto(state
, sym
)
147 rv
= self
.__dict
__.copy()
148 for s
in self
.states
.values():
155 def __setstate__(self
, D
):
160 start
= D
['rules'][self
._START
][0][1][1] # Blech.
162 D
['rule2func'] = self
.rule2func
163 D
['makeSet'] = self
.makeSet_fast
167 # A hook for GenericASTBuilder and GenericASTMatcher. Mess
168 # thee not with this; nor shall thee toucheth the _preprocess
169 # argument to addRule.
def preprocess(self, rule, func):
    """Subclass hook for rewriting (rule, action) pairs before they are
    installed; the default is the identity transformation."""
    return rule, func
173 def addRule(self
, doc
, func
, _preprocess
=1):
175 rules
= string
.split(doc
)
178 for i
in range(len(rules
)):
179 if rules
[i
] == '::=':
181 index
.append(len(rules
))
183 for i
in range(len(index
)-1):
184 lhs
= rules
[index
[i
]]
185 rhs
= rules
[index
[i
]+2:index
[i
+1]]
186 rule
= (lhs
, tuple(rhs
))
189 rule
, fn
= self
.preprocess(rule
, func
)
191 if self
.rules
.has_key(lhs
):
192 self
.rules
[lhs
].append(rule
)
194 self
.rules
[lhs
] = [ rule
]
195 self
.rule2func
[rule
] = fn
196 self
.rule2name
[rule
] = func
.__name
__[2:]
197 self
.ruleschanged
= 1
199 def collectRules(self
):
200 for name
in _namelist(self
):
202 func
= getattr(self
, name
)
204 self
.addRule(doc
, func
)
def augment(self, start):
    """Augment the grammar with a synthetic start production.

    Installs _START ::= _BOF <start>; its semantic action simply passes
    through the value of the user's start symbol (args[1]).  The final 0
    suppresses the preprocess() hook for this internal rule.
    """
    production = '%s ::= %s %s' % (self._START, self._BOF, start)
    passthrough = lambda args: args[1]
    self.addRule(production, passthrough, 0)
210 def computeNull(self
):
214 for rulelist
in self
.rules
.values():
216 self
.nullable
[lhs
] = 0
217 for rule
in rulelist
:
220 self
.nullable
[lhs
] = 1
223 # We only need to consider rules which
224 # consist entirely of nonterminal symbols.
225 # This should be a savings on typical
229 if not self
.rules
.has_key(sym
):
237 if self
.nullable
[lhs
]:
240 if not self
.nullable
[sym
]:
243 self
.nullable
[lhs
] = 1
246 def makeState0(self
):
248 for rule
in self
.newrules
[self
._START
]:
249 s0
.items
.append((rule
, 0))
252 def finalState(self
, tokens
):
256 if len(self
.newrules
[self
._START
]) == 2 and len(tokens
) == 0:
258 start
= self
.rules
[self
._START
][0][1][1]
259 return self
.goto(1, start
)
261 def makeNewRules(self
):
263 for rulelist
in self
.rules
.values():
264 for rule
in rulelist
:
265 worklist
.append((rule
, 0, 1, rule
))
267 for rule
, i
, candidate
, oldrule
in worklist
:
272 if not self
.rules
.has_key(sym
) or \
273 not self
.nullable
[sym
]:
279 newrhs
[i
] = self
._NULLABLE
+sym
280 newrule
= (lhs
, tuple(newrhs
))
281 worklist
.append((newrule
, i
+1,
287 lhs
= self
._NULLABLE
+lhs
289 if self
.newrules
.has_key(lhs
):
290 self
.newrules
[lhs
].append(rule
)
292 self
.newrules
[lhs
] = [ rule
]
293 self
.new2old
[rule
] = oldrule
295 def typestring(self
, token
):
298 def error(self
, token
):
299 print "Syntax error at or near `%s' token" % token
302 def parse(self
, tokens
):
303 sets
= [ [(1,0), (2,0)] ]
306 if self
.ruleschanged
:
311 self
.ruleschanged
= 0
312 self
.edges
, self
.cores
= {}, {}
313 self
.states
= { 0: self
.makeState0() }
314 self
.makeState(0, self
._BOF
)
316 for i
in xrange(len(tokens
)):
321 self
.makeSet(tokens
[i
], sets
, i
)
324 self
.makeSet(None, sets
, len(tokens
))
326 #_dump(tokens, sets, self.states)
328 finalitem
= (self
.finalState(tokens
), 0)
329 if finalitem
not in sets
[-2]:
331 self
.error(tokens
[i
-1])
335 return self
.buildTree(self
._START
, finalitem
,
def isnullable(self, sym):
    """Return true iff sym carries the nullable marker (symbols of G_e only).

    Written as a slice comparison instead of sym.startswith() because
    this module still supports Python 1.5.
    """
    marker = self._NULLABLE
    return marker == sym[0:len(marker)]
345 def skip(self
, (lhs
, rhs
), pos
=0):
348 if not self
.isnullable(rhs
[pos
]):
353 def makeState(self
, state
, sym
):
354 assert sym
is not None
356 # Compute \epsilon-kernel state's core and see if
360 for rule
, pos
in self
.states
[state
].items
:
362 if rhs
[pos
:pos
+1] == (sym
,):
363 kitems
.append((rule
, self
.skip(rule
, pos
+1)))
368 if self
.cores
.has_key(tcore
):
369 return self
.cores
[tcore
]
371 # Nope, doesn't exist. Compute it and the associated
372 # \epsilon-nonkernel state together; we'll need it right away.
374 k
= self
.cores
[tcore
] = len(self
.states
)
375 K
, NK
= _State(k
, kitems
), _State(k
+1, [])
380 rules
= self
.newrules
383 for item
in worklist
:
387 X
.complete
.append(rule
)
391 key
= (X
.stateno
, nextSym
)
392 if not rules
.has_key(nextSym
):
393 if not edges
.has_key(key
):
398 if not predicted
.has_key(nextSym
):
399 predicted
[nextSym
] = 1
400 for prule
in rules
[nextSym
]:
401 ppos
= self
.skip(prule
)
405 # Problem: we know K needs generating, but we
406 # don't yet know about NK. Can't commit anything
407 # regarding NK to self.edges until we're sure. Should
408 # we delay committing on both K and NK to avoid this
409 # hacky code? This creates other problems..
418 # Check for \epsilon-nonkernel's core. Unfortunately we
419 # need to know the entire set of predicted nonterminals
420 # to do this without accidentally duplicating states.
422 core
= predicted
.keys()
425 if self
.cores
.has_key(tcore
):
426 self
.edges
[(k
, None)] = self
.cores
[tcore
]
429 nk
= self
.cores
[tcore
] = self
.edges
[(k
, None)] = NK
.stateno
430 self
.edges
.update(edges
)
434 def goto(self
, state
, sym
):
436 if not self
.edges
.has_key(key
):
438 # No transitions from state on sym.
445 # Target state isn't generated yet. Remedy this.
447 rv
= self
.makeState(state
, sym
)
def gotoT(self, state, t):
    """Transition from state on terminal t, wrapped in a one-element
    list so callers can treat terminal and nonterminal gotos uniformly."""
    target = self.goto(state, t)
    return [target]
454 def gotoST(self
, state
, st
):
456 for t
in self
.states
[state
].T
:
458 rv
.append(self
.goto(state
, t
))
461 def add(self
, set, item
, i
=None, predecessor
=None, causal
=None):
462 if predecessor
is None:
470 self
.links
[key
].append((predecessor
, causal
))
472 def makeSet(self
, token
, sets
, i
):
473 cur
, next
= sets
[i
], sets
[i
+1]
475 ttype
= token
is not None and self
.typestring(token
) or None
476 if ttype
is not None:
477 fn
, arg
= self
.gotoT
, ttype
479 fn
, arg
= self
.gotoST
, token
487 self
.add(next
, (k
, parent
), i
+1, ptr
)
488 nk
= self
.goto(k
, None)
490 self
.add(next
, (nk
, i
+1))
495 for rule
in self
.states
[state
].complete
:
497 for pitem
in sets
[parent
]:
498 pstate
, pparent
= pitem
499 k
= self
.goto(pstate
, lhs
)
501 why
= (item
, i
, rule
)
502 pptr
= (pitem
, parent
)
503 self
.add(cur
, (k
, pparent
),
505 nk
= self
.goto(k
, None)
507 self
.add(cur
, (nk
, i
))
509 def makeSet_fast(self
, token
, sets
, i
):
511 # Call *only* when the entire state machine has been built!
512 # It relies on self.edges being filled in completely, and
513 # then duplicates and inlines code to boost speed at the
514 # cost of extreme ugliness.
516 cur
, next
= sets
[i
], sets
[i
+1]
517 ttype
= token
is not None and self
.typestring(token
) or None
522 if ttype
is not None:
523 k
= self
.edges
.get((state
, ttype
), None)
525 #self.add(next, (k, parent), i+1, ptr)
532 self
.links
[key
].append((ptr
, None))
534 #nk = self.goto(k, None)
535 nk
= self
.edges
.get((k
, None), None)
537 #self.add(next, (nk, i+1))
544 add
= self
.gotoST(state
, token
)
547 self
.add(next
, (k
, parent
), i
+1, ptr
)
548 #nk = self.goto(k, None)
549 nk
= self
.edges
.get((k
, None), None)
551 self
.add(next
, (nk
, i
+1))
556 for rule
in self
.states
[state
].complete
:
558 for pitem
in sets
[parent
]:
559 pstate
, pparent
= pitem
560 #k = self.goto(pstate, lhs)
561 k
= self
.edges
.get((pstate
, lhs
), None)
563 why
= (item
, i
, rule
)
564 pptr
= (pitem
, parent
)
565 #self.add(cur, (k, pparent),
573 self
.links
[key
].append((pptr
, why
))
575 #nk = self.goto(k, None)
576 nk
= self
.edges
.get((k
, None), None)
578 #self.add(cur, (nk, i))
585 def predecessor(self
, key
, causal
):
586 for p
, c
in self
.links
[key
]:
591 def causal(self
, key
):
592 links
= self
.links
[key
]
601 return rule2cause
[self
.ambiguity(choices
)]
603 def deriveEpsilon(self
, nt
):
604 if len(self
.newrules
[nt
]) > 1:
605 rule
= self
.ambiguity(self
.newrules
[nt
])
607 rule
= self
.newrules
[nt
][0]
611 attr
= [None] * len(rhs
)
613 for i
in range(len(rhs
)-1, -1, -1):
614 attr
[i
] = self
.deriveEpsilon(rhs
[i
])
615 return self
.rule2func
[self
.new2old
[rule
]](attr
)
617 def buildTree(self
, nt
, item
, tokens
, k
):
621 for rule
in self
.states
[state
].complete
:
626 rule
= self
.ambiguity(choices
)
630 attr
= [None] * len(rhs
)
632 for i
in range(len(rhs
)-1, -1, -1):
634 if not self
.newrules
.has_key(sym
):
636 attr
[i
] = tokens
[k
-1]
638 item
, k
= self
.predecessor(key
, None)
639 #elif self.isnullable(sym):
640 elif self
._NULLABLE
== sym
[0:len(self
._NULLABLE
)]:
641 attr
[i
] = self
.deriveEpsilon(sym
)
644 why
= self
.causal(key
)
645 attr
[i
] = self
.buildTree(sym
, why
[0],
647 item
, k
= self
.predecessor(key
, why
)
648 return self
.rule2func
[self
.new2old
[rule
]](attr
)
650 def ambiguity(self
, rules
):
652 # XXX - problem here and in collectRules() if the same rule
653 # appears in >1 method. Also undefined results if rules
654 # causing the ambiguity appear in the same method.
658 for i
in range(len(rules
)):
659 lhs
, rhs
= rule
= rules
[i
]
660 name
= self
.rule2name
[self
.new2old
[rule
]]
661 sortlist
.append((len(rhs
), name
))
664 list = map(lambda (a
,b
): b
, sortlist
)
665 return rules
[name2index
[self
.resolve(list)]]
667 def resolve(self
, list):
669 # Resolve ambiguity in favor of the shortest RHS.
670 # Since we walk the tree from the top down, this
671 # should effectively resolve in favor of a "shift".
676 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree
677 # for a given input. The extra argument is a class (not an instance!)
678 # which supports the "__setslice__" and "__len__" methods.
680 # XXX - silently overrides any user code in methods.
683 class GenericASTBuilder(GenericParser
):
684 def __init__(self
, AST
, start
):
685 GenericParser
.__init
__(self
, start
)
688 def preprocess(self
, rule
, func
):
689 rebind
= lambda lhs
, self
=self
: \
690 lambda args
, lhs
=lhs
, self
=self
: \
691 self
.buildASTNode(args
, lhs
)
693 return rule
, rebind(lhs
)
695 def buildASTNode(self
, args
, lhs
):
698 if isinstance(arg
, self
.AST
):
701 children
.append(self
.terminal(arg
))
702 return self
.nonterminal(lhs
, children
)
def terminal(self, token):
    """Default terminal hook: place the token itself into the tree."""
    return token
706 def nonterminal(self
, type, args
):
708 rv
[:len(args
)] = args
712 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For
713 # each node it attempts to invoke the method n_<node type>, falling
714 # back onto the default() method if the n_* can't be found. The preorder
715 # traversal also looks for an exit hook named n_<node type>_exit (no default
716 # routine is called if it's not found). To prematurely halt traversal
717 # of a subtree, call the prune() method -- this only makes sense for a
718 # preorder traversal. Node type is determined via the typestring() method.
721 class GenericASTTraversalPruningException
:
724 class GenericASTTraversal
:
725 def __init__(self
, ast
):
728 def typestring(self
, node
):
732 raise GenericASTTraversalPruningException
734 def preorder(self
, node
=None):
739 name
= 'n_' + self
.typestring(node
)
740 if hasattr(self
, name
):
741 func
= getattr(self
, name
)
745 except GenericASTTraversalPruningException
:
751 name
= name
+ '_exit'
752 if hasattr(self
, name
):
753 func
= getattr(self
, name
)
756 def postorder(self
, node
=None):
763 name
= 'n_' + self
.typestring(node
)
764 if hasattr(self
, name
):
765 func
= getattr(self
, name
)
771 def default(self
, node
):
775 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
778 # XXX - makes assumptions about how GenericParser walks the parse tree.
781 class GenericASTMatcher(GenericParser
):
782 def __init__(self
, start
, ast
):
783 GenericParser
.__init
__(self
, start
)
786 def preprocess(self
, rule
, func
):
787 rebind
= lambda func
, self
=self
: \
788 lambda args
, func
=func
, self
=self
: \
789 self
.foundMatch(args
, func
)
794 return (lhs
, tuple(rhslist
)), rebind(func
)
796 def foundMatch(self
, args
, func
):
800 def match_r(self
, node
):
801 self
.input.insert(0, node
)
806 self
.input.insert(0, '(')
807 children
= children
+ 1
811 self
.input.insert(0, ')')
813 def match(self
, ast
=None):
819 self
.parse(self
.input)
821 def resolve(self
, list):
823 # Resolve ambiguity in favor of the longest RHS.
827 def _dump(tokens
, sets
, states
):
828 for i
in range(len(sets
)):
832 for (lhs
, rhs
), pos
in states
[item
[0]].items
:
833 print '\t\t', lhs
, '::=',
834 print string
.join(rhs
[:pos
]),
836 print string
.join(rhs
[pos
:])
839 print 'token', str(tokens
[i
])