1 # Copyright (c) 1998-2000 John Aycock
3 # Permission is hereby granted, free of charge, to any person obtaining
4 # a copy of this software and associated documentation files (the
5 # "Software"), to deal in the Software without restriction, including
6 # without limitation the rights to use, copy, modify, merge, publish,
7 # distribute, sublicense, and/or sell copies of the Software, and to
8 # permit persons to whom the Software is furnished to do so, subject to
9 # the following conditions:
11 # The above copyright notice and this permission notice shall be
12 # included in all copies or substantial portions of the Software.
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 __version__
= 'SPARK-0.6.1'
# NOTE(review): extraction-damaged fragment — the numbers fused at the
# start of lines are original source line numbers, and original lines
# 30-33 are missing here. Appears to collect attribute names from an
# instance's class hierarchy — confirm against an intact copy.
28 def _namelist(instance
):
29 namelist
, namedict
, classlist
= [], {}, [instance
.__class
__]
# `name` below is bound in a missing enclosing loop (orig lines 30-33).
34 if not namedict
.has_key(name
):
# NOTE(review): body fragment of the scanner's constructor (its `def`
# line is missing from this extraction). Compiles the combined scanner
# regex and maps regex group numbers to t_* handler methods.
41 pattern
= self
.reflect()
42 self
.re
= re
.compile(pattern
, re
.VERBOSE | re
.MULTILINE
)
# Map each named group's index to the corresponding t_<name> method.
45 for name
, number
in self
.re
.groupindex
.items():
46 self
.index2func
[number
-1] = getattr(self
, 't_' + name
)
# NOTE(review): fragment — original lines 51 and 54+ (including the
# return) are missing. Builds a named-group regex '(?P<name>pattern)'
# for one t_* method, from its docstring or get_re().
48 def makeRE(self
, name
):
49 if hasattr(self
, '__doc__'):
50 doc
= getattr(self
, name
).__doc
__
52 doc
= self
.get_re(name
[2:])
# Group name is the method name with its 't_' prefix stripped.
53 rv
= '(?P<%s>%s)' % (name
[2:], doc
)
# NOTE(review): fragment of reflect() — the `def` line and the list
# initialization for `rv` are missing. Joins every t_* method's regex
# with '|', keeping t_default last so it matches only as a fallback.
58 for name
in _namelist(self
):
59 if name
[:2] == 't_' and name
!= 't_default':
60 rv
.append(self
.makeRE(name
))
62 rv
.append(self
.makeRE('t_default'))
63 return string
.join(rv
, '|')
# Default lexical-error hook; the line(s) after the message (original
# 67+) are missing from this extraction — presumably an abort; confirm.
65 def error(self
, s
, pos
):
66 print "Lexical error at position %s" % pos
# NOTE(review): fragment — the scan loop's surrounding lines (pos
# setup, groups extraction, loop advance) are missing. Repeatedly
# matches the compiled regex and dispatches each named group that
# matched to its registered handler in index2func.
69 def tokenize(self
, s
):
73 m
= self
.re
.match(s
, pos
)
78 for i
in range(len(groups
)):
79 if groups
[i
] and self
.index2func
.has_key(i
):
80 self
.index2func
[i
](groups
[i
])
# Catch-all token handler; its body is missing from this extraction.
83 def t_default(self
, s
):
# NOTE(review): GenericParser constructor fragment — intervening
# original lines (89-92) are missing. Installs the augmented start rule.
88 def __init__(self
, start
):
93 self
.startRule
= self
.augment(start
)
# A hook for GenericASTBuilder and GenericASTMatcher.
def preprocess(self, rule, func):
    """Default rule hook: hand the (rule, func) pair back untouched.

    Subclasses override this to rewrite a grammar rule and/or wrap its
    action function before the rule is installed.
    """
    return rule, func
# NOTE(review): addRule fragment — several original lines (106-107,
# 110, 112, 117, 119, 122) are missing, including the `index` list
# initialization and the else-branch heads. Splits a docstring grammar
# into '::='-separated rules and registers each one.
104 def addRule(self
, doc
, func
):
105 rules
= string
.split(doc
)
# Record where each rule starts (every '::=' marks one; logic partial).
108 for i
in range(len(rules
)):
109 if rules
[i
] == '::=':
111 index
.append(len(rules
))
113 for i
in range(len(index
)-1):
114 lhs
= rules
[index
[i
]]
115 rhs
= rules
[index
[i
]+2:index
[i
+1]]
116 rule
= (lhs
, tuple(rhs
))
# Give subclasses a chance to rewrite the rule/action pair.
118 rule
, fn
= self
.preprocess(rule
, func
)
120 if self
.rules
.has_key(lhs
):
121 self
.rules
[lhs
].append(rule
)
123 self
.rules
[lhs
] = [ rule
]
124 self
.rule2func
[rule
] = fn
# Rule name is the handler's method name with its 2-char prefix dropped.
125 self
.rule2name
[rule
] = func
.__name
__[2:]
126 self
.ruleschanged
= 1
# NOTE(review): fragment — the name filter and the docstring fetch
# (original lines 130, 132) are missing. Gathers grammar rules from
# the instance's rule methods via _namelist().
128 def collectRules(self
):
129 for name
in _namelist(self
):
131 func
= getattr(self
, name
)
133 self
.addRule(doc
, func
)
# NOTE(review): fragment — original lines 136 and 140 are missing.
135 def augment(self
, start
):
137 # Tempting though it is, this isn't made into a call
138 # to self.addRule() because the start rule shouldn't
139 # be subject to preprocessing.
# Synthetic rule: _START ::= <start> _EOF ; its action yields args[0].
141 startRule
= (self
._START
, ( start
, self
._EOF
))
142 self
.rule2func
[startRule
] = lambda args
: args
[0]
143 self
.rules
[self
._START
] = [ startRule
]
144 self
.rule2name
[startRule
] = ''
# NOTE(review): FIRST-set computation fragment — the enclosing `def`
# line and many interior lines are missing; the fused number prefixes
# are original source line numbers. Seeds FIRST sets per rule, then
# propagates them to a fixed point via the `union` worklist.
151 for rulelist
in self
.rules
.values():
152 for lhs
, rhs
in rulelist
:
153 if not self
.first
.has_key(lhs
):
# None key appears to mark a nullable LHS (empty RHS) — confirm.
157 self
.first
[lhs
][None] = 1
161 if not self
.rules
.has_key(sym
):
162 self
.first
[lhs
][sym
] = 1
164 union
[(sym
, lhs
)] = 1
# Propagate: FIRST(dest) |= FIRST(src) until nothing grows.
168 for src
, dest
in union
.keys():
169 destlen
= len(self
.first
[dest
])
170 self
.first
[dest
].update(self
.first
[src
])
171 if len(self
.first
[dest
]) != destlen
:
175 # An Earley parser, as per J. Earley, "An Efficient Context-Free
176 # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
177 # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
178 # Carnegie-Mellon University, August 1968, p. 27.
# Hook returning a token's type string; its body (original lines
# 182-183) is missing from this extraction.
181 def typestring(self
, token
):
# Default syntax-error hook; the line(s) after the message are missing
# (presumably an abort) — confirm against an intact copy.
184 def error(self
, token
):
185 print "Syntax error at or near `%s' token" % token
# NOTE(review): Earley parse() fragment — many original lines are
# missing (tree/state setup, the rules-changed rebuild, loop body).
# `states` maps position -> list of Earley items (rule, dotpos,
# parent-state).
188 def parse(self
, tokens
):
190 tokens
.append(self
._EOF
)
191 states
= { 0: [ (self
.startRule
, 0, 0) ] }
193 if self
.ruleschanged
:
196 for i
in xrange(len(tokens
)):
201 self
.buildState(tokens
[i
], states
, i
, tree
)
203 #_dump(tokens, states)
# Accept only if the final state is exactly the completed start item.
205 if i
< len(tokens
)-1 or states
[i
+1] != [(self
.startRule
, 2, 0)]:
207 self
.error(tokens
[i
-1])
208 rv
= self
.buildTree(tokens
, tree
, ((self
.startRule
, 2, 0), i
+1))
# NOTE(review): buildState fragment — large portions of the original
# (item loop head, completer/predictor/scanner plumbing, many branch
# bodies) are missing; do not read the visible control flow as
# complete. Performs one Earley step for `token` at position i.
212 def buildState(self
, token
, states
, i
, tree
):
218 rule
, pos
, parent
= item
222 # A -> a . (completer)
226 needsCompletion
[lhs
] = (item
, i
)
228 for pitem
in states
[parent
]:
232 prule
, ppos
, pparent
= pitem
# Does the parent item's dot sit just before this completed LHS?
235 if prhs
[ppos
:ppos
+1] == (lhs
,):
241 tree
[(new
, i
)] = [(item
, i
)]
243 tree
[(new
, i
)].append((item
, i
))
249 # A -> a . B (predictor)
251 if self
.rules
.has_key(nextSym
):
253 # Work on completer step some more; for rules
254 # with empty RHS, the "parent state" is the
255 # current state we're adding Earley items to,
256 # so the Earley items the completer step needs
257 # may not all be present when it runs.
259 if needsCompletion
.has_key(nextSym
):
260 new
= (rule
, pos
+1, parent
)
261 olditem_i
= needsCompletion
[nextSym
]
264 tree
[(new
, i
)] = [olditem_i
]
266 tree
[(new
, i
)].append(olditem_i
)
269 # Has this been predicted already?
271 if predicted
.has_key(nextSym
):
273 predicted
[nextSym
] = 1
275 ttype
= token
is not self
._EOF
and \
276 self
.typestring(token
) or \
278 if ttype
is not None:
280 # Even smarter predictor, when the
281 # token's type is known. The code is
282 # grungy, but runs pretty fast. Three
283 # cases are looked for: rules with
284 # empty RHS; first symbol on RHS is a
285 # terminal; first symbol on RHS is a
286 # nonterminal (and isn't nullable).
288 for prule
in self
.rules
[nextSym
]:
295 if not self
.rules
.has_key(prhs0
):
301 first
= self
.first
[prhs0
]
302 if not first
.has_key(None) and \
303 not first
.has_key(ttype
):
308 for prule
in self
.rules
[nextSym
]:
310 # Smarter predictor, as per Grune &
311 # Jacobs' _Parsing Techniques_. Not
312 # as good as FIRST sets though.
315 if len(prhs
) > 0 and \
316 not self
.rules
.has_key(prhs
[0]) and \
319 state
.append((prule
, 0, i
))
322 # A -> a . c (scanner)
324 elif token
== nextSym
:
325 #assert new not in states[i+1]
326 states
[i
+1].append((rule
, pos
+1, parent
))
# NOTE(review): fragment — stack setup (original 329) and the return
# (331) are missing. Kicks off the recursive tree build from `root`.
328 def buildTree(self
, tokens
, tree
, root
):
330 self
.buildTree_r(stack
, tokens
, -1, tree
, root
)
# NOTE(review): buildTree_r fragment — significant interior lines are
# missing (branch heads, the recursive call's full argument list, the
# return). Walks tree pointers, pushing tokens/subtree results onto
# `stack`, then applies the rule's action to the RHS-sized prefix.
333 def buildTree_r(self
, stack
, tokens
, tokpos
, tree
, root
):
334 (rule
, pos
, parent
), state
= root
337 want
= ((rule
, pos
, parent
), state
)
338 if not tree
.has_key(want
):
340 # Since pos > 0, it didn't come from closure,
341 # and if it isn't in tree[], then there must
342 # be a terminal symbol to the left of the dot.
343 # (It must be from a "scanner" step.)
347 stack
.insert(0, tokens
[tokpos
])
351 # There's a NT to the left of the dot.
352 # Follow the tree pointer recursively (>1
353 # tree pointers from it indicates ambiguity).
354 # Since the item must have come about from a
355 # "completer" step, the state where the item
356 # came from must be the parent state of the
357 # item the tree pointer points to.
359 children
= tree
[want
]
360 if len(children
) > 1:
# Multiple tree pointers: let ambiguity() pick one child.
361 child
= self
.ambiguity(children
)
365 tokpos
= self
.buildTree_r(stack
,
369 (crule
, cpos
, cparent
), cstate
= child
# Reduce: run the rule's action on the RHS-length prefix of the stack.
373 result
= self
.rule2func
[rule
](stack
[:len(rhs
)])
374 stack
[:len(rhs
)] = [result
]
# NOTE(review): ambiguity() fragment — the sortlist/name2index setup
# lines are missing. Ranks candidate children by RHS length and rule
# name, then defers the final choice to resolve().
377 def ambiguity(self
, children
):
379 # XXX - problem here and in collectRules() if the same
380 # rule appears in >1 method. But in that case the
381 # user probably gets what they deserve :-) Also
382 # undefined results if rules causing the ambiguity
383 # appear in the same method.
387 for i
in range(len(children
)):
388 ((rule
, pos
, parent
), index
) = children
[i
]
390 name
= self
.rule2name
[rule
]
391 sortlist
.append((len(rhs
), name
))
394 list = map(lambda (a
,b
): b
, sortlist
)
395 return children
[name2index
[self
.resolve(list)]]
# resolve()'s body (original 402+) is missing from this extraction.
397 def resolve(self
, list):
399 # Resolve ambiguity in favor of the shortest RHS.
400 # Since we walk the tree from the top down, this
401 # should effectively resolve in favor of a "shift".
406 # GenericASTBuilder automagically constructs a concrete/abstract syntax tree
407 # for a given input. The extra argument is a class (not an instance!)
408 # which supports the "__setslice__" and "__len__" methods.
410 # XXX - silently overrides any user code in methods.
# NOTE(review): constructor fragment — the line saving the AST class
# (original 416) is missing from this extraction.
413 class GenericASTBuilder(GenericParser
):
414 def __init__(self
, AST
, start
):
415 GenericParser
.__init
__(self
, start
)
# NOTE(review): fragment — the line unpacking `rule` into lhs/rhs
# (original 422) is missing. Wraps every rule's action so each
# reduction builds an AST node via buildASTNode().
418 def preprocess(self
, rule
, func
):
# Nested lambdas bind lhs/self as defaults to avoid late binding.
419 rebind
= lambda lhs
, self
=self
: \
420 lambda args
, lhs
=lhs
, self
=self
: \
421 self
.buildASTNode(args
, lhs
)
423 return rule
, rebind(lhs
)
# NOTE(review): fragment — the children-list setup, the loop head, and
# the AST-instance branch body are missing. Converts reduced args into
# child nodes, wrapping non-AST values via terminal().
425 def buildASTNode(self
, args
, lhs
):
428 if isinstance(arg
, self
.AST
):
431 children
.append(self
.terminal(arg
))
432 return self
.nonterminal(lhs
, children
)
def terminal(self, token):
    """Leaf hook: convert a token into a tree leaf (default: as-is)."""
    return token
# NOTE(review): fragment — node construction (original 437) and the
# return (439) are missing; only the child-slice assignment survives.
436 def nonterminal(self
, type, args
):
438 rv
[:len(args
)] = args
442 # GenericASTTraversal is a Visitor pattern according to Design Patterns. For
443 # each node it attempts to invoke the method n_<node type>, falling
444 # back onto the default() method if the n_* can't be found. The preorder
445 # traversal also looks for an exit hook named n_<node type>_exit (no default
446 # routine is called if it's not found). To prematurely halt traversal
447 # of a subtree, call the prune() method -- this only makes sense for a
448 # preorder traversal. Node type is determined via the typestring() method.
class GenericASTTraversalPruningException:
    """Control-flow exception raised by prune() to halt traversal of the
    current subtree during a preorder walk.

    The extraction had dropped this class's (empty) body, leaving a
    syntactically invalid statement; a valid empty body is restored.
    """
    pass
# NOTE(review): class-header fragment — the constructor body,
# typestring's body, and prune()'s `def` line are all missing.
454 class GenericASTTraversal
:
455 def __init__(self
, ast
):
458 def typestring(self
, node
):
# Lone surviving line of prune(): aborts the current subtree's walk.
462 raise GenericASTTraversalPruningException
# NOTE(review): preorder fragment — the node default, the visit call,
# recursion over children, and the default() fallback are missing.
# Visits a node via its n_<type> method, then looks for an
# n_<type>_exit hook after the subtree walk.
464 def preorder(self
, node
=None):
469 name
= 'n_' + self
.typestring(node
)
470 if hasattr(self
, name
):
471 func
= getattr(self
, name
)
# prune() raises this to skip the rest of the current subtree.
475 except GenericASTTraversalPruningException
:
481 name
= name
+ '_exit'
482 if hasattr(self
, name
):
483 func
= getattr(self
, name
)
# NOTE(review): postorder fragment — recursion over children, the
# dispatch call, and the default() fallback are missing.
486 def postorder(self
, node
=None):
493 name
= 'n_' + self
.typestring(node
)
494 if hasattr(self
, name
):
495 func
= getattr(self
, name
)
# Fallback visitor for node types with no n_* method; body missing
# from this extraction.
501 def default(self
, node
):
505 # GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
508 # XXX - makes assumptions about how GenericParser walks the parse tree.
# NOTE(review): constructor fragment — the line saving `ast`
# (original 514) is missing from this extraction.
511 class GenericASTMatcher(GenericParser
):
512 def __init__(self
, start
, ast
):
513 GenericParser
.__init
__(self
, start
)
# NOTE(review): fragment — original lines 520-523 (rewriting the rule
# into lhs/rhslist) are missing. Wraps each rule's action so a
# reduction invokes foundMatch().
516 def preprocess(self
, rule
, func
):
# Nested lambdas bind func/self as defaults to avoid late binding.
517 rebind
= lambda func
, self
=self
: \
518 lambda args
, func
=func
, self
=self
: \
519 self
.foundMatch(args
, func
)
524 return (lhs
, tuple(rhslist
)), rebind(func
)
# foundMatch's body (original 527-528) is missing from this extraction.
526 def foundMatch(self
, args
, func
):
# NOTE(review): match_r fragment — the child loop and the recursive
# call are missing. Appears to linearize the AST into self.input with
# '(' and ')' markers — confirm against an intact copy.
530 def match_r(self
, node
):
531 self
.input.insert(0, node
)
536 self
.input.insert(0, '(')
537 children
= children
+ 1
541 self
.input.insert(0, ')')
# NOTE(review): match fragment — the input-list setup and the `ast`
# default handling are missing; parses the linearized AST.
543 def match(self
, ast
=None):
549 self
.parse(self
.input)
# resolve() override: body missing; per its comment, longest RHS wins.
551 def resolve(self
, list):
553 # Resolve ambiguity in favor of the longest RHS.
# NOTE(review): debugging-dump fragment — header prints and the dot
# marker printed between the two joins are missing. Prints each Earley
# state's items and the token at each position.
557 def _dump(tokens
, states
):
558 for i
in range(len(states
)):
560 for (lhs
, rhs
), pos
, parent
in states
[i
]:
561 print '\t', lhs
, '::=',
# RHS symbols before the dot, then (after a missing line) those after.
562 print string
.join(rhs
[:pos
]),
564 print string
.join(rhs
[pos
:]),
568 print 'token', str(tokens
[i
])