2to3 (compiles, not tested)
[tag_parser.git] / src / mjacob / nltk / grammar / TagProduction.py
blob353abfcc041b9a8af964f0866f07b9ee0cb20dcc
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 18, 2011
5 Production rule for a tree-adjoining grammar.
6 Based on nltk.grammar.Production
8 TAG productions are more complicated than context free productions.
9 While context-free productions describe rules for rewriting strings,
10 TAG productions can be more intuitively understood as rules for rewriting
11 a parse tree.
13 Each TAG production is a full tree representing part of the derivation
14 of a parse of a tree adjoining language. The nodes in the tree are analogous
15 to non-terminal elements in a context free production, but are a bit more
16 complicated. The leaves of the tree can be either lexical strings or non-terminal
17 elements.
19 There are two basic types of TAG productions, commonly called "initial" and
20 "auxiliary". I will introduce 4 trees, α through δ, to discuss these.
22 α: S β: NP γ: VP δ: VP
23 / \ | | / \
24 NP VP "John" "ate" *VP "quickly"
26 Trees α, β and γ are initial trees. Tree δ is an auxiliary tree.
27 Initial trees may be attached to each other by "substituting" a
28 non-terminal leaf node with an initial tree bearing that same non-terminal
29 as its root. Auxiliary trees may be attached to other trees by the process
30 of "adjunction", whereby an intermediate node (say, VP) is replaced w/ the
31 auxiliary tree, and any children of that node replace the "foot" of the
32 auxiliary tree, which is a non-terminal leaf marked in this example as "*VP".
34 @author: mjacob
35 '''
36 from nltk.tree import Tree, ImmutableTree
37 from nltk.grammar import Production, Nonterminal
38 import re
39 from mjacob.annotations.not_yet_implemented_method import NotYetImplemented
40 from mjacob.collections.FrozenIndex import FrozenIndex
41 from mjacob.collections.OrderedFrozenSet import OrderedFrozenSet
42 from mjacob.annotations.memoized import Memoize
43 from mjacob.nltk.grammar.TagNonterminal import TagNonterminal
45 IS_LEAF = re.compile("(?:'(.*)')$")
47 class TagProduction(ImmutableTree):
48 """A """
49 def __new__(cls, *args):
50 return list.__new__(cls)
52 def __init__(self, rule):
53 """
54 arguments:
55 rule: may be a CFG Production (nltk.grammar.Production),
56 or a Tree (nltk.tree.Tree), or a string representing
57 an nltk Tree.
59 TagNonterminals can have the additional options indicators:
60 *X: this is a foot node
61 X.N: No adjunction is allowed
62 X.O: Adjunction is mandatory at this node
63 """
64 if isinstance(rule, Production):
65 tree = TagProduction._convert_cfg_rule(rule)
67 elif isinstance(rule, Tree):
68 tree = TagProduction._convert_tree(Tree.convert(rule)) # make a copy
70 else:
71 tree = TagProduction._convert_tree(Tree(rule))
73 super(TagProduction, self).__init__(tree.node, tree)
75 def root(self):
76 """the root of the production (the root of the tree)"""
77 return self.node
79 @Memoize
80 def is_auxiliary(self):
81 """True iff the production is an auxiliary tree"""
82 return self.foot_treeposition() is not None
84 @Memoize
85 def is_initial(self):
86 """True iff the production is an initial (non-auxiliary) tree"""
87 return self.foot_treeposition() is None
89 @Memoize
90 def is_lexical(self):
91 """True iff the production contains a Terminal (e.g. a lexical item)"""
92 if self.terminals():
93 return True
94 return False
96 @Memoize
97 def is_nonlexical(self):
98 """True iff the production contains no lexical items"""
99 return not self.is_lexical()
101 @NotYetImplemented
102 def is_chomsky_normal_form(self): pass
104 @NotYetImplemented
105 def is_binarised(self): pass
107 @NotYetImplemented
108 def is_flexible_chomsky_normal_form(self): pass
110 @NotYetImplemented
111 def is_nonempty(self): pass
113 ############ stuff in the production
114 @Memoize
115 def treepositions(self):
116 """an OrderedFrozenSet of indices of nodes/leaves in this production"""
117 return OrderedFrozenSet(super(TagProduction, self).treepositions())
119 @Memoize
120 def pos_subtrees(self):
121 """an OrderedFrozenSet of index/subtree tuples"""
122 return OrderedFrozenSet((pos, self[pos])
123 for pos in self.treepositions())
125 ############ stuff at leaf positions
126 @Memoize
127 def leaf_treepositions(self):
128 """an OrderedFrozenSet of the position of leaves of the production"""
129 return OrderedFrozenSet(pos
130 for pos, subtree in self.pos_subtrees()
131 if not isinstance(subtree, Tree))
133 @Memoize
134 def leaves(self):
135 """an OrderedFrozenSet of the leaves of the production"""
136 return OrderedFrozenSet(super(TagProduction, self).leaves())
138 @Memoize
139 def foot_treeposition(self):
140 """If the production is auxiliary, the index of the foot. Otherwise, None"""
141 for pos in self.leaf_treepositions():
142 subtree = self[pos]
143 if isinstance(subtree, TagNonterminal) and subtree.is_foot():
144 return pos
146 return None
148 @Memoize
149 def terminal_treepositions(self):
150 """an OrderedFrozenSet of terminal (lexical) positions"""
151 term_positions = []
153 for pos in self.leaf_treepositions():
154 node = self[pos]
155 if not isinstance(node, TagNonterminal):
156 term_positions.append(pos)
158 return OrderedFrozenSet(term_positions)
160 @Memoize
161 def terminals(self):
162 """an OrderedFrozenSet of the terminals (lexical elements)"""
163 return OrderedFrozenSet(self[pos] for pos in self.terminal_treepositions())
165 @Memoize
166 def substitution_treepositions(self):
167 """an OrderedFrozenSet substitution positions"""
168 sub_positions = []
170 for pos in self.leaf_treepositions():
171 node = self[pos]
172 if isinstance(node, TagNonterminal) and not node.is_foot():
173 symbol = node.symbol()
174 sub_positions.append((symbol, pos))
176 return FrozenIndex(sub_positions)
178 ############ stuff not at leaf positions
179 @Memoize
180 def internal_treepositions(self):
181 """an OrderedFrozenSet of non-leaf treepositions"""
182 return OrderedFrozenSet(pos
183 for pos, subtree in self.pos_subtrees()
184 if isinstance(subtree, Tree))
186 @Memoize
187 def epsilon_treepositions(self):
188 """an OrderedFrozenSet of the positions of nodes without children"""
189 epsilon_positions = []
191 for pos, subtree in self.pos_subtrees():
192 if isinstance(subtree, Tree) and len(subtree) == 0:
193 epsilon_positions.append(pos)
195 return OrderedFrozenSet(epsilon_positions)
197 @Memoize
198 def adjunction_treepositions(self):
199 """an OrderedFrozenSet of treepositions where adjunction is admissible"""
200 adj_positions = []
202 for pos in self.treepositions():
203 node = self.get_node(pos)
204 if isinstance(node, TagNonterminal) and not node.is_foot() and not node.no_adjunction():
205 symbol = node.symbol()
206 adj_positions.append((symbol, pos))
208 return FrozenIndex(adj_positions)
210 ############# other stuff
211 def get_node(self, p):
212 """returns the node at position p"""
213 subtree = self[p]
215 if isinstance(subtree, Tree):
216 return subtree.node
217 else:
218 return subtree
220 def can_adjoin(self, gamma, p):
221 """returns true if adjunction of beta (this tree) is allowed at gamma[p].
223 note to self:
224 so... should fSA be defined in the grammar spec somewhere?
225 currently just assume the most general case i.e. labels match and adjunction
226 not explicitly forbidden. """
227 node = gamma[p]
228 if isinstance(node, Tree):
229 node = node.node
231 return (self.is_auxiliary()
232 and type(node) == TagNonterminal
233 and not node.no_adjunction()
234 and self.root().symbol() == node.symbol())
236 ############## construction helpers
237 @classmethod
238 def _convert_tree(cls, tree):
239 """helper method to substitute TagNonterminals for all the node labels"""
241 for subtree in tree.subtrees():
242 if isinstance(subtree.node, TagNonterminal):
243 subtree.node = subtree.node
244 if isinstance(subtree.node, Nonterminal):
245 subtree.node = TagNonterminal(subtree.node.symbol())
246 else:
247 subtree.node = TagNonterminal(subtree.node)
249 for leaf, position in ((tree[tree.leaf_treeposition(i)], tree.leaf_treeposition(i))
250 for i in range(len(tree.leaves()))):
252 m = re.match(IS_LEAF, leaf)
254 if m:
255 if m.group(1):
256 tree[position] = m.group(1)
257 else:
258 tree[position] = m.group(2)
260 else:
261 tree[position] = TagNonterminal(leaf)
263 return tree.freeze()
265 def as_tree(self, type=ImmutableTree):
266 """the production as a regular tree. Interior nodes are converted to Nonterminal objects"""
267 tree = Tree.convert(self)
268 for subtree in tree.subtrees():
269 if isinstance(subtree, Tree):
270 subtree.node = Nonterminal(subtree.node.symbol())
271 for i in range(len(subtree)):
272 subsubtree = subtree[i]
273 if isinstance(subsubtree, Nonterminal):
274 subtree[i] = Nonterminal(subsubtree.symbol())
275 if type is Tree:
276 return tree
277 else:
278 return type.convert(tree)
280 @classmethod
281 def _convert_cfg_rule(cls, rule):
282 """helper method for construction a TagProduction out of a CFG Production object"""
283 def convert_item(item):
284 if type(item) is Nonterminal:
285 return item.symbol()
286 else:
287 return "'%s'" % (item)
289 return cls._convert_tree(Tree(convert_item(rule.lhs()),
290 [convert_item(item)
291 for item in rule.rhs()]))