1 # This Python file uses the following encoding: utf-8
3 Created on May 18, 2011
5 Production rule for a tree-adjoining grammar.
6 Based on nltk.grammar.Production
8 TAG productions are more complicated than context free productions.
9 While context-free productions describe rules for rewriting strings,
10 TAG productions can be more intuitively understood as rules for rewriting
13 Each TAG production is a full tree representing part of the derivation
14 of a parse of a tree adjoining language. The nodes in the tree are analogous
15 to non-terminal elements in a context free production, but are a bit more
16 complicated. The leaves of the tree can be either lexical strings or non-terminal
19 There are two basic types of TAG productions, commonly called "initial" and
20 "auxiliary". I will introduce 4 trees, α through δ, to discuss these.
22 α: S β: NP γ: VP δ: VP
24 NP VP "John" "ate" *VP "quickly"
26 Trees α, β and γ are initial trees. Tree δ is an auxiliary tree.
27 Initial trees may be attached to each other by "substituting" a
28 non-terminal leaf node with an initial tree bearing that same non-terminal
29 as its root. Auxiliary trees may be attached to other trees by the process
30 of "adjunction", whereby an intermediate node (say, VP) is replaced w/ the
31 auxiliary tree, and any children of that node replace the "foot" of the
32 auxiliary tree, which is a non-terminal leaf marked in this example as "*VP".
36 from nltk
.tree
import Tree
, ImmutableTree
37 from nltk
.grammar
import Production
, Nonterminal
39 from mjacob
.annotations
.not_yet_implemented_method
import NotYetImplemented
40 from mjacob
.collections
.FrozenIndex
import FrozenIndex
41 from mjacob
.collections
.OrderedFrozenSet
import OrderedFrozenSet
42 from mjacob
.annotations
.memoized
import Memoize
43 from mjacob
.nltk
.grammar
.TagNonterminal
import TagNonterminal
45 IS_LEAF
= re
.compile("(?:'(.*)')$")
47 class TagProduction(ImmutableTree
):
49 def __new__(cls
, *args
):
50 return list.__new
__(cls
)
52 def __init__(self
, rule
):
55 rule: may be a CFG Production (nltk.grammar.Production),
56 or a Tree (nltk.tree.Tree), or a string representing
59 TagNonterminals can have the additional options indicators:
60 *X: this is a foot node
61 X.N: No adjunction is allowed
62 X.O: Adjunction is mandatory at this node
64 if isinstance(rule
, Production
):
65 tree
= TagProduction
._convert
_cfg
_rule
(rule
)
67 elif isinstance(rule
, Tree
):
68 tree
= TagProduction
._convert
_tree
(Tree
.convert(rule
)) # make a copy
71 tree
= TagProduction
._convert
_tree
(Tree(rule
))
73 super(TagProduction
, self
).__init
__(tree
.node
, tree
)
76 """the root of the production (the root of the tree)"""
80 def is_auxiliary(self
):
81 """True iff the production is an auxiliary tree"""
82 return self
.foot_treeposition() is not None
86 """True iff the production is an initial (non-auxiliary) tree"""
87 return self
.foot_treeposition() is None
91 """True iff the production contains a Terminal (e.g. a lexical item)"""
97 def is_nonlexical(self
):
98 """True iff the production contains no lexical items"""
99 return not self
.is_lexical()
102 def is_chomsky_normal_form(self
): pass
105 def is_binarised(self
): pass
108 def is_flexible_chomsky_normal_form(self
): pass
111 def is_nonempty(self
): pass
113 ############ stuff in the production
115 def treepositions(self
):
116 """an OrderedFrozenSet of indices of nodes/leaves in this production"""
117 return OrderedFrozenSet(super(TagProduction
, self
).treepositions())
120 def pos_subtrees(self
):
121 """an OrderedFrozenSet of index/subtree tuples"""
122 return OrderedFrozenSet((pos
, self
[pos
])
123 for pos
in self
.treepositions())
125 ############ stuff at leaf positions
127 def leaf_treepositions(self
):
128 """an OrderedFrozenSet of the position of leaves of the production"""
129 return OrderedFrozenSet(pos
130 for pos
, subtree
in self
.pos_subtrees()
131 if not isinstance(subtree
, Tree
))
135 """an OrderedFrozenSet of the leaves of the production"""
136 return OrderedFrozenSet(super(TagProduction
, self
).leaves())
139 def foot_treeposition(self
):
140 """If the production is auxiliary, the index of the foot. Otherwise, None"""
141 for pos
in self
.leaf_treepositions():
143 if isinstance(subtree
, TagNonterminal
) and subtree
.is_foot():
149 def terminal_treepositions(self
):
150 """an OrderedFrozenSet of terminal (lexical) positions"""
153 for pos
in self
.leaf_treepositions():
155 if not isinstance(node
, TagNonterminal
):
156 term_positions
.append(pos
)
158 return OrderedFrozenSet(term_positions
)
162 """an OrderedFrozenSet of the terminals (lexical elements)"""
163 return OrderedFrozenSet(self
[pos
] for pos
in self
.terminal_treepositions())
166 def substitution_treepositions(self
):
167 """an OrderedFrozenSet substitution positions"""
170 for pos
in self
.leaf_treepositions():
172 if isinstance(node
, TagNonterminal
) and not node
.is_foot():
173 symbol
= node
.symbol()
174 sub_positions
.append((symbol
, pos
))
176 return FrozenIndex(sub_positions
)
178 ############ stuff not at leaf positions
180 def internal_treepositions(self
):
181 """an OrderedFrozenSet of non-leaf treepositions"""
182 return OrderedFrozenSet(pos
183 for pos
, subtree
in self
.pos_subtrees()
184 if isinstance(subtree
, Tree
))
187 def epsilon_treepositions(self
):
188 """an OrderedFrozenSet of the positions of nodes without children"""
189 epsilon_positions
= []
191 for pos
, subtree
in self
.pos_subtrees():
192 if isinstance(subtree
, Tree
) and len(subtree
) == 0:
193 epsilon_positions
.append(pos
)
195 return OrderedFrozenSet(epsilon_positions
)
198 def adjunction_treepositions(self
):
199 """an OrderedFrozenSet of treepositions where adjunction is admissible"""
202 for pos
in self
.treepositions():
203 node
= self
.get_node(pos
)
204 if isinstance(node
, TagNonterminal
) and not node
.is_foot() and not node
.no_adjunction():
205 symbol
= node
.symbol()
206 adj_positions
.append((symbol
, pos
))
208 return FrozenIndex(adj_positions
)
210 ############# other stuff
211 def get_node(self
, p
):
212 """returns the node at position p"""
215 if isinstance(subtree
, Tree
):
220 def can_adjoin(self
, gamma
, p
):
221 """returns true if adjunction of beta (this tree) is allowed at gamma[p].
224 so... should fSA be defined in the grammar spec somewhere?
225 currently just assume the most general case i.e. labels match and adjunction
226 not explicitly forbidden. """
228 if isinstance(node
, Tree
):
231 return (self
.is_auxiliary()
232 and type(node
) == TagNonterminal
233 and not node
.no_adjunction()
234 and self
.root().symbol() == node
.symbol())
236 ############## construction helpers
238 def _convert_tree(cls
, tree
):
239 """helper method to substitute TagNonterminals for all the node labels"""
241 for subtree
in tree
.subtrees():
242 if isinstance(subtree
.node
, TagNonterminal
):
243 subtree
.node
= subtree
.node
244 if isinstance(subtree
.node
, Nonterminal
):
245 subtree
.node
= TagNonterminal(subtree
.node
.symbol())
247 subtree
.node
= TagNonterminal(subtree
.node
)
249 for leaf
, position
in ((tree
[tree
.leaf_treeposition(i
)], tree
.leaf_treeposition(i
))
250 for i
in range(len(tree
.leaves()))):
252 m
= re
.match(IS_LEAF
, leaf
)
256 tree
[position
] = m
.group(1)
258 tree
[position
] = m
.group(2)
261 tree
[position
] = TagNonterminal(leaf
)
265 def as_tree(self
, type=ImmutableTree
):
266 """the production as a regular tree. Interior nodes are converted to Nonterminal objects"""
267 tree
= Tree
.convert(self
)
268 for subtree
in tree
.subtrees():
269 if isinstance(subtree
, Tree
):
270 subtree
.node
= Nonterminal(subtree
.node
.symbol())
271 for i
in range(len(subtree
)):
272 subsubtree
= subtree
[i
]
273 if isinstance(subsubtree
, Nonterminal
):
274 subtree
[i
] = Nonterminal(subsubtree
.symbol())
278 return type.convert(tree
)
281 def _convert_cfg_rule(cls
, rule
):
282 """helper method for construction a TagProduction out of a CFG Production object"""
283 def convert_item(item
):
284 if type(item
) is Nonterminal
:
287 return "'%s'" % (item
)
289 return cls
._convert
_tree
(Tree(convert_item(rule
.lhs()),
291 for item
in rule
.rhs()]))