2to3 (compiles, not tested)
[tag_parser.git] / src / mjacob / nltk / parse / tag / TagChart.py
blobfa4d8163c4f8bf38c059d29f90b03df39dda282a
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 18, 2011
5 This is a hacked version of what's in NLTK.
7 it's available under the Apache 2.0 license:
8 http://www.apache.org/licenses/LICENSE-2.0
10 # Copyright (C) 2001-2011 NLTK Project
11 # Author: Edward Loper <edloper@gradient.cis.upenn.edu>
12 # Steven Bird <sb@csse.unimelb.edu.au>
13 # Jean Mark Gawron <gawron@mail.sdsu.edu>
14 # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
15 # URL: <http://www.nltk.org/>
16 # For license information, see LICENSE.TXT
18 @author: mjacob
19 '''
20 from nltk.tree import Tree
22 class TagChart(object):
23 """
24 A blackboard for hypotheses about the syntactic constituents of a
25 sentence. A chart contains a set of edges, and each edge encodes
26 a single hypothesis about the structure of some portion of the
27 sentence.
29 The L{select} method can be used to select a specific collection
30 of edges. For example C{chart.select(is_complete=True, start=0)}
31 yields all complete edges whose start indices are 0. To ensure
32 the efficiency of these selection operations, C{Chart} dynamically
33 creates and maintains an index for each set of attributes that
34 have been selected on.
36 In order to reconstruct the trees that are represented by an edge,
37 the chart associates each edge with a set of child pointer lists.
38 A X{child pointer list} is a list of the edges that license an
39 edge's right-hand side.
41 @ivar _tokens: The sentence that the chart covers.
42 @ivar _num_leaves: The number of tokens.
43 @ivar _edges: A list of the edges in the chart
44 @ivar _indexes: A dictionary mapping tuples of edge attributes
45 to indices, where each index maps the corresponding edge
46 attribute values to lists of edges.
47 """
48 def __init__(self, tokens, edge_type):
49 """
50 Construct a new chart. The chart is initialized with the
51 leaf edges corresponding to the terminal leaves.
53 @type tokens: L{list}
54 @param tokens: The sentence that this chart will be used to parse.
55 """
56 # Record the sentence token and the sentence length.
57 self.__tokens = tuple(tokens)
58 self.__num_leaves = len(self.__tokens)
59 self.__leaf_indices = dict((leaf, tuple(i
60 for i in range(len(tokens))
61 if tokens[i] == leaf))
62 for leaf in frozenset(tokens))
64 self.__edge_type=edge_type
66 # Initialise the chart.
67 self.initialize()
69 def initialize(self):
70 """
71 Clear the chart.
72 """
73 # A list of edges contained in this chart.
74 self.__edges = []
76 # The set of child pointer lists associated with each edge.
77 self.__edge_treebuilders = {}
78 self.__cached_trees = {}
80 # Indexes mapping attribute values to lists of edges
81 # (used by select()).
82 self.__indexes = {}
84 def __str__(self):
85 return "\n".join(map(str, [type(self),
86 " tokens: %s" % (self.__tokens,),
87 " leaves: %s" % (self.__num_leaves,),
88 " indices: %s" % ("\n ".join(str(y) for y in list(self.__leaf_indices.items()))),
89 " edges: %s" % ("\n ".join(str(y) for y in self.__edges)),
90 " indexes: %s" % ("\n ".join(str(y) for y in list(self.__indexes.items()))),
91 ]))
93 #////////////////////////////////////////////////////////////
94 # Sentence Access
95 #////////////////////////////////////////////////////////////
97 def num_leaves(self):
98 """
99 @return: The number of words in this chart's sentence.
100 @rtype: C{int}
102 return self.__num_leaves
104 def leaf(self, index):
106 @return: The leaf value of the word at the given index.
107 @rtype: C{string}
109 return self.__tokens[index]
111 def leaves(self):
113 @return: A list of the leaf values of each word in the
114 chart's sentence.
115 @rtype: C{list} of C{string}
117 return self.__tokens
119 def leaf_indices(self, leaf):
121 @return: A list of indices at which @param leaf appears in the sentence
122 @rtype: C{tuple} of C{int}
124 return self.__leaf_indices.get(leaf, ())
126 #////////////////////////////////////////////////////////////
127 # Edge access
128 #////////////////////////////////////////////////////////////
130 def edges(self):
132 @return: A list of all edges in this chart. New edges
133 that are added to the chart after the call to edges()
134 will be contained in this list, but you shouldn't rely on that.
135 @rtype: C{list} of L{EdgeI}
136 @see: L{iteredges}, L{select}
138 return self.__edges
140 def iteredges(self):
142 @return: An iterator over the edges in this chart. It is
143 I{not} guaranteed that new edges which are added to the
144 chart before the iterator is exhausted will also be
145 generated.
146 @rtype: C{iter} of L{EdgeI}
147 @see: L{edges}, L{select}
149 return iter(self.__edges)
151 # Iterating over the chart yields its edges.
152 __iter__ = iteredges
154 def num_edges(self):
156 @return: The number of edges contained in this chart.
157 @rtype: C{int}
159 return len(self.__edges)
161 def select(self, **restrictions):
163 @return: An iterator over the edges in this chart. Any
164 new edges that are added to the chart before the iterator
165 is exahusted will also be generated. C{restrictions}
166 can be used to restrict the set of edges that will be
167 generated.
168 @rtype: C{iter} of L{EdgeI}
170 # If there are no restrictions, then return all edges.
171 if restrictions=={}: return iter(self.__edges)
173 # Find the index corresponding to the given restrictions.
174 restr_keys = list(restrictions.keys())
175 restr_keys.sort()
176 restr_keys = tuple(restr_keys)
178 # If it doesn't exist, then create it.
179 if restr_keys not in self.__indexes:
180 self._add_index(restr_keys)
182 vals = tuple(restrictions[key] for key in restr_keys)
183 return iter(self.__indexes[restr_keys].get(vals, []))
185 def _add_index(self, restr_keys):
187 A helper function for L{select}, which creates a new index for
188 a given set of attributes (aka restriction keys).
190 # Make sure it's a valid index.
191 for key in restr_keys:
192 if not hasattr(self.__edge_type, key):
193 raise ValueError('Bad restriction: %s' % key)
195 # Create the index.
196 index = self.__indexes[restr_keys] = {}
198 # Add all existing edges to the index.
199 for edge in self.__edges:
200 vals = tuple(getattr(edge, key)() for key in restr_keys)
201 index.setdefault(vals, []).append(edge)
203 def _register_with_indexes(self, edge):
205 A helper function for L{insert}, which registers the new
206 edge with all existing indexes.
208 for (restr_keys, index) in list(self.__indexes.items()):
209 vals = tuple(getattr(edge, key)() for key in restr_keys)
210 index.setdefault(vals, []).append(edge)
212 #////////////////////////////////////////////////////////////
213 # Edge Insertion
214 #////////////////////////////////////////////////////////////
216 def insert(self, edge, treebuilder):
218 Add a new edge to the chart.
220 @type edge: L{EdgeI}
221 @param edge: The new edge
222 @rtype: C{bool}
223 @return: True if this operation modified the chart. In
224 particular, return true iff the chart did not already
225 contain C{edge}
228 # Is it a new edge?
229 if edge not in self.__edge_treebuilders:
230 #print " new edge: %s" % (edge)
231 # Add it to the list of edges.
232 self._append_edge(edge)
233 # Register with indexes.
234 self._register_with_indexes(edge)
235 self.__edge_treebuilders[edge] = set((treebuilder,))
237 return True
238 else:
239 #print " duplicate edge: %s" % (edge)
240 self.__edge_treebuilders[edge].add(treebuilder)
241 return False
243 def get_trees(self, edge, working_on=set()):
245 find the trees which can be built out of the specified edge.
247 the parameter "working_on" should not be specified by the user, as it is
248 an internal mechanism that is used to avoid loops in the recursion.
250 if edge in self.__cached_trees:
251 return self.__cached_trees[edge]
253 else:
254 working_on.add(edge)
255 treebuilders = self.__edge_treebuilders[edge]
256 if len(treebuilders) == 1:
257 trees = None
258 for treebuilder in treebuilders:
259 trees = treebuilder.build(working_on)
260 else:
261 trees = []
262 for treebuilder in treebuilders:
263 trees.extend(treebuilder.build(working_on))
264 trees = frozenset(trees)
265 self.__cached_trees[edge] = trees
266 working_on.remove(edge)
267 return trees
269 def _append_edge(self, edge):
270 self.__edges.append(edge)
272 #////////////////////////////////////////////////////////////
273 # Tree extraction & child pointer lists
274 #////////////////////////////////////////////////////////////
276 def accept(self, root, strategy):
277 """returns True if there's a goal item in the chart"""
278 return strategy.goal_found(self, root)
280 """parses(self.__grammar.start(), self.__strategy, tree_class=tree_class)"""
281 def parses(self, root, strategy, tree_class=Tree):
282 trees = set()
283 for goal_edge in strategy.goal_edges(self, root):
284 trees.update(self.get_trees(goal_edge))
285 return list(trees)