CMiniLexicon::FindMajorSignatures(): clean up whitespace
[linguistica.git] / Trie.h
blob2756b351a09fda2f7ac19d1933ce5b4308a1d261
1 // Base for collection class template
2 // Copyright © 2009 The University of Chicago
3 #ifndef TRIE_H
4 #define TRIE_H
6 #include <QList>
7 #include <q3textstream.h>
8 #include <q3listview.h>
9 #include <q3dict.h>
10 #include <qstring.h>
11 #include <QTextOStream>
13 #include "StringFunc.h"
14 #include "StringSurrogate.h"
15 #include "Parse.h"
17 class CMorphemeCollection;
18 class CNode;
19 class CTrie;
20 class CWordCollection;
21 class LinguisticaMainWindow;
24 //typedef QMap<QString,QString> StringToString;
27 class CTrieListViewItem : public Q3ListViewItem
29 public:
30 CTrieListViewItem( Q3ListView* parent = NULL );
31 CTrieListViewItem( Q3ListView *parent,
32 QString = QString::null );
33 CTrieListViewItem( Q3ListViewItem *parent,
34 QString = QString::null );
36 virtual QString text ( int ) const;
37 virtual QString key ( int, bool ) const;
40 class CNode {
41 /// m_Key[0]..m_Key[m_KeyLength - 1] is a string containing this
42 /// node’s key.
43 /// The memory pointed to is owned by the node.
44 QChar* m_Key;
45 int m_KeyLength;
47 int m_CorpusCount;
49 /// Callers can optionally store data in m_Pointer.
50 /// If m_AutoDelete is true, CNode takes responsibility for deleting it.
51 /// XXX. Unfortunately, in that case, the object’s destructor will not
52 /// be called.
53 void* m_Pointer;
55 /// m_Pointers[0]..m_Pointers[m_NumberOfBranches - 1]
56 /// point to child nodes.
57 ///
58 /// Each child’s key is this node’s key with the corresponding char
59 /// from m_Letters[0]..m_Letters[m_NumberOfBranches - 1] appended.
60 ///
61 /// The memory for the m_Pointers and m_Letters arrays and the child
62 /// nodes are owned by this node.
63 QChar* m_Letters;
64 int m_NumberOfBranches;
65 CNode** m_Pointers;
67 int m_CountBelow;
68 /// We can "activate" just part of the trie with this
69 bool m_Visible;
70 bool m_AutoDelete;
71 public:
72 /// This node’s key is m_Key[0]..m_Key[m_BreakPoint - 1],
73 /// and its parent’s key is m_Key[0]..m_Key[m_StartPoint - 1].
74 /// Thus, 0 <= m_StartPoint <= m_BreakPoint <= m_KeyLength always holds,
75 /// and m_StartPoint < m_BreakPoint for all nodes except the root.
76 int m_StartPoint;
77 int m_BreakPoint;
78 bool m_ExistenceFlag;
79 bool m_MorphemeBoundary; // FSA
80 public:
81 // construction/destruction.
83 CNode();
84 CNode(const CStringSurrogate& key, int start_index, int break_index);
85 CNode(QString key, int start_index, int break_index);
86 ~CNode();
88 // disable copy
89 private:
90 CNode(const CNode& x);
91 CNode& operator=(const CNode& x);
92 public:
93 // label.
95 CStringSurrogate GetKey();
96 int GetKeyLength() { return m_KeyLength; }
97 QChar* GetKeyPointer() { return m_Key; }
99 // child nodes.
101 CNode* GetBranch(int i) const
103 if (i >= m_NumberOfBranches || i < 0 )
104 return 0;
105 return m_Pointers[i];
107 int GetNumberOfBranches() const { return m_NumberOfBranches; }
108 void SetNumberOfBranches(int n) { m_NumberOfBranches = n; }
109 CNode* FindLetter(QChar ch);
110 /// number of children, plus 1 if ExistenceFlag is true
111 int GetWidth();
113 // associated data.
115 void DoesNotExist() { m_ExistenceFlag = false; }
116 void Exists () { m_ExistenceFlag = true; }
117 void* Get_T_Pointer();
118 void SetPointer(void* pT) { m_Pointer = pT; }
119 /// delete associated data for descendent nodes when they
120 /// are destroyed?
121 /// XXX. if set to true, the associated data will only be
122 /// deleted as a void pointer. This results in undefined
123 /// behavior, since no destructor is called.
124 void SetAutoDelete(bool b = true);
126 // corpus count.
128 int GetCorpusCount () const { return m_CorpusCount; }
129 void SetCorpusCount(int n) { m_CorpusCount = n; }
131 // find.
133 CNode* Find1(const CStringSurrogate key, bool PartialOK = 0 );
135 // insert.
137 CNode* Insert(CStringSurrogate& str, int& num_nodes,
138 CNode** new_terminal = 0, int* result = 0);
139 CNode** GetLink(const CStringSurrogate str);
140 CNode** GetLink(CStringSurrogate& str);
141 /// Requires: this node’s key is a prefix of new_node’s
142 /// If this node’s and new_node’s keys coincide, does nothing
143 /// and result is null.
144 /// Otherwise, let c be the first character after this node’s
145 /// key in new_node’s key.
147 /// If this node has a child corresponding to the letter c,
148 /// result is a pointer to the corresponding edge.
150 /// If not, this node adopts new_node as a child corresponding
151 /// to the letter c, and the result is a pointer to the new edge.
153 /// Example:
154 /// CNode* new_node = acquire_new_node_somehow();
155 /// for (CNode** edge = &root;
156 /// edge != 0;
157 /// edge = (*edge)->GetLink(new_node))
158 /// // do nothing
159 /// ;
160 CNode** GetLink(CNode* new_node);
162 // iteration.
164 /// invoke CStem::CutRightBeforeHere(index) on each word from words
165 /// found in the trie
166 void BreakAllWordsBelowHere(int index, CWordCollection* words);
167 /// result is count plus the number of nodes with ExistenceFlag true
168 int ComputeNumberOfEntries(int count);
169 /// result is the number of prefixes of str that exist in the trie
170 /// XXX. some false positives might be included in that count
171 int CountValidSubstrings(CStringSurrogate& str);
172 /// invoke CStem::CutRightBeforeHere(index) on
173 /// static_cast<CStem*>(m_Pointer) for each descendent node
174 void CutAllWordsHereAndBelow_AfterNLetters(int index); // FSA
175 /// append existing descendents’ key.mid(Offset)s to *out.
176 void FindAllTerminalsBelowHere(CParse* out, int Offset = 0);
177 /// deepest descendent for which GetCountBelow() > count / 2
178 /// if no such descendent exists, result is null.
179 CNode* FindLowestMajorityNode(int count);
180 /// append each descendent with GetWidth() > threshold to out
181 void FindSFBumps(CMorphemeCollection& out, int threshold);
182 void MakeCutsAtMorphemeBoundary(int depth); // FSA
183 void MakeCutsWhereCountExceeds(int count, int start, int end);
184 bool MakeMorphemeBoundariesAtThisWidth(int n, int min_stem_len,
185 int this_len); // FSA
186 CNode* SearchForPrefix(CStringSurrogate& str, int& result,
187 int Letterno = 0);
189 // sort.
191 /// put child nodes in alphabetical order,
192 /// then call Alphabetize() on each child node.
193 void Alphabetize();
195 // output.
197 QString Display(int tabs, bool reversed);
198 void DumpVisibleToLogFile(QTextOStream* out, bool reversed);
199 // void DumpVisibleWords(CWordCollection* out, bool reversed);
200 /// list all descendent nodes as out[i], out[i+1], ... and set i
201 /// to the index after the last descendent.
202 void CreateNodeArray(CNode** out, int& i);
203 /// list existent descendent nodes as out[i], out[i+1], ... and
204 /// set i to the index after the last existing descendent
205 void MakeATerminalPointerArray(CNode** out, int& i);
206 /// cut *parse at each index where there is an existing node
207 /// and save the corresponding corpus counts as piece values.
208 int ProjectTrieStructureOntoWord(CParse* parse, int position = 0);
210 // number of existing descendents.
212 int GetCountBelow() { return m_CountBelow; }
213 void SetCountBelow(int n) { m_CountBelow = n; }
215 // output to GUI.
217 void ListDisplay(Q3ListView* widget,
218 LinguisticaMainWindow* doc = 0,
219 int* count = 0,
220 QMap<QString, QString>* filter = 0,
221 bool ReverseFlag = false);
222 void ListDisplay(Q3ListViewItem* parent,
223 LinguisticaMainWindow* doc = 0,
224 int* count = 0,
225 QMap<QString, QString>* filter = 0,
226 bool ReverseFlag = false);
228 // invisible branches.
230 /// if Flag is TRUE, make all visible, else all invisible
231 void MakeAllVisible(bool flag);
232 bool MakeVisible(const CStringSurrogate str);
234 friend class CTrie;
237 class CTrie {
238 protected:
239 CNode* m_Root;
240 /// List of all nodes, for iterating over all nodes.
241 /// created when required by CreateNodeArray(), owned by the trie
242 /// not updated as the trie changes
243 mutable CNode** m_NodeArray;
244 bool m_ReverseFlag; // For output functions
245 bool m_AutoDelete;
246 /// Number of nodes below root (always >= 0).
247 int m_Count;
248 int m_NumberOfNodes;
249 bool m_TrieHasChangedFlag;
250 bool m_IsAlphabetized;
251 public:
252 // construction/destruction.
254 explicit CTrie(bool ReverseFlag = false);
255 virtual ~CTrie();
257 /// delete associated data for nodes when they are removed
258 /// or trie is destroyed?
259 /// XXX. if set to true, the associated data will only be
260 /// deleted as a void pointer. This results in undefined
261 /// behavior, since no destructor is called.
262 void SetAutoDelete(bool b = true);
264 // disable copy.
265 private:
266 CTrie(const CTrie& x);
267 CTrie& operator=(const CTrie& x);
268 public:
269 // assign.
271 /// Replace with a default-constructed trie,
272 /// except remember the reversed and auto-delete flags.
273 void ResetToEmpty(); // just removes structure;
275 // insert.
277 CNode* operator<<(CStringSurrogate str);
278 CNode* operator<<(CParse str) { return *this << &str; }
279 /// deprecated
280 CNode* operator<<(CParse* str);
281 CNode* Insert(CStringSurrogate str);
282 CNode* Insert(CStringSurrogate str, int* result);
284 // remove.
286 bool RemoveFromTrie(const CStringSurrogate,
287 bool RemovePointerFlag = true);
289 // find.
291 CNode* Find1(CStringSurrogate, bool PartialOK = 0);
293 // size.
295 int ComputeNumberOfEntries();
296 int GetCount() { return m_Count; }
297 int GetSize() { return m_Count; }
299 // output.
301 QString Display();
302 void CreateNodeArray();
303 CNode** GetNodeArray() { return m_NodeArray; }
305 // sort.
307 void Alphabetize();
309 // reverse keys.
311 bool IsReverse() { return m_ReverseFlag; }
313 /// root node
314 CNode* GetRoot1();
316 // affix discovery.
318 /// result is the number of prefixes of str that exist in the trie
319 int CountValidSubstrings(CStringSurrogate& str);
320 /// write a list of suffixes corresponding to prefix to *out
321 void CreateSuffixSignatures(const CStringSurrogate& prefix, CParse* out);
322 /// deprecated
323 void CreateSuffixSignatures(const CStringSurrogate*, CParse*);
324 void FindSFBumps(CMorphemeCollection& out, int threshold);
326 /// deepest descendent for which GetCountBelow() > GetCount() / 2
327 CNode* FindLowestMajorityNode();
329 // node count.
331 void IncrementCount(int n = 1) { m_Count += n; }
333 // hide branch.
335 void MakeAllNodesVisible(bool flag); // see MakeVisible
336 bool MakeVisible(const CStringSurrogate key);
338 // output.
340 /// requires: Array[0]..Array[GetCount() - 1] are valid to write to
341 /// list existent nodes to those locations
342 void MakeATerminalPointerArray(CNode** Array);
343 /// output to GUI
344 void ListDisplay(Q3ListView* widget,
345 LinguisticaMainWindow* doc = 0,
346 QMap<QString, QString>* filter = 0,
347 bool ReverseFlag = false);
348 /// cut *parse at each index where there is an existing node
349 /// and save the corresponding corpus counts as piece values.
350 int ProjectTrieStructureOntoWord(CParse* parse);
352 void MakeCutsAtMorphemeBoundary(); // FSA
353 void MakeCutsWhereCountExceeds(int, int start = 1, int end = 10000);
354 void MakeMorphemeBoundariesAtThisWidth(int, int); // FSA
355 CNode* SearchForPrefix(CStringSurrogate& str, int& result);
358 #endif // TRIE_H