1 // Base for collection class template
2 // Copyright © 2009 The University of Chicago
7 #include <q3textstream.h>
8 #include <q3listview.h>
11 #include <QTextOStream>
13 #include "StringFunc.h"
14 #include "StringSurrogate.h"
17 class CMorphemeCollection
;
20 class CWordCollection
;
21 class LinguisticaMainWindow
;
24 //typedef QMap<QString,QString> StringToString;
27 class CTrieListViewItem
: public Q3ListViewItem
30 CTrieListViewItem( Q3ListView
* parent
= NULL
);
31 CTrieListViewItem( Q3ListView
*parent
,
32 QString
= QString::null
);
33 CTrieListViewItem( Q3ListViewItem
*parent
,
34 QString
= QString::null
);
36 virtual QString
text ( int ) const;
37 virtual QString
key ( int, bool ) const;
41 /// m_Key[0]..m_Key[m_KeyLength - 1] is a string containing this
43 /// The memory pointed to is owned by the node.
49 /// Callers can optionally store data in m_Pointer.
50 /// If m_AutoDelete is true, CNode takes responsibility for deleting it.
51 /// XXX. Unfortunately, in that case, the object’s destructor will not
55 /// m_Pointers[0]..m_Pointers[m_NumberOfBranches - 1]
56 /// point to child nodes.
58 /// Each child’s key is this node’s key with the corresponding char
59 /// from m_Letters[0]..m_Letters[m_NumberOfBranches - 1] appended.
61 /// The memory for the m_Pointers and m_Letters arrays and the child
62 /// nodes are owned by this node.
64 int m_NumberOfBranches
;
68 /// We can "activate" just part of the trie with this
72 /// This node’s key is m_Key[0]..m_Key[m_BreakPoint - 1],
73 /// and its parent’s key is m_Key[0]..m_Key[m_StartPoint - 1].
74 /// Thus, 0 <= m_StartPoint <= m_BreakPoint <= m_KeyLength always holds,
75 /// and m_StartPoint < m_BreakPoint for all nodes except the root.
79 bool m_MorphemeBoundary
; // FSA
81 // construction/destruction.
84 CNode(const CStringSurrogate
& key
, int start_index
, int break_index
);
85 CNode(QString key
, int start_index
, int break_index
);
90 CNode(const CNode
& x
);
91 CNode
& operator=(const CNode
& x
);
95 CStringSurrogate
GetKey();
96 int GetKeyLength() { return m_KeyLength
; }
97 QChar
* GetKeyPointer() { return m_Key
; }
101 CNode
* GetBranch(int i
) const
103 if (i
>= m_NumberOfBranches
|| i
< 0 )
105 return m_Pointers
[i
];
107 int GetNumberOfBranches() const { return m_NumberOfBranches
; }
108 void SetNumberOfBranches(int n
) { m_NumberOfBranches
= n
; }
109 CNode
* FindLetter(QChar ch
);
110 /// number of children, plus 1 if ExistenceFlag is true
115 void DoesNotExist() { m_ExistenceFlag
= false; }
116 void Exists () { m_ExistenceFlag
= true; }
117 void* Get_T_Pointer();
118 void SetPointer(void* pT
) { m_Pointer
= pT
; }
119 /// delete associated data for descendent nodes when they
121 /// XXX. if set to true, the associated data will only be
122 /// deleted as a void pointer. This results in undefined
123 /// behavior, since no destructor is called.
124 void SetAutoDelete(bool b
= true);
128 int GetCorpusCount () const { return m_CorpusCount
; }
129 void SetCorpusCount(int n
) { m_CorpusCount
= n
; }
133 CNode
* Find1(const CStringSurrogate key
, bool PartialOK
= 0 );
137 CNode
* Insert(CStringSurrogate
& str
, int& num_nodes
,
138 CNode
** new_terminal
= 0, int* result
= 0);
139 CNode
** GetLink(const CStringSurrogate str
);
140 CNode
** GetLink(CStringSurrogate
& str
);
141 /// Requires: this node’s key is a prefix of new_node’s
142 /// If this node’s and new_node’s keys coincide, does nothing
143 /// and result is null.
144 /// Otherwise, let c be the first character after this node’s
145 /// key in new_node’s key.
147 /// If this node has a child corresponding to the letter c,
148 /// result is a pointer to the corresponding edge.
150 /// If not, this node adopts new_node as a child corresponding
151 /// to the letter c, and the result is a pointer to the new edge.
154 /// CNode* new_node = acquire_new_node_somehow();
155 /// for (CNode** edge = &root;
157 /// edge = (*edge)->GetLink(new_node))
160 CNode
** GetLink(CNode
* new_node
);
164 /// invoke CStem::CutRightBeforeHere(index) on each word from words
165 /// found in the trie
166 void BreakAllWordsBelowHere(int index
, CWordCollection
* words
);
167 /// result is count plus the number of nodes with ExistenceFlag true
168 int ComputeNumberOfEntries(int count
);
169 /// result is the number of prefixes of str that exist in the trie
170 /// XXX. some false positives might be included in that count
171 int CountValidSubstrings(CStringSurrogate
& str
);
172 /// invoke CStem::CutRightBeforeHere(index) on
173 /// static_cast<CStem*>(m_Pointer) for each descendent node
174 void CutAllWordsHereAndBelow_AfterNLetters(int index
); // FSA
175 /// append existing descendents’ key.mid(Offset)s to *out.
176 void FindAllTerminalsBelowHere(CParse
* out
, int Offset
= 0);
177 /// deepest descendent for which GetCountBelow() > count / 2
178 /// if no such descendent exists, result is null.
179 CNode
* FindLowestMajorityNode(int count
);
180 /// append each descendent with GetWidth() > threshold to out
181 void FindSFBumps(CMorphemeCollection
& out
, int threshold
);
182 void MakeCutsAtMorphemeBoundary(int depth
); // FSA
183 void MakeCutsWhereCountExceeds(int count
, int start
, int end
);
184 bool MakeMorphemeBoundariesAtThisWidth(int n
, int min_stem_len
,
185 int this_len
); // FSA
186 CNode
* SearchForPrefix(CStringSurrogate
& str
, int& result
,
191 /// put child nodes in alphabetical order,
192 /// then call Alphabetize() on each child node.
197 QString
Display(int tabs
, bool reversed
);
198 void DumpVisibleToLogFile(QTextOStream
* out
, bool reversed
);
199 // void DumpVisibleWords(CWordCollection* out, bool reversed);
200 /// list all descendent nodes as out[i], out[i+1], ... and set i
201 /// to the index after the last descendent.
202 void CreateNodeArray(CNode
** out
, int& i
);
203 /// list existent descendent nodes as out[i], out[i+1], ... and
204 /// set i to the index after the last existing descendent
205 void MakeATerminalPointerArray(CNode
** out
, int& i
);
206 /// cut *parse at each index where there is an existing node
207 /// and save the corresponding corpus counts as piece values.
208 int ProjectTrieStructureOntoWord(CParse
* parse
, int position
= 0);
210 // number of existing descendents.
212 int GetCountBelow() { return m_CountBelow
; }
213 void SetCountBelow(int n
) { m_CountBelow
= n
; }
217 void ListDisplay(Q3ListView
* widget
,
218 LinguisticaMainWindow
* doc
= 0,
220 QMap
<QString
, QString
>* filter
= 0,
221 bool ReverseFlag
= false);
222 void ListDisplay(Q3ListViewItem
* parent
,
223 LinguisticaMainWindow
* doc
= 0,
225 QMap
<QString
, QString
>* filter
= 0,
226 bool ReverseFlag
= false);
228 // invisible branches.
230 /// if Flag is TRUE, make all visible, else all invisible
231 void MakeAllVisible(bool flag
);
232 bool MakeVisible(const CStringSurrogate str
);
240 /// List of all nodes, for iterating over all nodes.
241 /// created when required by CreateNodeArray(), owned by the trie
242 /// not updated as the trie changes
243 mutable CNode
** m_NodeArray
;
244 bool m_ReverseFlag
; // For output functions
246 /// Number of nodes below root (always >= 0).
249 bool m_TrieHasChangedFlag
;
250 bool m_IsAlphabetized
;
252 // construction/destruction.
254 explicit CTrie(bool ReverseFlag
= false);
257 /// delete associated data for nodes when they are removed
258 /// or trie is destroyed?
259 /// XXX. if set to true, the associated data will only be
260 /// deleted as a void pointer. This results in undefined
261 /// behavior, since no destructor is called.
262 void SetAutoDelete(bool b
= true);
266 CTrie(const CTrie
& x
);
267 CTrie
& operator=(const CTrie
& x
);
271 /// Replace with a default-constructed trie,
272 /// except remember the reversed and auto-delete flags.
273 void ResetToEmpty(); // just removes structure;
277 CNode
* operator<<(CStringSurrogate str
);
278 CNode
* operator<<(CParse str
) { return *this << &str
; }
280 CNode
* operator<<(CParse
* str
);
281 CNode
* Insert(CStringSurrogate str
);
282 CNode
* Insert(CStringSurrogate str
, int* result
);
286 bool RemoveFromTrie(const CStringSurrogate
,
287 bool RemovePointerFlag
= true);
291 CNode
* Find1(CStringSurrogate
, bool PartialOK
= 0);
295 int ComputeNumberOfEntries();
296 int GetCount() { return m_Count
; }
297 int GetSize() { return m_Count
; }
302 void CreateNodeArray();
303 CNode
** GetNodeArray() { return m_NodeArray
; }
311 bool IsReverse() { return m_ReverseFlag
; }
318 /// result is the number of prefixes of str that exist in the trie
319 int CountValidSubstrings(CStringSurrogate
& str
);
320 /// write a list of suffixes corresponding to prefix to *out
321 void CreateSuffixSignatures(const CStringSurrogate
& prefix
, CParse
* out
);
323 void CreateSuffixSignatures(const CStringSurrogate
*, CParse
*);
324 void FindSFBumps(CMorphemeCollection
& out
, int threshold
);
326 /// deepest descendent for which GetCountBelow() > GetCount() / 2
327 CNode
* FindLowestMajorityNode();
331 void IncrementCount(int n
= 1) { m_Count
+= n
; }
335 void MakeAllNodesVisible(bool flag
); // see MakeVisible
336 bool MakeVisible(const CStringSurrogate key
);
340 /// requires: Array[0]..Array[GetCount() - 1] are valid to write to
341 /// list existent nodes to those locations
342 void MakeATerminalPointerArray(CNode
** Array
);
344 void ListDisplay(Q3ListView
* widget
,
345 LinguisticaMainWindow
* doc
= 0,
346 QMap
<QString
, QString
>* filter
= 0,
347 bool ReverseFlag
= false);
348 /// cut *parse at each index where there is an existing node
349 /// and save the corresponding corpus counts as piece values.
350 int ProjectTrieStructureOntoWord(CParse
* parse
);
352 void MakeCutsAtMorphemeBoundary(); // FSA
353 void MakeCutsWhereCountExceeds(int, int start
= 1, int end
= 10000);
354 void MakeMorphemeBoundariesAtThisWidth(int, int); // FSA
355 CNode
* SearchForPrefix(CStringSurrogate
& str
, int& result
);