HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Trie.cpp
bloba9c3e6f37078e604de830c2975dd9e3b54d53bb9
1 // Implementation of CTrie methods
2 // Copyright © 2009 The University of Chicago
3 #include "Trie.h"
5 #include <memory>
6 #include <vector>
7 #include <algorithm>
8 #include <functional>
9 #include <QTextOStream>
10 #include "linguisticamainwindow.h"
11 #include "ui/Status.h"
12 #include "StringSurrogate.h"
13 #include "Parse.h"
14 #include "Stem.h"
15 #include "MorphemeCollection.h"
16 #include "WordCollection.h"
17 #include "CompareFunc.h"
18 #include "Typedefs.h"
20 // CTrieListViewItem
21 /////////////////////////////////////////////////////////////
23 CTrieListViewItem::CTrieListViewItem( Q3ListView *parent,
24 QString label1 )
25 : Q3ListViewItem( parent, label1 )
29 CTrieListViewItem::CTrieListViewItem( Q3ListViewItem *parent,
30 QString label1 )
31 : Q3ListViewItem( parent, label1 )
34 QString CTrieListViewItem::key( int column, bool ascending ) const
36 //switch( column )
37 //{
38 //default:
39 return Q3ListViewItem::key( column, ascending );
40 //}
43 QString CTrieListViewItem::text( int column ) const
45 //switch( column )
46 //{
47 //default:
48 return Q3ListViewItem::text( column );
49 //}
53 // CTrie
54 /////////////////////////////////////////////////////////////
56 CTrie::CTrie(bool ReverseFlag)
57 : m_Root(new CNode()),
58 m_NodeArray(),
59 m_ReverseFlag(ReverseFlag),
60 m_AutoDelete(true),
61 m_Count(0),
62 m_NumberOfNodes(1),
63 m_TrieHasChangedFlag(true),
64 m_IsAlphabetized(false) { }
66 CTrie::~CTrie()
68 delete m_Root;
69 delete[] m_NodeArray;
72 void CTrie::ResetToEmpty()
74 delete m_Root; // destructor deletes all child nodes
75 m_Root = new CNode();
76 m_Count = 0;
77 m_NumberOfNodes = 1;
78 m_TrieHasChangedFlag = true;
79 m_IsAlphabetized = false;
80 // m_NodeArray, m_ReverseFlag, and m_AutoDelete remain unchanged.
83 CNode* CTrie::operator<< ( CStringSurrogate ssS )
85 return Insert(ssS);
89 CNode* CTrie::operator<< ( CParse* Parse )
91 CStringSurrogate ss = Parse->GetKey();
92 return Insert( ss );
95 void CTrie::ListDisplay(Q3ListView* List, LinguisticaMainWindow* doc,
96 QMap<QString, QString>* filter, bool ReverseFlag)
98 int count = 0;
99 // Remove all previous columns
100 while( List->columns() ) List->removeColumn( 0 );
102 // Add column header
103 if( m_ReverseFlag ) List->addColumn( "Reverse Trie" );
104 else List->addColumn( "Forward Trie" );
106 if( doc ) // TODO: is doc set and if so are these functions below right
108 doc->status_display().major_operation = "Preparing trie for display";
109 doc->status_display().progress.set_denominator(m_Count);
110 m_Root->ListDisplay( List, doc, &count, filter, ReverseFlag );
111 doc->status_display().major_operation.clear();
112 doc->status_display().progress.clear();
114 else m_Root->ListDisplay( List, NULL, NULL, filter, ReverseFlag );
118 void CTrie::FindSFBumps( CMorphemeCollection& TempMorphemes, int SFThreshold )
120 m_Root->FindSFBumps ( TempMorphemes, SFThreshold );
124 void CTrie::MakeCutsWhereCountExceeds( int count, int start, int end )
126 m_Root->MakeCutsWhereCountExceeds( count, start, end );
129 //----------------------------------------------------------------
130 ////////////////////////////////////////////////////////////////
131 //----------------------------------------------------------------
133 // Insert
135 CNode* CTrie::Insert( CStringSurrogate ssS, int* pResult )
137 CNode* pNode = NULL;
139 m_Root->Insert(ssS, m_NumberOfNodes, &pNode, pResult);
141 if ( pResult && *pResult == 1 )
143 m_Count++;
146 m_TrieHasChangedFlag = TRUE;
147 m_IsAlphabetized = FALSE;
149 return pNode;
152 CNode* CTrie::Insert(CStringSurrogate ssS)
154 int Result = 0;
155 int* pResult = &Result;
156 CNode* pNode = NULL;
158 m_Root->Insert(ssS, m_NumberOfNodes, &pNode, pResult);
160 if ( pResult && *pResult == 1 )
162 m_Count++;
165 m_TrieHasChangedFlag = TRUE;
166 m_IsAlphabetized = FALSE;
168 return pNode;
172 //----------------------------------------------------------------
173 ////////////////////////////////////////////////////////////////
174 //----------------------------------------------------------------
178 void CTrie::MakeATerminalPointerArray( CNode** Array )
180 int Index = 0;
182 m_Root->MakeATerminalPointerArray( Array, Index );
183 Q_ASSERT ( (int) Index == m_Count );
189 int CTrie::CountValidSubstrings( CStringSurrogate& ss )
191 return m_Root->CountValidSubstrings( ss );
196 CNode* CTrie::Find1 ( CStringSurrogate SS, bool PartialOK )
198 CNode* pNode = m_Root->Find1(SS, PartialOK);
199 return pNode;
202 bool CTrie::RemoveFromTrie ( const CStringSurrogate string, bool RemovePointerFlag )
204 CNode* pNode = m_Root;
205 CNode* qNode = NULL;
206 m_TrieHasChangedFlag = TRUE;
207 while ( pNode ) {
208 for (int i = pNode->m_StartPoint + 1; i < pNode->m_BreakPoint; i++ ) {
209 if ( string[i] != pNode->m_Key [i] ) {
210 return FALSE;
213 QChar c = string [ pNode->m_BreakPoint ];
214 qNode = pNode->FindLetter( c );
215 if ( qNode == NULL ) {
216 return FALSE;
218 if (qNode->GetKey() == string ) {
219 Q_ASSERT ( qNode->m_Pointer );
220 qNode->m_ExistenceFlag = FALSE;
222 if ( RemovePointerFlag && m_AutoDelete ) {
223 delete qNode->m_Pointer;
225 qNode->m_Pointer = NULL;
227 if ( qNode->m_Letters == NULL ) {
228 delete qNode;
229 m_NumberOfNodes--;
231 QChar* NewLetters = new QChar [ pNode->GetNumberOfBranches() - 1 ];
232 CNode** NewPointers = new CNode* [ pNode->GetNumberOfBranches() - 1 ];
233 int k = 0;
234 for (int j = 0; j < (int)pNode->GetNumberOfBranches(); j++) {
235 if ( c == pNode->m_Letters[j] ) { continue; }
236 NewLetters[k] = pNode->m_Letters[j];
237 NewPointers[k] = pNode->m_Pointers[j];
238 k++;
240 Q_ASSERT ( k == (int)pNode->GetNumberOfBranches()-1 );
241 delete [] pNode->m_Letters; pNode->m_Letters = NewLetters;
242 delete [] pNode->m_Pointers; pNode->m_Pointers = NewPointers;
243 pNode->SetNumberOfBranches(k);
245 else {
246 qNode->DoesNotExist();
248 m_Count--;
249 return TRUE;
251 else {
252 pNode = qNode;
253 continue;
256 Q_ASSERT (FALSE);
257 return FALSE;
261 CNode* CTrie::SearchForPrefix(CStringSurrogate& CStringSurrogate, int& Result )
263 CNode* pNode = NULL;
264 Result = 0;
265 pNode = m_Root->SearchForPrefix (CStringSurrogate, Result );
267 return pNode;
273 //----------------------------------------------------------------
274 ////////////////////////////////////////////////////////////////
275 //----------------------------------------------------------------
277 // Get At
280 CNode* CTrie::GetRoot1()
282 return m_Root;
285 //----------------------------------------------------------------
286 ////////////////////////////////////////////////////////////////
287 //----------------------------------------------------------------
289 // Alphabetize
290 void CTrie::Alphabetize()
292 if ( m_IsAlphabetized == FALSE)
294 m_Root->Alphabetize();
296 m_IsAlphabetized = TRUE;
299 int CTrie::ComputeNumberOfEntries()
301 return m_Root->ComputeNumberOfEntries(0);
304 void CTrie::CreateNodeArray()
306 // XXX. avoid delete/new cycle if there’s room
307 delete[] m_NodeArray;
308 m_NodeArray = new CNode*[m_NumberOfNodes];
310 int Index = 0;
311 m_Root->CreateNodeArray(m_NodeArray, Index);
313 Q_ASSERT(Index = GetCount());
316 void CTrie::CreateSuffixSignatures(const CStringSurrogate& prefix, CParse* out)
317 { CreateSuffixSignatures(&prefix, out); }
319 //----------------------------------------------------------------------------
320 void CTrie::CreateSuffixSignatures ( const CStringSurrogate* Prefix, CParse* pSignature)
322 // This finds the location of Prefix in the Trie, and takes all of the
323 // suffixes to it and creates a "signature" of them.
325 CNode* pNode;
326 int nResult;
327 CStringSurrogate ssInitial;
328 CStringSurrogate ssPrefix = *Prefix;
330 Alphabetize();
333 pNode = SearchForPrefix ( ssPrefix, nResult );
335 Q_ASSERT ( nResult != 0 );
337 if ( pNode && nResult == 2 )
339 ssInitial = pNode->GetKey().Mid( ssPrefix.GetLength() );
342 int Offset = ssPrefix.GetLength();
343 if( pNode ) pNode->FindAllTerminalsBelowHere( pSignature, Offset );
347 //------------------------------------------------------------------------------------
348 // find the deepest node in the Trie whose count is more than half of m_Count
349 CNode* CTrie::FindLowestMajorityNode()
351 CNode* pNode = m_Root->FindLowestMajorityNode( m_Count );
353 if ( pNode == m_Root ) { return NULL; }
354 else
356 return pNode;
362 void CTrie::MakeAllNodesVisible(bool Flag)
364 m_Root->MakeAllVisible(Flag);
368 bool CTrie::MakeVisible( const CStringSurrogate string )
370 return m_Root->MakeVisible(string);
375 //----------------------------------------------------------------//
376 ////////////////////////////////////////////////////////////////
377 //----------------------------------------------------------------//
379 // New functions added for FSA
382 void CTrie::MakeMorphemeBoundariesAtThisWidth(int n, int MinimumStemLength)
385 int ThisLength = 0;
386 m_Root->MakeMorphemeBoundariesAtThisWidth(n, MinimumStemLength, ThisLength);
393 //----------------------------------------------------------------//
394 ////////////////////////////////////////////////////////////////
395 //----------------------------------------------------------------//
397 void CTrie::MakeCutsAtMorphemeBoundary()
399 int depth = 0;
400 m_Root->MakeCutsAtMorphemeBoundary( depth );
404 void CTrie::SetAutoDelete( bool b )
406 m_AutoDelete = b;
407 m_Root->SetAutoDelete(b);
412 // CNode
413 /////////////////////////////////////////////////////////////
415 CNode::CNode(QString s, int StartPoint, int BreakPoint)
416 : m_Key(new QChar[s.length()]), // filled below
417 m_KeyLength(s.length()),
418 m_CorpusCount(0),
419 m_Pointer(),
420 m_Letters(),
421 m_NumberOfBranches(0),
422 m_Pointers(),
423 m_CountBelow(0),
424 m_Visible(false),
425 m_AutoDelete(true), // *this owns m_Pointer
426 // public:
427 m_StartPoint(StartPoint),
428 m_BreakPoint(BreakPoint),
429 m_ExistenceFlag(false),
430 m_MorphemeBoundary(false)
431 { LxStrCpy(s, m_Key, s.length()); }
433 CNode::CNode(const CStringSurrogate& SS, int StartPoint, int BreakPoint)
434 : m_Key(new QChar[SS.GetLength()]), // filled below
435 m_KeyLength(SS.GetLength()),
436 m_CorpusCount(0),
437 m_Pointer(),
438 m_Letters(),
439 m_NumberOfBranches(0),
440 m_Pointers(),
441 m_CountBelow(0),
442 m_Visible(false),
443 m_AutoDelete(true), // *this owns m_Pointer
444 // public:
445 m_StartPoint(StartPoint),
446 m_BreakPoint(BreakPoint),
447 m_ExistenceFlag(false),
448 m_MorphemeBoundary(false)
449 { LxStrCpy(&SS, m_Key, SS.GetLength()); }
451 CNode::CNode()
452 : m_Key(),
453 m_KeyLength(0),
454 m_CorpusCount(0),
455 m_Pointer(),
456 m_Letters(),
457 m_NumberOfBranches(0),
458 m_Pointers(),
459 m_CountBelow(0),
460 m_Visible(false),
461 m_AutoDelete(true), // *this owns m_Pointer
462 // public:
463 m_StartPoint(0),
464 m_BreakPoint(0),
465 m_ExistenceFlag(false),
466 m_MorphemeBoundary(false) { }
468 CNode::~CNode()
470 // Delete child nodes
471 if (m_Letters != 0) {
472 for (int i=0; i < m_NumberOfBranches; ++i)
473 delete m_Pointers[i];
474 delete[] m_Pointers;
477 // If we own it, delete data
478 if (m_AutoDelete)
479 delete m_Pointer;
481 delete[] m_Key;
482 delete[] m_Letters;
485 void CNode::SetAutoDelete( bool b )
487 m_AutoDelete = b;
488 for( int i=0; m_Letters && i < m_NumberOfBranches; i++ )
490 m_Pointers[i]->SetAutoDelete(b);
495 int CNode::CountValidSubstrings( CStringSurrogate& ss )
497 int count = 0;
499 if( m_ExistenceFlag ) count++;
501 CNode* pNode = FindLetter( ss[0] );
502 if( pNode )
504 int step = pNode->m_BreakPoint - m_BreakPoint;
505 CStringSurrogate ssMid = ss.Mid( step );
506 count += pNode->CountValidSubstrings( ssMid );
509 return count;
513 void CNode::FindSFBumps ( CMorphemeCollection& TempMorphemes, int SFThreshold )
515 if ( GetWidth() > SFThreshold )
517 TempMorphemes << CStringSurrogate( m_Key, 0, m_KeyLength );
520 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
522 m_Pointers[i]->FindSFBumps ( TempMorphemes, SFThreshold );
527 void CNode::BreakAllWordsBelowHere(int BreakLocation, CWordCollection* Words )
529 CStem* pWord;
531 CStringSurrogate ssWord;
533 if ( m_KeyLength > 2 && m_KeyLength > BreakLocation + 1)
535 ssWord = CSS(m_Key, 0, m_KeyLength);
537 ssWord = ssWord.Mid(1, ssWord.GetLength() - 2 );// eliminate first and last #
538 pWord = *Words ^= ssWord;
539 if ( pWord )
540 pWord->CutRightBeforeHere (BreakLocation );
545 CNode* CNode::Insert(CStringSurrogate& SS, int& NumberOfNodes,
546 CNode** ppNode, int* pResult)
548 using std::auto_ptr;
549 using std::copy;
550 using std::swap;
552 // The node that is returned is the node immediately down from
553 // node that called this function in the first place.
554 // It may be "This" or it may not.
555 // ppNode is a pointer to the Terminal node that is eventually
556 // identified or created for the string in question.
557 CStringSurrogate ssKey = GetKey();
558 auto_ptr<CNode> this_node(this);
560 // Result: 1: new node added
561 // 2: node already existed
563 //-----------------------------------------------------------------------//
565 // 1: A difference before m_BreakPoint: it's definitely a new string.
566 // should i be initialized as m_StartPoint + 1??
567 int i = m_StartPoint;
568 for (; i < m_BreakPoint && i < SS.GetLength(); ++i) {
569 // XXX. SS and ssKey could be backwards, hence the obscure test.
570 // if ( SS[i] != ssKey[i] )
571 if ( LxStrCmp( &SS, ssKey.GetKey(),1,1,SS.GetStart()+i,ssKey.GetStart()+i) ) {
572 // we create a new node that dominates *this and also
573 // dominates the other new node created, new_node.
575 auto_ptr<CNode> prev(new CNode(ssKey.Left(i), m_StartPoint, i));
576 prev->GetLink(this_node.release());
577 ++NumberOfNodes;
578 m_StartPoint = i;
580 auto_ptr<CNode> new_node(new CNode(SS, i, SS.GetLength()));
581 new_node->Exists();
582 CNode* inserted = new_node.get();
583 prev->GetLink(new_node.release());
584 ++NumberOfNodes;
586 *ppNode = inserted;
588 // Declare success.
589 if (pResult) *pResult = 1;
591 prev->SetCountBelow((m_ExistenceFlag ? 0 : 1) + m_CountBelow + 1);
592 prev->SetCorpusCount(m_CorpusCount + 1);
593 inserted->SetCorpusCount(1);
595 prev->SetAutoDelete(m_AutoDelete);
596 inserted->SetAutoDelete(m_AutoDelete);
598 return prev.release();
601 // Now i == m_BreakPoint or i == SS.GetLength(),
602 // and m_Key[j] == SS.m_Key[j] for all j < i
604 //-----------------------------------------------------------------------//
606 // 2. "s" is shorter than this node's Key:
607 if (i < m_BreakPoint && i == SS.GetLength()) {
608 auto_ptr<CNode> prev(new CNode(ssKey.Left(i), m_StartPoint, i));
609 prev->GetLink(this_node.release());
610 prev->Exists();
611 ++NumberOfNodes;
612 m_StartPoint = i;
613 *ppNode = prev.get();
615 // Declare success.
616 if (pResult) *pResult = 1;
618 prev->SetCountBelow((m_ExistenceFlag ? 0 : 1) + m_CountBelow);
619 prev->SetCorpusCount(m_CorpusCount + 1);
620 prev->SetAutoDelete(m_AutoDelete);
622 return prev.release();
625 //-----------------------------------------------------------------------//
627 // 3. "s" is exactly this node's key:
628 if (i == m_BreakPoint && i == SS.GetLength()) {
629 if (m_ExistenceFlag)
630 // Already present.
631 *pResult = 2;
632 else
633 // Success.
634 *pResult = 1;
636 Exists();
637 SetCorpusCount(m_CorpusCount + 1);
639 *ppNode = this;
641 return this_node.release();
644 //-----------------------------------------------------------------------//
645 // 4. This node only takes us part way into the word:
646 QChar Letter = SS[m_BreakPoint];
647 for (int j = 0; j < m_NumberOfBranches; ++j) {
648 if (m_Letters[j] == Letter) {
649 CNode* next = m_Pointers[j];
650 int result;
651 auto_ptr<CNode> new_node(next->Insert(SS, NumberOfNodes,
652 ppNode, &result));
653 if (pResult) *pResult = result;
654 m_Pointers[j] = new_node.release();
656 if (result == 1)
657 // new descendent
658 ++m_CountBelow;
660 ++m_CorpusCount;
661 return this_node.release();
665 //-----------------------------------------------------------------------//
666 // 5. the letter at BreakPoint is new:
667 auto_ptr<CNode> new_node(new CNode(SS, m_BreakPoint, SS.GetLength()));
668 new_node->Exists();
669 new_node->SetCorpusCount(1);
670 new_node->SetAutoDelete(m_AutoDelete);
671 ++NumberOfNodes;
672 *ppNode = new_node.get();
674 int length = m_NumberOfBranches;
676 // XXX. use realloc work-alike
677 QChar* NewLetters = new QChar[length + 1];
678 CNode** NewPointers = new CNode*[length + 1];
679 copy(m_Letters, m_Letters + length, NewLetters);
680 copy(m_Pointers, m_Pointers + length, NewPointers);
681 NewLetters[length] = Letter;
682 NewPointers[length] = new_node.release();
684 swap(m_Letters, NewLetters);
685 swap(m_Pointers, NewPointers);
686 ++m_NumberOfBranches;
688 delete[] NewLetters;
689 delete[] NewPointers;
691 // Declare success.
692 if (pResult) *pResult = 1;
694 ++m_CountBelow;
695 ++m_CorpusCount;
697 return this_node.release();
699 //-----------------------------------------------------------------------//
700 // end ///////////////////////
704 CNode** CNode::GetLink ( const CStringSurrogate s )
706 int i;
708 QChar c = s[m_BreakPoint];
709 int length = 0;
711 if (m_Letters)
713 length = m_NumberOfBranches;
715 for ( i = 0; i < length; i++)
717 if (m_Letters[i] == c) return &m_Pointers[i];
720 QChar* NewLetters = new QChar[length + 1];
721 CNode** NewPointers = new CNode*[length + 1];
722 for ( i = 0; i < length; i++)
724 NewLetters[i] = m_Letters[i];
725 NewPointers[i] = m_Pointers[i];
727 NewLetters[length] = c;
728 m_NumberOfBranches++;
729 delete m_Letters;
730 m_Letters = NewLetters;
731 delete m_Pointers;
732 m_Pointers = NewPointers;
733 m_Pointers[length] = new CNode(s, m_BreakPoint, m_BreakPoint);
734 return &m_Pointers[length];
738 CNode* CNode::FindLetter (QChar c)
741 int length = 0;
742 if (m_Letters) { length = m_NumberOfBranches;}
743 for (int i = 0; i < length; i++)
745 if (m_Letters[i] == c) return m_Pointers[i];
748 return NULL;
752 //int CNode::GetCount() const
754 // return m_Count;
758 CNode** CNode::GetLink(CNode* pNode)
760 using std::copy;
761 using std::swap;
763 int length = m_NumberOfBranches;
765 // case 1: pNode is terminal, has same Key as this:
766 if (pNode->GetKeyLength() == m_BreakPoint)
767 return 0;
769 QChar c = pNode->GetKey()[m_BreakPoint];
771 for (int i = 0; i < length; ++i)
772 if (m_Letters[i] == c)
773 return &m_Pointers[i];
775 // XXX. should use realloc workalike
776 QChar* NewLetters = new QChar[length + 1];
777 CNode** NewPointers = new CNode*[length + 1];
778 copy(m_Letters, m_Letters + length, NewLetters);
779 copy(m_Pointers, m_Pointers + length, NewPointers);
780 NewLetters[length] = c;
781 NewPointers[length] = pNode;
783 swap(m_Letters, NewLetters);
784 swap(m_Pointers, NewPointers);
785 delete[] NewLetters;
786 delete[] NewPointers;
788 ++m_NumberOfBranches;
789 return &m_Pointers[length];
792 void CNode::MakeCutsWhereCountExceeds( int count, int start, int end )
794 CStringSurrogate ssKey(m_Key,0,m_KeyLength);
795 // QString spaces = "\n";
796 int i;
798 if( start < m_KeyLength && end+1 >= m_KeyLength && m_NumberOfBranches >= count )
800 // for( i=0; i < m_KeyLength; i++ ) spaces += " ";
801 CutAllWordsHereAndBelow_AfterNLetters ( m_KeyLength );
804 if( end+1 >= m_KeyLength )
806 for( i=0; m_Letters && i < m_NumberOfBranches; i++ )
808 m_Pointers[i]->MakeCutsWhereCountExceeds( count, start, end );
814 CStringSurrogate CNode::GetKey()
816 return CStringSurrogate(m_Key,0,m_KeyLength );
820 void CNode::MakeATerminalPointerArray (CNode** Array, int& Index)
823 if (m_ExistenceFlag)
825 Array[ Index ] = this;
826 Index++;
829 for (int i = 0 ; m_Letters && i < (int)m_NumberOfBranches; i++)
831 m_Pointers[i]->MakeATerminalPointerArray(Array, Index);
837 CNode* CNode::Find1(const CStringSurrogate string, bool PartialOK )
839 if (m_Key && LxStrCmp( &string, m_Key, string.GetLength(), m_KeyLength ) == 0 && ( PartialOK || m_ExistenceFlag ) )
841 return this;
844 if( m_Key )
846 if( string.GetLength() < m_BreakPoint )
848 if( !PartialOK ) return NULL;
851 for( int i = m_StartPoint; i < m_BreakPoint-1; i++ )
853 if( string[i] != m_Key[i] )
855 if( PartialOK && ( i > m_StartPoint ) ) return this;
856 else return NULL;
861 QChar c = string[m_BreakPoint];
862 for ( int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
864 if ( c == m_Letters[i] )
866 return m_Pointers[i] ->Find1 (string);
869 return NULL;
873 CNode* CNode::SearchForPrefix ( CStringSurrogate& SS, int& Result, int Letterno )
875 CNode* pNode;
876 Q_ASSERT ( Letterno == m_StartPoint);
877 if ( Letterno < m_BreakPoint-1)
879 for (int i = Letterno + 1; i < m_BreakPoint; i++)
881 if ( i == SS.GetLength() )
883 Result = 2;
884 return this;
885 // Prefix is inside of this node, however!
887 if ( SS[i] != m_Key[i] )
889 Result = 0;
890 return NULL;
891 // Prefix not found!
897 Letterno = m_BreakPoint;
899 if ( m_Key && SS == CStringSurrogate( m_Key, 0, m_KeyLength ) )
901 Result = 1;
902 return this;
905 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
907 if ( SS[Letterno] == m_Letters[i] )
909 pNode = FindLetter ( SS[m_BreakPoint] );
910 pNode = pNode->SearchForPrefix ( SS, Result, m_BreakPoint);
911 return pNode;
914 Result = 0;
915 return NULL;
919 void CNode::ListDisplay(Q3ListView* List, LinguisticaMainWindow* doc,
920 int* count, QMap<QString, QString>* filter, bool ReverseFlag)
922 QString Output;
923 CStringSurrogate ssKey(m_Key,0,m_KeyLength);
924 QString star = "*",
925 width = " Width: ",
926 lt_curly = " {",
927 rt_curly = "}",
928 below = " Count below: ";
929 below += QString("%1").arg(m_CountBelow);
931 // Advance counter
932 if( doc && count )
934 // Update on multiples of 100
935 if( (*count) % 100 == 0 )
936 doc->status_display().progress = *count;
937 *count = *count + 1;
940 if (m_Key)
942 Output.append ( ssKey.Display(filter) );
943 if ( ReverseFlag == TRUE )
945 Output.append( " (" );
946 QString reverse;
947 LxStrCpy_R( ssKey.Display(filter),reverse,ssKey.GetLength(), ssKey.GetStart() );
948 Output.append( reverse + ")" );
953 if (m_ExistenceFlag )
955 Output.append (star);
957 Output.append ( width );
958 Output += QString("%1").arg( GetWidth() );
959 if ( m_Letters )
961 Output.append (lt_curly);
962 Output.append ( CStringSurrogate( m_Letters, 0, m_NumberOfBranches ).Display(filter) );
963 Output.append (rt_curly);
966 Output.append ( below );
968 CTrieListViewItem* item = new CTrieListViewItem( List, Output );
969 item->setSelectable(false);
970 item->setOpen(true);
972 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
974 m_Pointers[i] ->ListDisplay (item, doc, count, filter, ReverseFlag);
979 void CNode::ListDisplay(Q3ListViewItem* List, LinguisticaMainWindow* doc,
980 int* count, QMap<QString, QString>* filter, bool ReverseFlag)
982 QString Output;
983 CStringSurrogate ssKey(m_Key,0,m_KeyLength);
984 QString star = "*",
985 width = " Width: ",
986 lt_curly = " {",
987 rt_curly = "}",
988 below = " Count below: ";
989 below += QString("%1").arg(m_CountBelow);
991 // Advance counter
992 if( doc && count )
994 // Update on multiples of 100
995 if( (*count) % 100 == 0 )
996 doc->status_display().progress = *count;
997 *count = *count + 1;
1000 if (m_Key)
1002 Output.append ( ssKey.Display(filter) );
1003 if ( ReverseFlag == TRUE )
1005 Output.append( " (" );
1006 QString reverse;
1007 LxStrCpy_R( ssKey.Display(filter),reverse,ssKey.GetLength(), ssKey.GetStart() );
1008 Output.append( reverse + ")" );
1013 if (m_ExistenceFlag )
1015 Output.append (star);
1017 Output.append ( width );
1018 Output += QString("%1").arg( GetWidth() );
1019 if ( m_Letters )
1021 Output.append (lt_curly);
1022 Output.append ( CStringSurrogate( m_Letters, 0, m_NumberOfBranches ).Display(filter) );
1023 Output.append (rt_curly);
1026 Output.append ( below );
1028 CTrieListViewItem* item = new CTrieListViewItem( List, Output );
1029 item->setSelectable(false);
1030 item->setOpen(true);
1032 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1034 m_Pointers[i] ->ListDisplay (item, doc, count, filter, ReverseFlag);
1039 int CNode::GetWidth()
1041 int width = 0;
1042 if (m_ExistenceFlag) { width = 1;}
1044 if (m_Pointers) { return m_NumberOfBranches + width; } else {return width; }
1048 namespace {
1049 // helper for CNode::Alphabetize
1050 // index_in<char>("Hello")(0) == 'H'.
1051 template<class T> class index_in : public std::unary_function<int, T> {
1052 const T* array;
1053 public:
1054 index_in(const T* v) : array(v) { }
1055 T operator()(int i) { return array[i]; }
1059 void CNode::Alphabetize()
1061 using std::copy;
1062 using std::swap;
1063 using std::transform;
1064 using std::mem_fun;
1066 const int Length = m_NumberOfBranches;
1068 if (Length == 0)
1069 // nothing to do
1070 return;
1072 QChar* NewLetters = new QChar[Length];
1073 CNode** NewPointers = new CNode*[Length];
1075 std::vector<int> shuffle(Length);
1076 SortLetters(&shuffle[0], m_Letters, Length);
1078 transform(shuffle.begin(), shuffle.end(),
1079 NewLetters, index_in<QChar>(m_Letters));
1080 transform(shuffle.begin(), shuffle.end(),
1081 NewPointers, index_in<CNode*>(m_Pointers));
1082 swap(m_Letters, NewLetters);
1083 swap(m_Pointers, NewPointers);
1085 delete[] NewLetters;
1086 delete[] NewPointers;
1088 std::for_each(m_Pointers, m_Pointers + Length,
1089 mem_fun(&CNode::Alphabetize));
1093 void* CNode::Get_T_Pointer()
1095 if (m_ExistenceFlag)
1097 return m_Pointer;
1099 else
1101 return NULL;
1106 int CNode::ComputeNumberOfEntries(int count)
1108 if (m_ExistenceFlag)
1110 count++;
1113 for (int i = 0 ; m_Letters && i < (int)m_NumberOfBranches; i++)
1115 count = m_Pointers[i]->ComputeNumberOfEntries(count);
1117 return count;
1121 void CNode::CreateNodeArray(CNode** NodeArray, int& Index)
1124 NodeArray[Index] = this;
1125 Index++;
1127 for (int i = 0 ; m_Letters && i < (int)m_NumberOfBranches; i++)
1129 m_Pointers[i]->CreateNodeArray(NodeArray, Index);
1135 void CNode::FindAllTerminalsBelowHere( CParse* pSuffixes, int Offset )
1137 CStringSurrogate ssPiece;
1138 if( m_ExistenceFlag )
1140 ssPiece = CStringSurrogate( m_Key,0,m_KeyLength );
1141 pSuffixes->Append( ssPiece.Mid( Offset ) );
1144 for( int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++ )
1146 m_Pointers[i]->FindAllTerminalsBelowHere ( pSuffixes, Offset );
1152 CNode* CNode::FindLowestMajorityNode(int Count)
1154 bool FoundFlag = FALSE;
1155 CNode* pNode;
1158 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1160 if ( m_Pointers[i]->GetCountBelow() > Count / 2 )
1162 FoundFlag = TRUE;
1163 return pNode = m_Pointers[i]->FindLowestMajorityNode( Count );
1168 if ( FoundFlag == FALSE )
1170 if (m_CountBelow > Count / 2 ) { return this; }
1171 else
1173 return NULL;
1176 else return NULL;
1181 void CNode::DumpVisibleToLogFile(QTextOStream* stream, bool ReverseFlag)
1183 if (m_Visible && m_ExistenceFlag )
1185 if (ReverseFlag)
1187 QChar* string = new QChar[ m_KeyLength + 1];
1188 LxStrCpy_R(m_Key, string, m_KeyLength );
1189 *stream << string << endl;
1190 delete string;
1192 else
1193 *stream << m_Key << endl;
1195 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1197 m_Pointers[i]->DumpVisibleToLogFile(stream, ReverseFlag);
1204 void CNode::DumpVisibleWords ( CWordCollection* Words, bool ReverseFlag )
1206 if (m_Visible && m_ExistenceFlag )
1208 if (ReverseFlag)
1210 *Words << CStringSurrogate(m_Key, 0, m_KeyLength);
1212 else
1214 *Words << CStringSurrogate(m_Key, 0, m_KeyLength);
1217 for (int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1219 m_Pointers[i]->DumpVisibleWords(Words, ReverseFlag);
1225 void CNode::MakeAllVisible(bool Flag)
1227 m_Visible = Flag;
1228 for (int i = 0 ; m_Letters && i < (int)m_NumberOfBranches; i++)
1230 m_Pointers[i]->MakeAllVisible(Flag);
1235 bool CNode::MakeVisible( const CStringSurrogate string )
1237 if (m_Key && LxStrCmp( &string, m_Key, string.GetLength(), m_KeyLength ) == 0 && m_ExistenceFlag)
1239 m_Visible = TRUE;
1240 return TRUE;
1243 if (m_Key)
1245 if ( string.GetLength() < m_BreakPoint )
1247 return FALSE;
1249 for (int i = m_StartPoint; i < m_BreakPoint-1;i++)
1251 if ( string[i] != m_Key[i] )
1253 return FALSE;
1259 QChar c = string[m_BreakPoint];
1260 for ( int i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1262 if ( c == m_Letters[i] )
1264 return m_Pointers[i] ->MakeVisible (string);
1267 return FALSE;
1271 bool CNode::MakeMorphemeBoundariesAtThisWidth(int n, int MinimumStemLength, int ThisLength)
1273 bool val;
1274 bool bLowerIsBoundary = FALSE;
1275 int KeyLen = 0;
1276 int NumberOfBranches=0;
1278 // QString DebugString;
1279 // const char* DebugStringDisplay;
1280 // int DebugInt;
1283 if ( m_Key)
1285 KeyLen = GetKeyLength();
1286 ThisLength = KeyLen;
1288 // DebugString = QString(m_Key, KeyLen);
1289 // DebugStringDisplay = DebugString.ascii();
1290 // DebugInt = 1;
1293 NumberOfBranches = GetNumberOfBranches();
1295 for (int i = 0 ; m_Letters && i < NumberOfBranches; i++)
1297 val = m_Pointers[i]->MakeMorphemeBoundariesAtThisWidth(n, MinimumStemLength, ThisLength );
1298 if (val) bLowerIsBoundary = TRUE;
1301 if ( ThisLength >= MinimumStemLength && GetWidth() >= n )
1303 m_MorphemeBoundary = TRUE;
1305 else
1307 m_MorphemeBoundary = false;
1310 return m_MorphemeBoundary;
1314 void CNode::CutAllWordsHereAndBelow_AfterNLetters ( int n )
1316 int NumberOfBranches=0;
1318 if ( m_ExistenceFlag )
1320 ((CStem*)m_Pointer)->CutRightBeforeHere (n);
1323 NumberOfBranches = GetNumberOfBranches();
1325 for (int i = 0 ; m_Letters && i < NumberOfBranches; i++)
1327 m_Pointers[i]->CutAllWordsHereAndBelow_AfterNLetters ( n );
1332 void CNode::MakeCutsAtMorphemeBoundary( int depth )
1334 int ThisDepth = depth;
1335 int KeyLen = 0;
1336 int NumberOfBranches=0;
1338 if (m_Key)
1340 KeyLen = GetKeyLength();
1341 ThisDepth = KeyLen;
1344 if ( m_MorphemeBoundary )
1346 CutAllWordsHereAndBelow_AfterNLetters ( ThisDepth );
1349 NumberOfBranches = GetNumberOfBranches();
1351 for (int i = 0 ; m_Letters && i < NumberOfBranches; i++)
1353 m_Pointers[i]->MakeCutsAtMorphemeBoundary(ThisDepth);
1357 int CTrie::ProjectTrieStructureOntoWord( CParse* parse )
1359 int status = 0;
1361 parse->ClearParseStructure();
1364 if( m_ReverseFlag )
1366 parse->ReverseMe();
1372 status = m_Root->ProjectTrieStructureOntoWord( parse );
1375 if( m_ReverseFlag )
1377 parse->ReverseMe();
1382 return status;
1385 int CNode::ProjectTrieStructureOntoWord( CParse* parse, int position )
1387 CStringSurrogate ssKey = GetKey();
1388 CStringSurrogate SS = parse->GetKey();
1390 int newPosition = position;
1394 // Result: -1: word doesn't exist
1395 // 0: word existed
1397 //-----------------------------------------------------------------------//
1399 // 1: A difference before m_BreakPoint: it's definitely a new string.
1400 int i = m_StartPoint;
1401 for (; (SS.GetLength() > 0) && m_Key && i < m_BreakPoint && i < SS.GetLength(); i++)
1403 newPosition++;
1405 // if ( SS[i] != ssKey[i] )
1406 if ( LxStrCmp( &SS, ssKey.GetKey(),1,1,SS.GetStart()+i,ssKey.GetStart()+i) )
1408 return -1;
1412 //-----------------------------------------------------------------------//
1414 // 2. "s" is shorter than this node's Key:
1415 if ( i < m_BreakPoint && i == SS.GetLength() )
1417 return -1;
1420 //-----------------------------------------------------------------------//
1422 // 3. "s" is exactly this node's key:
1423 if ( m_Key && LxStrCmp ( &SS, m_Key, SS.GetLength(), m_KeyLength, SS.GetStart() ) == 0 )
1425 parse->SetPieceValue( parse->Size(), (double) m_CorpusCount );
1427 return 0;
1431 //-----------------------------------------------------------------------//
1432 // 4. This node only takes us part way into the word:
1433 QChar Letter = SS[m_BreakPoint];
1434 for (int j = 0; m_Letters && j < (int)m_NumberOfBranches; j++)
1436 if ( m_Letters[j] == Letter )
1440 parse->SetPieceValue( parse->Size(), (double) m_CorpusCount );
1441 parse->CutNFromTheLeft( newPosition );
1444 return m_Pointers[j]->ProjectTrieStructureOntoWord( parse, newPosition );
1449 //-----------------------------------------------------------------------//
1450 // 5. the letter at BreakPoint is new:
1451 return -1;
1453 //-----------------------------------------------------------------------//
1454 // end ///////////////////////
1457 QString CTrie::Display()
1459 return QString( m_Root->Display( 0, m_ReverseFlag ) + "=========================" );
1462 QString CNode::Display( int tabs, bool ReverseFlag )
1464 QString Output = " ";
1465 QString star = "*",
1466 width = " Width: ",
1467 lt_curly = " {",
1468 rt_curly = "}",
1469 below = " Count below: ";
1470 below += QString("%1").arg(m_CountBelow);
1472 CStringSurrogate ssKey(m_Key,0,m_KeyLength);
1473 int i;
1475 for( i = 0; i < tabs; i++ )
1477 Output.append( "__" );
1479 Output.append( " " );
1481 if (m_Key)
1483 Output.append ( ssKey.Display() );
1484 if ( ReverseFlag == TRUE )
1486 Output.append( " (" );
1487 QString reverse;
1488 LxStrCpy_R( ssKey.Display(),reverse,ssKey.GetLength(), ssKey.GetStart() );
1489 Output.append( reverse + ")" );
1494 if (m_ExistenceFlag )
1496 Output.append (star);
1498 // Output.append ( width );
1499 // Output += QString("%1").arg( GetWidth() );
1500 if ( m_Letters )
1502 Output.append (lt_curly);
1503 Output.append ( CStringSurrogate( m_Letters, 0, m_NumberOfBranches ).Display() );
1504 Output.append (rt_curly);
1507 // Output.append ( below );
1509 Output.append ( "\n" );
1511 for (i = 0; m_Letters && i < (int)m_NumberOfBranches; i++)
1513 Output.append( m_Pointers[i]->Display( tabs+1, ReverseFlag ) );
1516 return Output;