1 // Implementation of CTemplateCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "TemplateCollection.h"
6 #include <Q3TextStream>
11 #include "ui/Status.h"
13 #include "Alignment.h"
15 #include "WordCollection.h"
// Default constructor.
// Value-initializes m_AllWordsAndParses and m_GlobalNodeStickNess2, and
// zeroes both stickiness totals and the deleted-template counter.
19 CTemplateCollection::CTemplateCollection()
20 : m_AllWordsAndParses(),
21 m_TotalGlobalStickNess(0.0),
23 m_GlobalNodeStickNess2(),
24 m_TotalGlobalStickNess2(0.0),
26 m_NumberOfDeletedTemplates(0) { }
// Destructor for the template collection.
28 CTemplateCollection::~CTemplateCollection(void)
// Shows the collection in a Q3ListView.
//
// pView  - list view to fill; all of its existing columns are removed first.
// status - receives the operation name ("Displaying Templates") and the
//          progress denominator (the number of templates); both the progress
//          and the operation name are cleared again at the end.
//
// After the seven column headers are installed, columns 2 through 5 are
// switched to manual width mode with a width of 180, and every template in
// the collection is asked to display itself in pView.
32 void CTemplateCollection::ListDisplay(
33 Q3ListView
* pView
, linguistica::ui::status_user_agent
& status
)
35 pView
->setRootIsDecorated(false);
38 // Remove all previous columns.
39 while (pView
->columns() != 0)
40 pView
->removeColumn(0);
42 // Add Column headers.
43 pView
->addColumn("TemplateNumber");
44 pView
->addColumn("NumberOfColumns");
45 pView
->addColumn("Complexity");
46 pView
->addColumn("Layer1");
47 pView
->addColumn("Layer2");
48 pView
->addColumn("Layer3");
49 pView
->addColumn("Layer4");
51 // Column three or four gets really wide, so
52 // limit the width to 180.
// The same manual width is applied to column indices 2, 3, 4 and 5.
53 pView
->setColumnWidthMode(2, Q3ListView::Manual
);
54 pView
->setColumnWidth(2, 180);
56 pView
->setColumnWidthMode(3, Q3ListView::Manual
);
57 pView
->setColumnWidth(3, 180);
59 pView
->setColumnWidthMode(4, Q3ListView::Manual
);
60 pView
->setColumnWidth(4, 180);
62 pView
->setColumnWidthMode(5, Q3ListView::Manual
);
63 pView
->setColumnWidth(5, 180);
// Announce the operation and size the progress indicator to the collection.
65 status
.major_operation
= "Displaying Templates";
66 status
.progress
.clear();
67 const int TotalNumber
= GetCount();
68 status
.progress
.set_denominator(TotalNumber
);
// Each template adds its own row(s) to the view.
69 for (int i
= 0; i
< TotalNumber
; ++i
) {
70 CTemplate
* pTemplate
= GetAt(i
);
71 pTemplate
->ListDisplay(pView
);
// Reset the status display once all templates have been shown.
74 status
.progress
.clear();
75 status
.major_operation
.clear();
// Adds a template to the collection, keyed by its Display() string.
//
// The spelled-out form is passed to Insert() together with &Result. The
// visible statements then either attach a newly allocated copy of *pTemplate
// to the returned node, or fetch the CTemplate already attached to it.
// NOTE(review): the test that selects between those two paths (presumably on
// Result) is not shown here -- confirm.
//
// In both cases the collection's and the stored template's corpus counts are
// incremented by one, m_SortValidFlag is cleared and m_HashHasChangedFlag is
// set.
78 CTemplate
* CTemplateCollection::AddTemplate( CTemplate
* pTemplate
)
85 QString SpelledOutTemplate
= pTemplate
->Display(); // yuhuask Problematic ?
87 pTerminal
= Insert (CStringSurrogate(SpelledOutTemplate
), &Result
);
// New entry: the collection stores its own copy, not the caller's object.
90 qTemplate
= new CTemplate( * pTemplate
);
91 pTerminal
->SetPointer (qTemplate
);
// Existing entry: reuse the template already stored on the node.
95 qTemplate
=(CTemplate
*) pTerminal
->Get_T_Pointer();
98 IncrementCorpusCount(1);
99 qTemplate
->IncrementCorpusCount(1);
// Any cached sort order / hash state no longer matches the contents.
101 m_SortValidFlag
= FALSE
;
102 m_HashHasChangedFlag
= TRUE
;
// Adds the template described by an alignment, keyed by pAlign->SpellOut().
//
// The visible statements either build a new CTemplate from pAlign, give it
// the index GetCount() - 1 and attach it to the node returned by Insert(),
// or pick up the CTemplate already attached to that node. Corpus counts on
// the collection and on the template are incremented by one, m_SortValidFlag
// is cleared and m_HashHasChangedFlag is set.
//
// Returns the stored template; ownership is given up by the local auto_ptr
// through release(), so the pointer handed to the node stays valid.
// NOTE(review): std::auto_ptr is deprecated since C++11 and removed in
// C++17; std::unique_ptr is the usual replacement.
110 CTemplate
* CTemplateCollection::AddToCollection(CAlignment
* pAlign
)
113 CNode
* pTerminal
= Insert(
114 CStringSurrogate(pAlign
->SpellOut()),
117 std::auto_ptr
<CTemplate
> pTemplate
;
// New entry: create the template and attach its raw pointer to the node.
119 pTemplate
= std::auto_ptr
<CTemplate
>(new CTemplate(pAlign
));
120 pTemplate
->SetIndex(GetCount() - 1);
121 pTerminal
->SetPointer(pTemplate
.get());
// Existing entry: temporarily wrap the node's template; it is released
// again at the return statement, so it is not deleted here.
123 pTemplate
= std::auto_ptr
<CTemplate
>(
124 static_cast<CTemplate
*>(pTerminal
->Get_T_Pointer()));
127 IncrementCorpusCount(1);
128 pTemplate
->IncrementCorpusCount(1);
130 m_SortValidFlag
= false;
131 m_HashHasChangedFlag
= true;
133 return pTemplate
.release();
// Conflates the templates of InputCollection into this collection.
//
// For each input template pTemplate the visible code:
//   1. scans the templates already in this collection and, where
//      pTemplate->ShouldConflate(qTemplate, Column) holds, lets the existing
//      template absorb it via qTemplate->ConflateWith(pTemplate, Column);
//   2. moves on to the next input template when BreakFlag is set;
//   3. otherwise scans the later templates of InputCollection and, where
//      ShouldConflate holds, lets pTemplate absorb them and adds pTemplate
//      to this collection with AddTemplate.
// Afterwards every template is asked to Readjust() into TemporaryHoldings and
// the templates collected there are added to this collection.
// NOTE(review): where Column and BreakFlag are set is not shown here --
// confirm against the full file.
136 void CTemplateCollection::CheckForConflations
137 ( CTemplateCollection
* InputCollection
)
141 CTemplateCollection TemporaryHoldings
;
143 QString debugstring1
, debugstring2
;
144 // const char* CCDebugString1, *CCDebugString2;
148 for (i
= 0; i
< InputCollection
->GetCount(); i
++)
150 CTemplate
* pTemplate
= InputCollection
->GetAt(i
);
// Pass 1: try to fold the input template into one already in this collection.
154 for ( j
= 0; j
< GetCount(); j
++)
156 CTemplate
* qTemplate
= GetAt(j
);
157 if ( pTemplate
->ShouldConflate(qTemplate
, Column
) )
159 //debugstring1 = pTemplate ->Display();
160 //CCDebugString1 = debugstring1.ascii();
161 //debugstring2 = qTemplate->Display();
162 //CCDebugString2 = debugstring2.ascii();
165 qTemplate
->ConflateWith( pTemplate
, Column
) ;
171 if (BreakFlag
) { continue; }
// Pass 2: compare against the remaining (later) input templates.
173 for (j
= i
+1; j
< InputCollection
->GetCount(); j
++)
176 CTemplate
* qTemplate
= InputCollection
->GetAt(j
);
177 if ( pTemplate
->ShouldConflate(qTemplate
, Column
) )
181 //debugstring1 = pTemplate ->Display();
182 //CCDebugString1 = debugstring1.ascii();
183 //debugstring2 = qTemplate->Display();
184 //CCDebugString2 = debugstring2.ascii();
188 pTemplate
->ConflateWith( qTemplate
, Column
);
189 this->AddTemplate ( pTemplate
);
196 // Keep in mind, for each column, the morpheme can be repeated
197 // e.g. b#g#_ettingbradybrady_;
201 //################################################
202 // Debug and test; should not be needed for release
206 for (i = 0; i < GetCount(); i++)
208 debugstring1 = GetAt(i)->Display();
209 CCDebugString1 = debugstring1.ascii();
215 //################################################
216 // ReAdjust - Pre-edit-1
// Readjust every template into TemporaryHoldings, then add the results back.
222 TemporaryHoldings
.Empty();
223 for (i
= 0; i
< GetCount(); i
++)
225 GetAt(i
)->Readjust( &TemporaryHoldings
);
227 for (i
= 0; i
< TemporaryHoldings
.GetCount(); i
++)
229 this->AddTemplate( TemporaryHoldings
[i
] );
// NOTE(review): presumably the tail of a do/while that repeats the readjust
// step until it yields no further templates -- confirm.
232 while (TemporaryHoldings
.GetCount() > 0 );
235 //*******************************************
// Word generation: column 0 entries are collected into Words, entries of the
// further columns are applied through SuffixToAllWords, and the resulting
// words are added to the lexicon's poly-word collection with their parse.
237 CWordCollection Words, NewWords, TempWords;
238 CPolyMorpheme* pWord;
240 for (i = 0; i < GetCount(); i++)
242 CTemplate* pTemplate = GetAt(i);
246 for (int j = 1; j <= pTemplate->GetColumn(0)->Size(); j++)
247 { Words << pTemplate->GetColumn(0)->GetAt_SS(j) ; }
249 for (int col = 1; col < pTemplate->GetNumberOfColumns(); col++)
253 for (int row = 1; row <= (int) pTemplate->GetColumn(col)->Size(); row++)
257 CSS ss = pTemplate->GetColumn(col)->GetAt_SS(row);
258 if (ss == CSS("NULL") )
263 TempWords.SuffixToAllWords ( pTemplate->GetColumn(col)->GetAt_SS(row) );
265 NewWords.AddWordCollection( TempWords );
270 for (int k = 0; k < (int) Words.GetCount(); k++)
272 pWord = *Lexicon->GetPolyWords() << Words.GetAt(k);
273 pWord->CopyParse ( *(CParse*)Words.GetAt(k) );
// Rebuilds m_AllWordsAndParses from the templates in the collection.
//
// Any previous map is deleted and a fresh StringToParse is allocated. Each
// template then reports its words and parses; for a word that is already in
// the map the stored parse is merged with the new one via yuhuMergeParse,
// and the visible insert() call adds the word with its parse to the map.
284 void CTemplateCollection::OutputTemplatesForGoldStand()
286 StringToParse OneWordsAndParses
;
287 StringToParse::Iterator StringToParseIt
;
// NOTE(review): `delete` on a null pointer is a no-op, so the NULL test is
// redundant (harmless).
294 if ( m_AllWordsAndParses
!= NULL
) delete m_AllWordsAndParses
;
296 m_AllWordsAndParses
= new StringToParse();
298 //QMessageBox::information( NULL, "Debug", "Run Here -1", "OK" );
300 for (int i
= 0; i
< GetCount(); i
++)
302 CTemplate
* pTemplate
= GetAt(i
);
// The scratch map is reused for every template.
304 OneWordsAndParses
.clear();
305 pTemplate
->GetWordsAndParses(OneWordsAndParses
);
308 for ( StringToParseIt
= OneWordsAndParses
.begin(); StringToParseIt
!= OneWordsAndParses
.end(); StringToParseIt
++)
310 TheWord
= StringToParseIt
.key();
311 TheParse
= StringToParseIt
.data();
// Word already known: fold this parse into the stored one.
314 if ( m_AllWordsAndParses
->contains(TheWord
))
316 OneParse
= (*m_AllWordsAndParses
)[TheWord
];
317 OneParse
->yuhuMergeParse(TheParse
);
321 m_AllWordsAndParses
->insert(TheWord
, TheParse
);
// Writes the templates to the text file FileName in TEMPLATE_SORT order.
// For each template the visible code calls OutputForFile and then writes its
// number of columns, complexity, words complexity and sorting quantity.
337 void CTemplateCollection::OutputTemplates ( LPCTSTR FileName)
339 ofstream outf(FileName);
340 outf << setiosflags (ios::left);
343 Sort (TEMPLATE_SORT);
344 for (int i = 0; i < GetCount(); i++)
346 CTemplate* pTemplate = GetAtSort(i);
347 pTemplate->OutputForFile ( outf );
351 outf << "Number of columns: "<< pTemplate->GetNumberOfColumns() << endl;
352 outf << "Complexity: " << pTemplate->GetComplexity() << endl;
353 outf << "Words complexity: " << pTemplate->GetWordsTotalComplexity() << endl;
354 outf << "Sorting quantity: " << pTemplate->GetSortingQuantity() << endl<< endl;
// Reads templates from the text file FileName and adds them to the collection.
//
// The first line is parsed with atoi as the number of templates to read.
// Each record starts with a line that is collapsed into pieces, one piece per
// column of a new CTemplate; a following line supplies further column
// entries, where pieces starting with '-' are skipped. After AddTemplate the
// next ten lines of the file are read and discarded.
// Lines are read into a 256-byte buffer.
// NOTE(review): pParse is allocated with new and no matching delete is
// visible here -- confirm it is released.
360 void CTemplateCollection::ReadTemplateFile( LPCTSTR FileName)
362 const int bufferSize = 256;
363 char buffer[bufferSize];
365 CTemplate* pTemplate;
366 CParse* pParse= new CParse();
367 ifstream inf(FileName);
// Header line: number of templates in the file.
371 inf.getline(buffer, bufferSize, '\n');
372 size = atoi (buffer);
374 while (inf && Count < size)
377 inf.getline(buffer, bufferSize, '\n');
378 pParse->Collapse(buffer);
// Lines that yield no pieces are skipped.
379 if ( pParse->Size() <= 0) { continue; }
// One column per piece; piece i (1-based) goes to column i-1.
381 NumberOfColumns = pParse->Size();
382 pTemplate = new CTemplate (NumberOfColumns);
383 for (int i = 1; i<= (int) pParse->Size(); i++)
385 pTemplate->AddToColumn( pParse->GetAt_SS(i), i-1);
389 inf.getline (buffer, bufferSize, '\n');
390 pParse->Collapse(buffer);
391 if ( pParse->Length() == 0 )
396 for (int i = 1; i<= (int) pParse->Size(); i++)
398 if ( pParse->GetAt_SS(i).FirstChar() == '-' ) { continue;}
399 pTemplate->AddToColumn( pParse->GetAt_SS(i), i-1);
// NOTE(review): AddTemplate stores a copy on its visible "new entry" path;
// whether pTemplate itself is freed afterwards is not shown here.
402 AddTemplate ( pTemplate );
403 // there are 10 lines with stuff we don't read.
404 for (i = 0; i < 10; i++) { inf.getline(buffer, bufferSize, '\n'); }
// qsort comparator over an array of CTemplate*.
// Orders by the spelled-out text of column 0; when those compare equal, the
// spelled-out text of column 1 is compared as well.
// NOTE(review): GetColumn(0)/GetColumn(1) are dereferenced without a null
// check, unlike CompareTemplateColumn2 -- confirm both columns always exist.
411 int CompareTemplateColumn1 (const void *pA, const void *pB)
// qsort passes pointers to the array elements, hence the double indirection.
413 CTemplate* pS1=*(CTemplate**) pA;
414 CTemplate* pS2=*(CTemplate**) pB;
416 int Value1 = pS1->GetColumn(0)->SpellOut().Compare(pS2->GetColumn(0)->SpellOut() );
417 if ( Value1 != 0 ) return Value1;
419 int Value2 = pS1->GetColumn(1)->SpellOut().Compare(pS2->GetColumn(1)->SpellOut() );
422 // int Value3 = pS1->GetColumn(3)->SpellOut().Compare(pS2->GetColumn(3)->SpellOut() );
423 // if ( Value2 != 0 ) return Value2;
// qsort comparator over an array of CTemplate*.
// Templates whose column 1 is NULL are handled by the null checks first;
// otherwise the order is by the spelled-out text of column 1 and, when those
// compare equal, column 0 is compared as well.
426 int CompareTemplateColumn2 (const void *pA, const void *pB)
// qsort passes pointers to the array elements, hence the double indirection.
428 CTemplate* pS1=*(CTemplate**) pA;
429 CTemplate* pS2=*(CTemplate**) pB;
// Missing-column cases.
431 if ( pS1->GetColumn(1) == NULL )
433 if ( pS2->GetColumn(1) == NULL )
439 if ( pS2->GetColumn(1) == NULL )
445 int Value1 = pS1->GetColumn(1)->SpellOut().Compare(pS2->GetColumn(1)->SpellOut() );
446 if ( Value1 != 0 ) return Value1;
448 int Value2 = pS1->GetColumn(0)->SpellOut().Compare(pS2->GetColumn(0)->SpellOut() );
452 // int Value3 = pS1->GetColumn(3)->SpellOut().Compare(pS2->GetColumn(3)->SpellOut() );
453 // if ( Value2 != 0 ) return Value2;
// Sorts the templates by the contents of one column.
//
// SortStyle    - only TEMPLATE_SORT_COLUMN is handled; any other value
//                returns immediately without touching the sort state.
// ColumnNumber - selects the comparator (CompareTemplateColumn1 or
//                CompareTemplateColumn2) used with qsort.
//
// SortArray is reallocated to hold GetCount() template pointers, filled in
// collection order and sorted in place; m_SortValidFlag and m_SortStyle are
// updated afterwards.
458 void CTemplateCollection::SortTemplates (eSortStyle SortStyle, int ColumnNumber )
460 if (SortStyle != TEMPLATE_SORT_COLUMN) { return; }
462 int Size = GetCount();
// Drop the previous sort order before building the new one.
464 if (SortArray) { delete [] SortArray; }
466 SortArray = new CTemplate*[ Size ];
// NOTE(review): no use or delete[] of ValueArray is visible here -- confirm
// it is not leaked.
468 double* ValueArray = new double [ Size ];
471 for (int i = 0; i < Size; i++)
473 SortArray[i] = GetAt( i );
476 // Fix this -- make it general, not case by case
477 switch ( ColumnNumber )
481 qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn1);
486 qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn2);
491 // qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn3);
496 m_SortValidFlag = true;
498 m_SortStyle = SortStyle;
506 /* This function looks for templates with 3 columns in which most of the letters --
507 that is, the lexical heads -- are in the 3rd column.
// Copies into PrefixingTemplates the templates whose first two columns are
// small relative to the whole template.
// Templates with fewer than three columns are skipped. For the rest, the
// lengths of column 0 and column 1 are each divided by the template's number
// of letters and compared against Threshold (0.2); a template for which both
// ratios are below the threshold is added via AddTemplate.
// NOTE(review): no guard against NumberOfLetters == 0 is visible -- confirm.
511 void CTemplateCollection::FindPrefixingTemplates( CTemplateCollection* PrefixingTemplates )
513 CTemplate* pTemplate;
514 double Threshold = 0.2,
519 for (int i = 0; i < (int) GetCount(); i++)
521 pTemplate = GetAt(i);
522 if ( pTemplate->GetNumberOfColumns() < 3 ) { continue; }
523 NumberOfLetters = pTemplate->GetNumberOfLetters();
// The cast to double forces floating-point division for the ratios.
524 if ( (double) pTemplate->GetColumn(0)->GetLength() / NumberOfLetters < Threshold &&
525 (double) pTemplate->GetColumn(1)->GetLength() / NumberOfLetters < Threshold
528 PrefixingTemplates->AddTemplate( pTemplate );
// Decides the affixation side from the top-ranked templates.
// After sorting with TEMPLATE_SORT, at most HowManyTemplates (10) templates
// are classified through DetermineAffixationSide(); the prefix and suffix
// shares of the total are then compared against Threshold (0.6).
// NOTE(review): the declarations of nPrefix/nSuffix/nUnknown are not shown.
// If they are integers, `nPrefix / Total` is an integer division and can
// only exceed 0.6 when it equals 1 or more; and Total is 0 for an empty
// collection, which would divide by zero -- confirm.
541 eAffixationType CTemplateCollection::FindAffixationSide()
543 int HowManyTemplates = 10;
547 double Threshold = 0.6;
549 Sort (TEMPLATE_SORT );
550 for (int i = 0; i < HowManyTemplates && i < GetCount(); i++)
552 switch (GetAtSort(i)->DetermineAffixationSide() )
573 int Total = int( nPrefix + nSuffix + nUnknown );
574 if (nPrefix / Total > Threshold )
576 if (nSuffix / Total > Threshold )
// Distributes every template into the stem/affix/signature collections that
// match its affixation side.
//
// Each template's side is obtained from DetermineAffixationSide(). Suffixing
// templates are forwarded with SuffixStems, Suffixes and SuffixSignatures;
// prefixing templates with PrefixStems, Prefixes and PrefixSignatures.
// When the lexicon has a log file, the template's spelled-out form and the
// chosen side are written to it.
586 void CTemplateCollection::AddToCollections( CStemCollection* SuffixStems,
587 CStemCollection* PrefixStems,
588 CSuffixCollection* Suffixes,
589 CPrefixCollection* Prefixes,
590 CSignatureCollection* PrefixSignatures,
591 CSignatureCollection* SuffixSignatures )
594 eAffixationType Type;
595 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << endl << endl << "Adding templates to collections" << endl ; }
596 for (int i = 0; i < GetCount(); i++)
599 Type = GetAt(i)->DetermineAffixationSide();
600 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << endl << GetAt(i)->SpellOut(); }
601 if ( Type == TYPE_Suffix )
603 GetAt(i)->AddToCollections ( Type, SuffixStems, (CAffixCollection*) Suffixes, SuffixSignatures );
604 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << "\t Suffix"; }
607 if ( Type == TYPE_Prefix )
609 GetAt(i)->AddToCollections ( Type, PrefixStems, (CAffixCollection*) Prefixes, PrefixSignatures );
610 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << "\t Prefix"; }
// Recomputes the pairwise ("edge") and per-morpheme ("node") stickiness
// tables from the non-deleted templates and logs them to "paragmatic.txt".
//
// Steps visible in the code:
//   1. The inner maps of m_GlobalStickNess2 are deleted, both tables are
//      cleared and m_TotalGlobalStickNess2 is reset to 0.
//   2. For every non-deleted template (in TEMPLATE_SORT order) the weight
//      myComplexity is the product of its column sizes, which is also added
//      to TotalWords.
//   3. Within each column, every pair of pieces (piecei < piecej) adds that
//      weight to m_GlobalStickNess2[host][slave] and to the running total;
//      every morpheme seen in the template adds the weight once to
//      m_GlobalNodeStickNess2.
//   4. Both tables are converted in place to -base2log(weight / total) and
//      written to the log; m_TotalWord2 receives TotalWords.
// Morpheme strings are the piece's Display() text without its first
// character; an empty result is replaced by TheStringNULL.
620 void CTemplateCollection::UpdateGlobalStickNess2()
625 QString Morphemei
, Morphemej
;
626 QString HostMorpheme
, SlaveMorpheme
;
627 QString PreviousMorpheme
, CurrentMorpheme
;
631 StringToFloat
* oneCollection
;
632 StringToFloat::Iterator StringToFloatIt
;
633 StringToStringToFloat::Iterator StringToStringToFloatIt
;
634 StringToInt AllMorphemes
;
635 StringToInt MorphemesInThisTemplate
;
636 StringToInt::Iterator StringToIntIt
;
638 int NumberOfWordsInThisTemplate
;
// Log file for the computed values, opened write-only.
643 QFile
file( "paragmatic.txt" );
645 if ( !file
.open( QIODevice::WriteOnly
) )
650 Q3TextStream
logf( &file
);
652 Sort (TEMPLATE_SORT
);
655 // Clear m_GlobalStickNess2;
// The inner maps are heap-allocated (see `new StringToFloat()` below), so
// they are deleted before the outer map is cleared.
656 for (StringToStringToFloatIt
= m_GlobalStickNess2
.begin(); StringToStringToFloatIt
!= m_GlobalStickNess2
.end(); StringToStringToFloatIt
++)
658 oneCollection
= StringToStringToFloatIt
.data();
659 delete oneCollection
;
663 // Clear m_GlobalStickNess2 and m_GlobalNodeStickNess2
664 m_GlobalStickNess2
.clear();
665 m_GlobalNodeStickNess2
.clear();
669 AllMorphemes
.clear();
670 m_TotalGlobalStickNess2
= 0.0;
674 for (i
= 0; i
< GetCount(); i
++) {
675 CTemplate
* pTemplate
= GetAtSort(i
);
677 if ( pTemplate
->m_IsDeleted
) continue;
680 //myComplexity = pTemplate ->GetSortingQuantity(); in this function, we take number of words as robustness
682 MorphemesInThisTemplate
.clear();
// Number of words generated by a template = product of its column sizes.
684 NumberOfWordsInThisTemplate
= 1;
685 for ( column
=0; column
< pTemplate
->m_NumberOfColumns
; column
++)
687 oneColumn
= pTemplate
-> m_Columns
[column
];
688 NumberOfWordsInThisTemplate
= NumberOfWordsInThisTemplate
* oneColumn
->Size();
691 TotalWords
+= NumberOfWordsInThisTemplate
;
692 myComplexity
= (float) NumberOfWordsInThisTemplate
;
695 for ( column
=0; column
< pTemplate
->m_NumberOfColumns
; column
++)
697 oneColumn
= pTemplate
-> m_Columns
[column
];
// Column with at most one piece: only that single morpheme is recorded.
699 if ( oneColumn
->Size() <= 1)
701 Morphemei
= oneColumn
->GetPiece(1).Display();
// Drop the first character of the displayed piece.
705 Morphemei
= Morphemei
.right(Morphemei
.length() -1);
707 if ( Morphemei
.length() ==0)
709 Morphemei
= TheStringNULL
;
714 AllMorphemes
.insert(Morphemei
, 1);
715 MorphemesInThisTemplate
.insert(Morphemei
, 1);
// All unordered pairs (piecei, piecej) with piecei < piecej in this column.
719 for ( piecei
= 1; piecei
<= oneColumn
->Size() -1; piecei
++)
721 Morphemei
= oneColumn
->GetPiece(piecei
).Display();
725 Morphemei
= Morphemei
.right(Morphemei
.length() -1);
727 if ( Morphemei
.length() ==0)
729 Morphemei
= TheStringNULL
;
733 AllMorphemes
.insert(Morphemei
, 1);
734 MorphemesInThisTemplate
.insert(Morphemei
, 1);
736 for ( piecej
= piecei
+ 1; piecej
<= oneColumn
->Size(); piecej
++)
738 Morphemej
= oneColumn
->GetPiece(piecej
).Display();
742 Morphemej
= Morphemej
.right(Morphemej
.length() -1);
744 if ( Morphemej
.length() ==0)
746 Morphemej
= TheStringNULL
;
751 AllMorphemes
.insert(Morphemej
, 1);
752 MorphemesInThisTemplate
.insert(Morphemej
, 1);
// Canonical key order: the greater string (operator>) becomes the host, so
// each pair is stored under one key only.
755 if ( Morphemei
> Morphemej
)
757 HostMorpheme
= Morphemei
;
758 SlaveMorpheme
= Morphemej
;
762 HostMorpheme
= Morphemej
;
763 SlaveMorpheme
= Morphemei
;
// Accumulate the template weight on the (host, slave) edge, creating the
// inner map and/or the entry on first use.
766 if ( m_GlobalStickNess2
.contains(HostMorpheme
))
768 oneCollection
= m_GlobalStickNess2
[HostMorpheme
];
770 if ( oneCollection
->contains(SlaveMorpheme
))
772 (*oneCollection
)[SlaveMorpheme
] += myComplexity
;
773 m_TotalGlobalStickNess2
+= myComplexity
;
778 oneCollection
->insert(SlaveMorpheme
, (float)(myComplexity
));
779 m_TotalGlobalStickNess2
+= myComplexity
;
785 oneCollection
= new StringToFloat();
786 m_GlobalStickNess2
.insert(HostMorpheme
, oneCollection
);
788 oneCollection
->insert(SlaveMorpheme
, (float)(myComplexity
));
789 m_TotalGlobalStickNess2
+= myComplexity
;
808 // Get the Vertex stickness for this template
// Each distinct morpheme of the template contributes the weight once.
810 for ( StringToIntIt
= MorphemesInThisTemplate
.begin(); StringToIntIt
!= MorphemesInThisTemplate
.end(); StringToIntIt
++)
812 oneMorpheme
= StringToIntIt
.key();
814 if ( m_GlobalNodeStickNess2
.contains(oneMorpheme
))
816 m_GlobalNodeStickNess2
[oneMorpheme
] += myComplexity
;
820 m_GlobalNodeStickNess2
.insert(oneMorpheme
,(float)(myComplexity
));
829 // Finally, Calculate the -log(prob) of edge stickness and vertex stickness
832 logf
<< "********The Paragmatic Edge StickNess******"<<endl
<<endl
;
834 for ( StringToStringToFloatIt
= m_GlobalStickNess2
.begin(); StringToStringToFloatIt
!= m_GlobalStickNess2
.end(); StringToStringToFloatIt
++)
836 HostMorpheme
= StringToStringToFloatIt
.key();
837 oneCollection
= StringToStringToFloatIt
.data();
840 for ( StringToFloatIt
= oneCollection
->begin(); StringToFloatIt
!= oneCollection
->end(); StringToFloatIt
++)
842 SlaveMorpheme
= StringToFloatIt
.key();
843 oneFloat
= StringToFloatIt
.data();
// The stored edge weight is overwritten by its negative log share of the
// total edge weight.
845 LogValue
= -base2log((oneFloat
)/m_TotalGlobalStickNess2
);
846 (*oneCollection
)[SlaveMorpheme
] = LogValue
;
849 logf
<< HostMorpheme
<< "-" << SlaveMorpheme
<< " : "<<LogValue
<< endl
;
857 logf
<<endl
<<endl
<<endl
;
860 logf
<< "********The Paragmatic Node StickNess******"<<endl
<<endl
;
863 for ( StringToFloatIt
= m_GlobalNodeStickNess2
.begin(); StringToFloatIt
!= m_GlobalNodeStickNess2
.end(); StringToFloatIt
++)
865 oneMorpheme
= StringToFloatIt
.key();
866 oneFloat
= StringToFloatIt
.data();
// Node weights are normalized by the total word count rather than by the
// total edge weight.
868 LogValue
= -base2log((oneFloat
)/(float)TotalWords
);
869 m_GlobalNodeStickNess2
[oneMorpheme
] = LogValue
;
872 logf
<< oneMorpheme
<< " : " << LogValue
<< endl
;
875 m_TotalWord2
= (float)TotalWords
;
// Asks every non-deleted template, in TEMPLATE_SORT order, to adjust itself
// through CTemplate::AdjustMeByMovingCommonTailOrHead2.
//
// Loopi - forwarded unchanged to each template.
//
// The call passes the node and edge stickiness tables, their totals
// (m_TotalGlobalStickNess2, m_TotalWord2) and the local limits
// MaximumSizeOfEachColumn (3) and MaximumNumberOfMovingLetters (3);
// PrintChangedTemplates is passed as true.
886 void CTemplateCollection::AdjustTemplatesByMovingCommonTailOrHead2(int Loopi
)
889 const int MaximumSizeOfEachColumn
= 3;
890 const int MaximumNumberOfMovingLetters
= 3;
891 bool PrintChangedTemplates
= true;
893 Sort (TEMPLATE_SORT
);
896 // Check Through each Template Once
897 for (int i
= 0; i
< GetCount(); i
++) {
898 CTemplate
* pTemplate
= GetAtSort(i
);
900 if (pTemplate
->m_IsDeleted
) continue;
903 if ( i == GetCount() -1)
905 QMessageBox::information (NULL, "Debug", QString("Run Template %1").arg(i), "OK");
909 pTemplate
->AdjustMeByMovingCommonTailOrHead2(m_GlobalNodeStickNess2
,MaximumSizeOfEachColumn
, \
910 MaximumNumberOfMovingLetters
,PrintChangedTemplates
, Loopi
, true, m_GlobalStickNess2
, m_TotalGlobalStickNess2
, m_TotalWord2
);
// Collapses pairs of similar templates into one.
//
// loopnumber - written into the log header of each collapse.
//
// Only templates with OnlyConsiderTemplatesWithSlotNumber (3) columns are
// considered. Within a template, two adjacent columns are treated as the
// non-stem columns and the remaining one as the stem column; the template
// qualifies when both non-stem columns have fewer than
// MinimumSizeOfStemColumn (5) entries and the stem column has at least that
// many. A pair (p, q) with the same non-stem position is collapsed when
//   1. one non-stem column is identical in both,
//   2. the other non-stem column differs by at most MaximumSymmetricError (2)
//      entries, counted in both directions, and
//   3. the stem columns share at least MinimumCommonStem (2) entries.
// On collapse q is flagged m_IsDeleted, its differing non-stem column and its
// stem column are merged into p, and p is flagged m_IsNewAfterCollapse1.
// The original templates, the collapsed template and the newly generated
// words are appended to "CollapseWithAlgorithm1.txt".
916 bool CTemplateCollection::CollapseAlgorithm1(int loopnumber
)
// Tuning constants for the algorithm.
922 int MinimumSizeOfStemColumn
= 5;
923 int OnlyConsiderTemplatesWithSlotNumber
=3;
924 int MaximumSymmetricError
= 2;
925 int MinimumCommonStem
=2;
926 int MaximumOutputMorphemesInOneColumn
=10000;
927 bool PrintCollapsedTemplates
= true;
928 bool PrintCreatedWords
= true;
929 bool DisplayOldDeletedTemplates
= false;
932 CTemplate
* qTemplate
;
934 CParse
* pLeftColumn
, *pRightColumn
, *pStemColumn
;
935 CParse
* qLeftColumn
, *qRightColumn
, *qStemColumn
;
936 int pIndexOfFirstOfNonStemColumns
,pIndexOfStemColumn
;
937 int qIndexOfFirstOfNonStemColumns
,qIndexOfStemColumn
;
938 bool pShouldConsiderMe
;
939 bool qShouldConsiderMe
;
940 bool CollapsedAnyOne
;
946 CollapsedAnyOne
= true;
948 // Loop until no more templates are collapsed
949 while ( CollapsedAnyOne
)
952 // Check Through each Template for possible collapsing
953 for (i
= 0; i
< GetCount()-1; i
++) {
954 CTemplate
* pTemplate
= GetAtSort(i
);
955 pShouldConsiderMe
= false;
// NOTE(review): CollapsedAnyOne is reset on every pass of this for loop while
// it is also the condition of the enclosing while loop -- confirm that the
// interaction with the checks further down is as intended.
956 CollapsedAnyOne
= false;
958 if (pTemplate
->m_IsDeleted
)
964 if (pTemplate
->m_NumberOfColumns
!= OnlyConsiderTemplatesWithSlotNumber
)
// Try each adjacent column pair of p as the non-stem columns.
970 for ( column
=0; column
< pTemplate
->m_NumberOfColumns
-1; column
++)
972 pLeftColumn
= pTemplate
->m_Columns
[column
];
973 pRightColumn
= pTemplate
->m_Columns
[column
+1];
974 pIndexOfFirstOfNonStemColumns
= column
;
978 pStemColumn
= pTemplate
->m_Columns
[column
+2];
979 pIndexOfStemColumn
= column
+ 2;
983 pStemColumn
= pTemplate
->m_Columns
[0];
984 pIndexOfStemColumn
=0;
987 if (( pLeftColumn
->Size() < MinimumSizeOfStemColumn
) \
988 && ( pRightColumn
->Size() < MinimumSizeOfStemColumn
)\
989 && ( pStemColumn
->Size() >= MinimumSizeOfStemColumn
))
991 pShouldConsiderMe
= true;
996 if ( !pShouldConsiderMe
)
// Candidate partners are the templates ranked after p.
1002 for (j
= i
+1; j
< GetCount(); j
++)
1004 qTemplate
= GetAtSort(j
);
1005 qShouldConsiderMe
= false;
1007 if (qTemplate
->m_IsDeleted
)
1012 if (qTemplate
->m_NumberOfColumns
!= OnlyConsiderTemplatesWithSlotNumber
)
1018 for ( column
=0; column
< qTemplate
->m_NumberOfColumns
-1; column
++)
// q must use the same non-stem position as p.
1021 if ( column
!= pIndexOfFirstOfNonStemColumns
)
1026 qLeftColumn
= qTemplate
->m_Columns
[column
];
1027 qRightColumn
= qTemplate
->m_Columns
[column
+1];
1028 qIndexOfFirstOfNonStemColumns
= column
;
1032 qStemColumn
= qTemplate
->m_Columns
[column
+2];
1033 qIndexOfStemColumn
= column
+ 2;
1037 qStemColumn
= qTemplate
->m_Columns
[0];
1038 qIndexOfStemColumn
=0;
1041 if (( qLeftColumn
->Size() < MinimumSizeOfStemColumn
) \
1042 && ( qRightColumn
->Size() < MinimumSizeOfStemColumn
)\
1043 && ( qStemColumn
->Size() >= MinimumSizeOfStemColumn
))
1045 qShouldConsiderMe
= true;
1050 if ( !qShouldConsiderMe
)
1055 //*******************************************************
1056 // Now the pTemplate and qTemplate are ready to compare
1058 // 1. One of the two non-stem columns is identical
1059 // 2. Another non-stem column's symmetric error less than MaximumSymmetricError
1060 // 3. The # of common stems is bigger than MinimumCommonStem
1061 //********************************************************
1063 bool PassNonStemColumnTest
;
1064 bool PassStemColumnTest
;
1065 int NumberOfSymmetricError
;
1066 int NumberOfCommonStems
;
1068 int outputi
, outputj
;
1069 QString TheMorpheme
;
1070 int EqualNonStemColumn
;
1071 int MergeNonStemColumn
;
1072 CParse
* pOneColumn
, *qOneColumn
;
1073 CStringSurrogate TempCSS
;
// The non-stem columns are alphabetized before they are compared.
1079 pLeftColumn
->Alphabetize();
1080 pRightColumn
->Alphabetize();
1082 qLeftColumn
->Alphabetize();
1083 qRightColumn
->Alphabetize();
1085 PassNonStemColumnTest
= false;
1086 PassStemColumnTest
= false;
1088 // Check whether LeftColumn is equal
1090 if ( (*pLeftColumn
) == qLeftColumn
)
1092 NumberOfSymmetricError
=0;
// Symmetric difference of the right columns: entries of p missing in q ...
1094 for (piecei
=1; piecei
<= pRightColumn
->Size(); piecei
++)
1096 TheMorpheme
= pRightColumn
->GetPiece(piecei
).Display();
1098 TempCSS
= CStringSurrogate(TheMorpheme
);
1099 if ( !qRightColumn
->Contains(TempCSS
))
1101 NumberOfSymmetricError
++;
// ... plus entries of q missing in p.
1105 for (piecei
=1; piecei
<= qRightColumn
->Size(); piecei
++)
1107 TheMorpheme
= qRightColumn
->GetPiece(piecei
).Display();
1109 TempCSS
= CStringSurrogate(TheMorpheme
);
1110 if ( !pRightColumn
->Contains(TempCSS
))
1112 NumberOfSymmetricError
++;
1117 if (NumberOfSymmetricError
<= MaximumSymmetricError
)
1119 PassNonStemColumnTest
= true;
1120 EqualNonStemColumn
= pIndexOfFirstOfNonStemColumns
;
1121 MergeNonStemColumn
= pIndexOfFirstOfNonStemColumns
+ 1;
1125 PassNonStemColumnTest
= false;
// Mirror case: the right columns are equal, so the left columns are compared.
1128 }else if ((*pRightColumn
) == qRightColumn
)
1130 NumberOfSymmetricError
=0;
1132 for (piecei
=1; piecei
<= pLeftColumn
->Size(); piecei
++)
1134 TheMorpheme
= pLeftColumn
->GetPiece(piecei
).Display();
1136 TempCSS
= CStringSurrogate(TheMorpheme
);
1137 if ( !qLeftColumn
->Contains(TempCSS
) )
1139 NumberOfSymmetricError
++;
1143 for (piecei
=1; piecei
<= qLeftColumn
->Size(); piecei
++)
1145 TheMorpheme
= qLeftColumn
->GetPiece(piecei
).Display();
1147 TempCSS
= CStringSurrogate(TheMorpheme
);
1148 if ( !pLeftColumn
->Contains(TempCSS
))
1150 NumberOfSymmetricError
++;
1155 if (NumberOfSymmetricError
<= MaximumSymmetricError
)
1157 PassNonStemColumnTest
= true;
1158 EqualNonStemColumn
= pIndexOfFirstOfNonStemColumns
+1;
1159 MergeNonStemColumn
= pIndexOfFirstOfNonStemColumns
;
1163 PassNonStemColumnTest
= false;
1168 if ( !PassNonStemColumnTest
)
// Stem test: count the stems of p that also occur in q.
1174 NumberOfCommonStems
=0;
1176 for (piecei
=1; piecei
<= pStemColumn
->Size(); piecei
++)
1178 TheMorpheme
= pStemColumn
->GetPiece(piecei
).Display();
1180 TempCSS
= CStringSurrogate(TheMorpheme
);
1181 if ( qStemColumn
->Contains(TempCSS
) )
1183 NumberOfCommonStems
++;
1187 if ( NumberOfCommonStems
>= MinimumCommonStem
)
1189 PassStemColumnTest
= true;
1193 PassStemColumnTest
= false;
1198 // Now, we collapse these two template together
1199 CollapsedAnyOne
= true;
1201 if (!DisplayOldDeletedTemplates
)
1203 m_NumberOfDeletedTemplates
++;
// q is only flagged as deleted; its entries are merged into p further down.
1206 qTemplate
->m_IsDeleted
= true;
1207 // Print the attempted templates
1208 if ( PrintCollapsedTemplates
)
1210 QString DisplayOfOneColumn
;
// The log file is opened in append mode for every collapse.
1213 QFile
file( "CollapseWithAlgorithm1.txt" );
1215 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1217 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1221 Q3TextStream
outf( &file
);
1225 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1227 outf
<< "***************"<<loopnumber
<<"********************" <<endl
;
1228 outf
<< " Original P-Template:" << endl
;
// Each column is printed as "{ a , b , ... }", limited to
// MaximumOutputMorphemesInOneColumn entries.
1231 for ( outputi
= 0; outputi
< pTemplate
->m_NumberOfColumns
; outputi
++)
1233 pOneColumn
= pTemplate
->m_Columns
[outputi
];
1235 DisplayOfOneColumn
= QString("{ ");
1237 RealTrimSize
= MaximumOutputMorphemesInOneColumn
;
1238 if ( pOneColumn
->Size() < RealTrimSize
)
1240 RealTrimSize
= pOneColumn
->Size();
1243 for (outputj
= 1; outputj
<= RealTrimSize
;outputj
++)
1245 DisplayOfOneColumn
+= pOneColumn
->GetPiece(outputj
).Display();
1246 if ( outputj
!= RealTrimSize
)
1248 DisplayOfOneColumn
+= QString(" , ");
1252 DisplayOfOneColumn
+= QString(" }");
1253 outf
<< DisplayOfOneColumn
;
1254 if ( outputi
!= pTemplate
->m_NumberOfColumns
-1)
1260 outf
<< endl
<< endl
;
1262 outf
<< " Original q-Template:" << endl
;
1265 for ( outputi
= 0; outputi
< qTemplate
->m_NumberOfColumns
; outputi
++)
1267 qOneColumn
= qTemplate
->m_Columns
[outputi
];
1269 DisplayOfOneColumn
= QString("{ ");
1271 RealTrimSize
= MaximumOutputMorphemesInOneColumn
;
1272 if ( qOneColumn
->Size() < RealTrimSize
)
1274 RealTrimSize
= qOneColumn
->Size();
1277 for (outputj
= 1; outputj
<= RealTrimSize
;outputj
++)
1279 DisplayOfOneColumn
+= qOneColumn
->GetPiece(outputj
).Display();
1280 if ( outputj
!= RealTrimSize
)
1282 DisplayOfOneColumn
+= QString(" , ");
1286 DisplayOfOneColumn
+= QString(" }");
1287 outf
<< DisplayOfOneColumn
;
1288 if ( outputi
!= qTemplate
->m_NumberOfColumns
-1)
1294 outf
<< endl
<< endl
;
1300 // To Print out the new created words, Keep old words
1301 StringToParse OneWordsAndParses
;
1302 StringToParse OldAllWordsAndParses
;
1303 StringToParse NewAllWordsAndParses
;
1304 StringToParse::iterator StringToParseIt
;
1309 // Populate the old word list
// Only the word keys matter here; the map values are stored as NULL.
1310 if ( PrintCreatedWords
)
1312 OneWordsAndParses
.clear();
1313 pTemplate
->GetWordsAndParses(OneWordsAndParses
);
1315 for ( StringToParseIt
= OneWordsAndParses
.begin(); StringToParseIt
!= OneWordsAndParses
.end(); StringToParseIt
++)
1317 TheWord
= StringToParseIt
.key();
1318 DumpParse
= StringToParseIt
.data();
1322 if ( !OldAllWordsAndParses
.contains(TheWord
))
1324 OldAllWordsAndParses
.insert(TheWord
, NULL
);
1330 OneWordsAndParses
.clear();
1331 qTemplate
->GetWordsAndParses(OneWordsAndParses
);
1334 for ( StringToParseIt
= OneWordsAndParses
.begin(); StringToParseIt
!= OneWordsAndParses
.end(); StringToParseIt
++)
1337 TheWord
= StringToParseIt
.key();
1338 DumpParse
= StringToParseIt
.data();
1341 if ( !OldAllWordsAndParses
.contains(TheWord
))
1343 OldAllWordsAndParses
.insert(TheWord
, NULL
);
1350 // First, merge not-equal non-stem column from qTemplate into pTemplate
1351 pOneColumn
= pTemplate
->m_Columns
[MergeNonStemColumn
];
1352 qOneColumn
= qTemplate
->m_Columns
[MergeNonStemColumn
];
1354 for (piecei
=1; piecei
<= qOneColumn
->Size(); piecei
++)
1356 TheMorpheme
= qOneColumn
->GetPiece(piecei
).Display();
1358 TempCSS
= CStringSurrogate(TheMorpheme
);
1359 if ( !pOneColumn
->Contains(TempCSS
) )
1361 // pOneColumn ->AppendInAlphabeticalOrder(TempCSS);
1362 pOneColumn
->Append (TempCSS
);
1369 // Second, merge the stem column from qTemplate into pTemplate
1370 pOneColumn
= pTemplate
->m_Columns
[pIndexOfStemColumn
];
1371 qOneColumn
= qTemplate
->m_Columns
[qIndexOfStemColumn
];
1373 for (piecei
=1; piecei
<= qOneColumn
->Size(); piecei
++)
1375 TheMorpheme
= qOneColumn
->GetPiece(piecei
).Display();
1377 TempCSS
= CStringSurrogate(TheMorpheme
);
1378 if ( !pOneColumn
->Contains(TempCSS
) )
1380 // pOneColumn ->AppendInAlphabeticalOrder(TempCSS);
1381 pOneColumn
->Append (TempCSS
);
1385 // Change the flag m_IsNewAfterCollapse1 and m_StemColumnInCollapse1
1386 pTemplate
->m_IsNewAfterCollapse1
= true;
1387 pTemplate
->m_StemColumnInCollapse1
= pIndexOfStemColumn
;
1390 // Print the attempted templates
1391 if ( PrintCollapsedTemplates
)
1393 QString DisplayOfOneColumn
;
1396 QFile
file( "CollapseWithAlgorithm1.txt" );
1398 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1400 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1404 Q3TextStream
outf( &file
);
1408 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1409 outf
<< " Collapsed -Template:" << endl
;
1412 for ( outputi
= 0; outputi
< pTemplate
->m_NumberOfColumns
; outputi
++)
1414 pOneColumn
= pTemplate
->m_Columns
[outputi
];
1416 DisplayOfOneColumn
= QString("{ ");
1418 RealTrimSize
= MaximumOutputMorphemesInOneColumn
;
1419 if ( pOneColumn
->Size() < RealTrimSize
)
1421 RealTrimSize
= pOneColumn
->Size();
1424 for (outputj
= 1; outputj
<= RealTrimSize
;outputj
++)
1426 DisplayOfOneColumn
+= pOneColumn
->GetPiece(outputj
).Display();
1427 if ( outputj
!= RealTrimSize
)
1429 DisplayOfOneColumn
+= QString(" , ");
1433 DisplayOfOneColumn
+= QString(" }");
1434 outf
<< DisplayOfOneColumn
;
1435 if ( outputi
!= pTemplate
->m_NumberOfColumns
-1)
1441 outf
<< endl
<< endl
;
1445 // Populate the new word list
1446 if ( PrintCreatedWords
)
1448 OneWordsAndParses
.clear();
1449 pTemplate
->GetWordsAndParses(OneWordsAndParses
);
1452 for ( StringToParseIt
= OneWordsAndParses
.begin(); StringToParseIt
!= OneWordsAndParses
.end(); StringToParseIt
++)
1454 TheWord
= StringToParseIt
.key();
1455 DumpParse
= StringToParseIt
.data();
1458 if ( !NewAllWordsAndParses
.contains(TheWord
))
1460 NewAllWordsAndParses
.insert(TheWord
, NULL
);
1468 // Print out the new created words
// A word is "new" when the collapsed template generates it but neither of
// the two original templates did.
1469 if ( PrintCreatedWords
)
1471 QFile
file( "CollapseWithAlgorithm1.txt" );
1473 if ( !file
.open( QIODevice::WriteOnly
| QIODevice::Append
) )
1475 QMessageBox::information(NULL
, "Error", "Can't Open the file!", "OK");
1479 Q3TextStream
outf( &file
);
1481 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1482 outf
<< " New-Created Words:" << endl
;
1485 for ( StringToParseIt
= NewAllWordsAndParses
.begin(); StringToParseIt
!= NewAllWordsAndParses
.end(); StringToParseIt
++)
1488 TheWord
= StringToParseIt
.key();
1490 if ( !OldAllWordsAndParses
.contains(TheWord
))
1492 outf
<< " "<< TheWord
<<" ,";
1497 outf
<< "}" << endl
<< endl
<<endl
;
1507 if ( CollapsedAnyOne
)
// Forwards `value` to SetSwitchOfSortingValue on every template in the
// collection, visiting them through GetAtSort.
1520 void CTemplateCollection::SetSwitchOfSortingValue(bool value
)
1522 for (int i
= 0; i
< GetCount(); ++i
) {
1523 CTemplate
* pTemplate
= GetAtSort(i
);
1524 pTemplate
->SetSwitchOfSortingValue(value
);
// Lets higher-ranked templates absorb words from lower-ranked ones.
//
// Loopi - forwarded unchanged to CTemplate::AdjustMeAfterAbsorb1.
//
// For each non-deleted template p at sorted positions 1 .. GetCount()-2, the
// words of p are compared with the words of every non-deleted template q
// ranked before it. A word is recorded in CommonWords when it occurs in both
// and OneParseContainAnother(qParse, pParse) holds. When at least one common
// word was found, p is adjusted through AdjustMeAfterAbsorb1.
// NOTE(review): the loop bound `i < GetCount()-1` leaves out the last sorted
// template -- confirm this is intended.
1530 void CTemplateCollection::AbsorbWords1(int Loopi
)
1534 bool Conservative
= false;
1535 int MiniMumPrefixOrSuffixNeedToBeAbsorbed
= 2;
1536 int UnTouchedTopTemplate
= 5;
1539 CTemplate
* qTemplate
;
1540 StringToParse pWordsAndParses
;
1541 StringToParse qWordsAndParses
;
1542 StringToParse::iterator StringToParseIt
;
1543 StringToInt CommonWords
;
1545 CParse
* pParse
, *qParse
;
1550 for (i
= 1; i
< GetCount()-1; i
++) {
1551 CTemplate
* pTemplate
= GetAtSort(i
);
1553 if (pTemplate
->m_IsDeleted
)
// Templates whose TemplateIndex is at most UnTouchedTopTemplate are tested
// separately here.
1559 if ( TemplateIndex
<= UnTouchedTopTemplate
)
1564 pWordsAndParses
.clear();
1565 pTemplate
->GetWordsAndParses(pWordsAndParses
);
1567 CommonWords
.clear();
// Compare against every template ranked before p.
1569 for ( j
=0; j
< i
; j
++)
1571 qTemplate
= GetAtSort(j
);
1573 if (qTemplate
->m_IsDeleted
)
1578 // Check the common word between pTemplate and qTemplate
1579 qWordsAndParses
.clear();
1580 qTemplate
->GetWordsAndParses(qWordsAndParses
);
1583 for ( StringToParseIt
= qWordsAndParses
.begin(); StringToParseIt
!= qWordsAndParses
.end(); StringToParseIt
++)
1585 oneWord
= StringToParseIt
.key();
1586 qParse
= StringToParseIt
.data();
1588 if ( pWordsAndParses
.contains(oneWord
))
1590 pParse
= pWordsAndParses
[oneWord
];
1591 Contained
= OneParseContainAnother(qParse
, pParse
);
1593 if (!Contained
) continue;
1595 // We found this word in pParse can be absorbed to higher ranking template
1596 CommonWords
.insert(oneWord
, 1);
1607 // Clean the qWordsAndParse
1609 for ( StringToParseIt
= qWordsAndParses
.begin(); StringToParseIt
!= qWordsAndParses
.end(); StringToParseIt
++)
1611 qParse
= StringToParseIt
.data();
1617 // Now, for the template p, we have got the Common Words between pTemplate and all qTemplates
1619 if ( CommonWords
.count() == 0)
1621 // Clean the pWordsAndParses
1622 for ( StringToParseIt
= pWordsAndParses
.begin(); StringToParseIt
!= pWordsAndParses
.end(); StringToParseIt
++)
1624 pParse
= StringToParseIt
.data();
1632 // Next, we deal with these common words for pTemplate
1634 pTemplate
->AdjustMeAfterAbsorb1(CommonWords
,Conservative
, MiniMumPrefixOrSuffixNeedToBeAbsorbed
, Loopi
);
1636 // Clean the pWordsAndParses
1637 for ( StringToParseIt
= pWordsAndParses
.begin(); StringToParseIt
!= pWordsAndParses
.end(); StringToParseIt
++)
1639 pParse
= StringToParseIt
.data();
// Compares the cut locations of two parses.
// For every piece index i of pParse (1-based, 1..pParse->Size()), the piece
// location GetPieceLoc(i) is looked for among all piece locations of qParse.
// NOTE(review): presumably returns true only when every cut of pParse is
// matched by a cut of qParse - confirm.
1651 // Whether qParse contains/Covers pParse
1652 bool CTemplateCollection::OneParseContainAnother(CParse
* qParse
, CParse
* pParse
)
1654 bool Success
, FoundOneCut
;
1655 int oneCutLoc
, PossibleLoc
;
// Outer loop: one pass per piece of pParse.
1658 for (int i
= 1; i
<= pParse
->Size(); i
++)
1660 oneCutLoc
= pParse
->GetPieceLoc(i
);
// The match flag is reset before qParse is scanned for this cut.
1662 FoundOneCut
= false;
1663 for ( int j
=1; j
<= qParse
->Size(); j
++)
1665 PossibleLoc
= qParse
->GetPieceLoc(j
);
// Same location in both parses: this cut of pParse has a match in qParse.
1667 if ( oneCutLoc
== PossibleLoc
)
// Calls FindMorphemePrefixOrSuffixWithParadigmaticGraph() on every template
// that is not marked deleted, in GetAtSort() order.
// Each call receives Loopi, the stem-column limit defined below, and this
// collection's m_GlobalNodeStickNess2, m_GlobalStickNess2,
// m_TotalGlobalStickNess2 and m_TotalWord2.
1688 void CTemplateCollection::FindMorphemePrefixOrSuffixWithParadigmaticGraph(int Loopi
)
// Fixed limit handed to every template as the second argument.
1690 int MaximumSizeOfStemColumn
= 5;
1692 for (int i
= 0; i
< GetCount(); i
++) {
1693 CTemplate
* pTemplate
= GetAtSort(i
);
// Templates flagged as deleted are skipped.
1695 if ( pTemplate
->m_IsDeleted
) continue;
1697 pTemplate
->FindMorphemePrefixOrSuffixWithParadigmaticGraph(Loopi
, MaximumSizeOfStemColumn
, m_GlobalNodeStickNess2
, m_GlobalStickNess2
, m_TotalGlobalStickNess2
, m_TotalWord2
);
// Reads the text file inFileName line by line and writes a morphologically
// segmented version to outFileName.
//
// Every input line is lower-cased and whitespace-simplified, then consumed
// word by word (split at the first blank). A word that has an entry in
// MorphologyCuts is written as its pieces; Strategy selects the marking:
//   1 - the result of theCStem->Display(QChar(' ')), no markers;
//   2 - a first piece that is not the stem layer gets a trailing "_", a last
//       piece that is not the stem layer gets a leading "_", all other
//       pieces are unmarked;
//   3 - every piece other than the stem layer gets the suffix "_<i>", where
//       i is its 1-based piece index;
//   4 - pieces before the stem layer get "_-<k>", pieces after it "_+<k>",
//       where k is the distance in pieces from the stem layer.
// Both files are accessed through Q3TextStream with Unicode encoding.
//
// NOTE(review): StemLayer is presumably the index of the longest piece (see
// the StemSize comparisons) - confirm where it is assigned.
// NOTE(review): the four strategy branches repeat the same file handling and
// line splitting; candidates for a shared helper.
1704 void CTemplateCollection::CutMtCorpusWithMorphologyAnalyses(QString inFileName
, QString outFileName
, StringToPtrCStem
& MorphologyCuts
, int Strategy
)
1707 if ( Strategy
== 1) // Just take the morphemes, don't care its layers or location
// Input is opened read-only, output write-only.
1709 QFile
inFile( inFileName
);
1710 if ( inFile
.open( QIODevice::ReadOnly
) )
1720 QFile
oFile( outFileName
);
1721 if ( oFile
.open( QIODevice::WriteOnly
) )
1731 Q3TextStream
instream( &inFile
);
1732 Q3TextStream
ostream( &oFile
);
1735 // Unicode or ASCII encoding?
1736 instream
.setEncoding ( Q3TextStream::Unicode
);
1737 ostream
.setEncoding ( Q3TextStream::Unicode
);
1740 // Read these sentences in...
1741 while( !instream
.atEnd() )
// Normalize the line: lower-case first, whitespace clean-up below.
1743 oneLine
= instream
.readLine();
1744 oneLine
= oneLine
.lower();
1752 // Strip the start and end white space
1754 oneLine
= oneLine
.simplifyWhiteSpace ();
// An empty line is echoed as one blank followed by a line end.
1756 if ( oneLine
.length() ==0)
1758 ostream
<<" " <<endl
;
// Take the text before the first blank as the next word.
1765 Pos
= oneLine
.find(QString(" "));
1769 oneWord
= oneLine
.left(Pos
);
// Known word: use the display form of its analysis, built with a blank as the
// Display() argument (presumably the separator between pieces - confirm).
1771 if ( MorphologyCuts
.contains(oneWord
))
1773 theCStem
= MorphologyCuts
[oneWord
];
1774 outWord
= theCStem
->Display(QChar(' '));
1782 ostream
<< outWord
<<" ";
// Drop the consumed word and its separating blank from the line.
1784 oneLine
= oneLine
.right(oneLine
.length() - Pos
-1);
// NOTE(review): second lookup of oneWord - presumably for the last word of
// the line, once no further blank is found; confirm.
1790 if ( MorphologyCuts
.contains(oneWord
))
1792 theCStem
= MorphologyCuts
[oneWord
];
1793 outWord
= theCStem
->Display(QChar(' '));
1818 if ( Strategy
== 2) // Take the morphemes, make difference for begin and end
// Same file set-up and line normalization as for Strategy 1.
1820 QFile
inFile( inFileName
);
1821 if ( inFile
.open( QIODevice::ReadOnly
) )
1831 QFile
oFile( outFileName
);
1832 if ( oFile
.open( QIODevice::WriteOnly
) )
1842 Q3TextStream
instream( &inFile
);
1843 Q3TextStream
ostream( &oFile
);
1846 // Unicode or ASCII encoding?
1847 instream
.setEncoding ( Q3TextStream::Unicode
);
1848 ostream
.setEncoding ( Q3TextStream::Unicode
);
1851 // Read these sentences in...
1852 while( !instream
.atEnd() )
1854 oneLine
= instream
.readLine();
1855 oneLine
= oneLine
.lower();
1868 // Strip the start and end white space
1870 oneLine
= oneLine
.simplifyWhiteSpace ();
1872 if ( oneLine
.length() ==0)
1874 ostream
<<" " <<endl
;
1881 Pos
= oneLine
.find(QString(" "));
1885 oneWord
= oneLine
.left(Pos
);
1887 if ( MorphologyCuts
.contains(oneWord
))
1889 theCStem
= MorphologyCuts
[oneWord
];
// Single-piece analyses are tested for separately.
1891 if ( theCStem
->Size() == 1)
1898 // For Stem, we keep it being
// First pass over the pieces: StemSize follows the largest piece length seen;
// the test uses >=, so a tie also takes the branch.
1902 for ( i
=1; i
<= theCStem
->Size(); i
++)
1904 onePiece
= theCStem
->GetPiece(i
).Display();
1905 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% complaining signed-unsigned; type cast to fix warning
1908 StemSize
=onePiece
.length();
// Second pass: rebuild the word piece by piece with the Strategy 2 markers.
1914 outWord
= QString("");
1915 for ( i
=1; i
<= theCStem
->Size(); i
++)
1917 onePiece
= theCStem
->GetPiece(i
).Display();
// A piece that is both first and last means a single-piece word, which the
// Size() == 1 test above is presumably meant to catch - hence the debug box.
1919 if ( (i
== 1) && (i
== theCStem
->Size()))
1921 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
// First piece, not the stem layer: trailing "_".
1925 if (( i
== 1) && (i
!= StemLayer
))
1927 onePiece
= onePiece
+ "_";
1928 outWord
= outWord
+ onePiece
+ " ";
// Last piece, not the stem layer: leading "_", no blank appended.
1930 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
1932 onePiece
= "_" + onePiece
;
1933 outWord
= outWord
+ onePiece
;
// Last piece that is the stem layer: appended unmarked, no blank.
1935 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
1937 outWord
= outWord
+ onePiece
;
1941 outWord
= outWord
+ onePiece
+ " ";
1955 ostream
<< outWord
<<" ";
1957 oneLine
= oneLine
.right(oneLine
.length() - Pos
-1);
// Same piece handling as above, applied to the second lookup of oneWord.
1963 if ( MorphologyCuts
.contains(oneWord
))
1965 theCStem
= MorphologyCuts
[oneWord
];
1967 if ( theCStem
->Size() == 1)
1974 // For Stem, we keep it being
1978 for ( i
=1; i
<= theCStem
->Size(); i
++)
1980 onePiece
= theCStem
->GetPiece(i
).Display();
1981 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% length(); gives unsigned
1984 StemSize
=onePiece
.length();
1990 outWord
= QString("");
1991 for ( i
=1; i
<= theCStem
->Size(); i
++)
1993 onePiece
= theCStem
->GetPiece(i
).Display();
1995 if ( (i
== 1) && (i
== theCStem
->Size()))
1997 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
2001 if (( i
== 1) && (i
!= StemLayer
))
2003 onePiece
= onePiece
+ "_";
2004 outWord
= outWord
+ onePiece
+ " ";
2006 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
2008 onePiece
= "_" + onePiece
;
2009 outWord
= outWord
+ onePiece
;
2011 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
2013 outWord
= outWord
+ onePiece
;
2017 outWord
= outWord
+ onePiece
+ " ";
2047 if ( Strategy
== 3) // Take the morphemes, make difference for its location
// Same file set-up and line normalization as for Strategy 1.
2049 QFile
inFile( inFileName
);
2050 if ( inFile
.open( QIODevice::ReadOnly
) )
2060 QFile
oFile( outFileName
);
2061 if ( oFile
.open( QIODevice::WriteOnly
) )
2071 Q3TextStream
instream( &inFile
);
2072 Q3TextStream
ostream( &oFile
);
2075 // Unicode or ASCII encoding?
2076 instream
.setEncoding ( Q3TextStream::Unicode
);
2077 ostream
.setEncoding ( Q3TextStream::Unicode
);
2080 // Read these sentences in...
2081 while( !instream
.atEnd() )
2083 oneLine
= instream
.readLine();
2084 oneLine
= oneLine
.lower();
2097 // Strip the start and end white space
2099 oneLine
= oneLine
.simplifyWhiteSpace ();
2101 if ( oneLine
.length() ==0)
2103 ostream
<<" " <<endl
;
2110 Pos
= oneLine
.find(QString(" "));
2114 oneWord
= oneLine
.left(Pos
);
2116 if ( MorphologyCuts
.contains(oneWord
))
2118 theCStem
= MorphologyCuts
[oneWord
];
2120 if ( theCStem
->Size() == 1)
2127 // For Stem, we keep it being
// First pass: StemSize follows the largest piece length seen.
2131 for ( i
=1; i
<= theCStem
->Size(); i
++)
2133 onePiece
= theCStem
->GetPiece(i
).Display();
2134 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% length(); gives unsigned
2137 StemSize
=onePiece
.length();
// Second pass: every piece other than the stem layer is tagged "_<i>" with
// its 1-based index; the last piece is appended without a trailing blank.
2143 outWord
= QString("");
2144 for ( i
=1; i
<= theCStem
->Size(); i
++)
2146 onePiece
= theCStem
->GetPiece(i
).Display();
2148 if ( (i
== 1) && (i
== theCStem
->Size()))
2150 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
2154 if (( i
== 1) && (i
!= StemLayer
))
2156 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2157 outWord
= outWord
+ onePiece
+ " ";
2159 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
2161 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2162 outWord
= outWord
+ onePiece
;
2164 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
2166 outWord
= outWord
+ onePiece
;
// Remaining pieces: the stem layer stays unmarked, the others are tagged.
2170 if ( i
== StemLayer
)
2172 outWord
= outWord
+ onePiece
+ " ";
2176 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2177 outWord
= outWord
+ onePiece
+ " ";
2192 ostream
<< outWord
<<" ";
2194 oneLine
= oneLine
.right(oneLine
.length() - Pos
-1);
// Same piece handling as above, applied to the second lookup of oneWord.
2200 if ( MorphologyCuts
.contains(oneWord
))
2202 theCStem
= MorphologyCuts
[oneWord
];
2204 if ( theCStem
->Size() == 1)
2211 // For Stem, we keep it being
2215 for ( i
=1; i
<= theCStem
->Size(); i
++)
2217 onePiece
= theCStem
->GetPiece(i
).Display();
2218 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% length(); gives unsigned
2221 StemSize
=onePiece
.length();
2227 outWord
= QString("");
2228 for ( i
=1; i
<= theCStem
->Size(); i
++)
2230 onePiece
= theCStem
->GetPiece(i
).Display();
2232 if ( (i
== 1) && (i
== theCStem
->Size()))
2234 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
2238 if (( i
== 1) && (i
!= StemLayer
))
2240 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2241 outWord
= outWord
+ onePiece
+ " ";
2243 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
2245 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2246 outWord
= outWord
+ onePiece
;
2248 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
2250 outWord
= outWord
+ onePiece
;
2254 if ( i
== StemLayer
)
2256 outWord
= outWord
+ onePiece
+ " ";
2260 onePiece
= onePiece
+ "_" + QString("%1").arg(i
);
2261 outWord
= outWord
+ onePiece
+ " ";
2294 if ( Strategy
== 4) // Take the morphemes, make difference for its relative location to Stemlayer
// Same file set-up and line normalization as for Strategy 1.
2296 QFile
inFile( inFileName
);
2297 if ( inFile
.open( QIODevice::ReadOnly
) )
2307 QFile
oFile( outFileName
);
2308 if ( oFile
.open( QIODevice::WriteOnly
) )
2318 Q3TextStream
instream( &inFile
);
2319 Q3TextStream
ostream( &oFile
);
2322 // Unicode or ASCII encoding?
2323 instream
.setEncoding ( Q3TextStream::Unicode
);
2324 ostream
.setEncoding ( Q3TextStream::Unicode
);
2327 // Read these sentences in...
2328 while( !instream
.atEnd() )
2330 oneLine
= instream
.readLine();
2331 oneLine
= oneLine
.lower();
2344 // Strip the start and end white space
2346 oneLine
= oneLine
.simplifyWhiteSpace ();
2348 if ( oneLine
.length() ==0)
2350 ostream
<<" " <<endl
;
2357 Pos
= oneLine
.find(QString(" "));
2361 oneWord
= oneLine
.left(Pos
);
2363 if ( MorphologyCuts
.contains(oneWord
))
2365 theCStem
= MorphologyCuts
[oneWord
];
2367 if ( theCStem
->Size() == 1)
2374 // For Stem, we keep it being
// First pass: StemSize follows the largest piece length seen.
2378 for ( i
=1; i
<= theCStem
->Size(); i
++)
2380 onePiece
= theCStem
->GetPiece(i
).Display();
2381 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% length(); gives unsigned
2384 StemSize
=onePiece
.length();
// Second pass: pieces are tagged with their signed distance from the stem
// layer, "_-<k>" before it and "_+<k>" after it.
2390 outWord
= QString("");
2391 for ( i
=1; i
<= theCStem
->Size(); i
++)
2393 onePiece
= theCStem
->GetPiece(i
).Display();
2395 if ( (i
== 1) && (i
== theCStem
->Size()))
2397 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
2401 if (( i
== 1) && (i
!= StemLayer
))
2403 onePiece
= onePiece
+ "_-" + QString("%1").arg(StemLayer
-i
);
2404 outWord
= outWord
+ onePiece
+ " ";
2406 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
2408 onePiece
= onePiece
+ "_+" + QString("%1").arg(i
-StemLayer
);
2409 outWord
= outWord
+ onePiece
;
2411 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
2413 outWord
= outWord
+ onePiece
;
// Remaining pieces: unmarked at the stem layer, "_+" after it, "_-" before.
2417 if ( i
== StemLayer
)
2419 outWord
= outWord
+ onePiece
+ " ";
2423 if ( i
> StemLayer
)
2425 onePiece
= onePiece
+ "_+" + QString("%1").arg(i
- StemLayer
);
2426 outWord
= outWord
+ onePiece
+ " ";
2430 onePiece
= onePiece
+ "_-" + QString("%1").arg(StemLayer
-i
);
2431 outWord
= outWord
+ onePiece
+ " ";
2447 ostream
<< outWord
<<" ";
2449 oneLine
= oneLine
.right(oneLine
.length() - Pos
-1);
// Same piece handling as above, applied to the second lookup of oneWord.
2455 if ( MorphologyCuts
.contains(oneWord
))
2457 theCStem
= MorphologyCuts
[oneWord
];
2459 if ( theCStem
->Size() == 1)
2466 // For Stem, we keep it being
2470 for ( i
=1; i
<= theCStem
->Size(); i
++)
2472 onePiece
= theCStem
->GetPiece(i
).Display();
2473 if ( static_cast <int> (onePiece
.length()) >= StemSize
) //%%% length(); gives unsigned
2476 StemSize
=onePiece
.length();
2482 outWord
= QString("");
2483 for ( i
=1; i
<= theCStem
->Size(); i
++)
2485 onePiece
= theCStem
->GetPiece(i
).Display();
2487 if ( (i
== 1) && (i
== theCStem
->Size()))
2489 QMessageBox::information ( NULL
, "Debug", "Impossible Here!", "OK" );
2493 if (( i
== 1) && (i
!= StemLayer
))
2495 onePiece
= onePiece
+ "_-" + QString("%1").arg(StemLayer
-i
);
2496 outWord
= outWord
+ onePiece
+ " ";
2498 else if ( (i
== theCStem
->Size()) && (i
!= StemLayer
))
2500 onePiece
= onePiece
+ "_+" + QString("%1").arg(i
-StemLayer
);
2501 outWord
= outWord
+ onePiece
;
2503 else if ( (i
== theCStem
->Size()) && (i
== StemLayer
))
2505 outWord
= outWord
+ onePiece
;
2509 if ( i
== StemLayer
)
2511 outWord
= outWord
+ onePiece
+ " ";
2515 if ( i
> StemLayer
)
2517 onePiece
= onePiece
+ "_+" + QString("%1").arg(i
- StemLayer
);
2518 outWord
= outWord
+ onePiece
+ " ";
2522 onePiece
= onePiece
+ "_-" + QString("%1").arg(StemLayer
-i
);
2523 outWord
= outWord
+ onePiece
+ " ";
2559 // FindStringEditDistance
2560 void CTemplateCollection::FindAllEditDistances(
2561 CLexicon
* MyLexicon
, CWordCollection
* MyWords
)
2563 linguistica::ui::status_user_agent
& status
= MyLexicon
->status_display();
2565 int MinimumSize
= 5;
2566 int ScoreThreshold
= 8;
2567 int MaximumNumberOfLetterDifferences
= 9;
2568 int MinimumNumberOfCommonLetters
= 5;
2571 CParse Substitution
;
2574 QMap
<QString
, CTemplate
*> Templates
;
2577 if (MyLexicon
->LogFileOn() && MyLexicon
->GetLogFileStream())
2578 *MyLexicon
->GetLogFileStream() << endl
<<
2579 "<h3 class=blue>" << "String comparisons" <<
2581 StartTable
<< StartTableRow
<<
2582 MakeTableHeader("String 1") <<
2583 MakeTableHeader("String 2") <<
2584 MakeTableHeader("something else") <<
2587 const int NumberOfWords
= MyWords
->GetCount();
2588 const int TotalNumber
= NumberOfWords
- 1;
2589 status
.major_operation
= "StringEdit:FindAllEditDistances";
2590 status
.progress
.clear();
2591 status
.progress
.set_denominator(TotalNumber
);
2592 // loop through all members of the collection.
2593 for (int i
= 0; i
< NumberOfWords
- 1; i
++) {
2594 status
.progress
= i
;
2595 pWord
= MyWords
->GetAt(i
);
2597 if (pWord
->GetKeyLength() < MinimumSize
)
2600 for (int j
= i
+1; j
< NumberOfWords
; j
++) {
2601 qWord
= MyWords
->GetAt(j
);
2602 if (qWord
->GetKeyLength() < MinimumSize
)
2604 // Our tests to see if these two words,
2606 // are similar enough
2607 // to be worth testing with string edit distance
2608 int Overlap
= OverlapOfTwoAlphabetizedLists(
2609 pWord
->GetAlphabetizedForm(),
2610 qWord
->GetAlphabetizedForm());
2611 if (Overlap
< MinimumNumberOfCommonLetters
)
2613 int Diff
= DifferencesOfTwoAlphabetizedLists(
2614 pWord
->GetAlphabetizedForm(),
2615 qWord
->GetAlphabetizedForm());
2616 if (Diff
> MaximumNumberOfLetterDifferences
)
2620 std::auto_ptr
<CAlignment
> pAlignment(
2621 new CAlignment(pWord
, qWord
));
2622 Score
= pAlignment
->FindStringEditDistance();
2624 if (Score
< ScoreThreshold
&& pAlignment
->m_Slips
== 1) {
2625 Substitution
= pAlignment
->FindSubstitution();
2626 Context
= pAlignment
->FindContext();
2627 // Substitution = pAlignment->FindSubstitution();
2628 if (MyLexicon
->LogFileOn())
2629 *MyLexicon
->GetLogFileStream() << endl
<<
2631 TableData(Substitution
.GetPiece(1).Display()) <<
2632 TableData(Context
) <<
2635 TableData(Substitution
.GetPiece(2).Display()) <<
2637 QMap
<QString
, CTemplate
*>::iterator iter
=
2638 Templates
.find(Context
.GetKey().Display());
2639 if (iter
!= Templates
.end()) {
2640 CTemplate
* pTemplate
= *iter
;
2641 pTemplate
->AddToColumn(Substitution
, pTemplate
->GetVerticalColumn());
2642 pTemplate
->IncrementCorpusCount(1);
2643 pTemplate
->AddAlignment(pAlignment
.get());
2645 if (Lexicon
->LogFileOn())
2646 *Lexicon
->GetLogFile() <<
2648 "Already present " <<
2652 std::auto_ptr
<CTemplate
> new_template(
2653 new CTemplate(pAlignment
.get()));
2654 CTemplate
* pTemplate
= new_template
.get();
2656 Templates
.insert(Context
.GetKey().Display(),
2657 new_template
.release());
2658 pTemplate
->SetCorpusCount(1);
2659 pTemplate
->AddAlignment(pAlignment
.get());
2664 status
.progress
.clear();
2666 // Now go through the templates, and add them to the real collection.
2667 QMap
<QString
, CTemplate
*>::Iterator QStringToTemplateIt
;
2668 for (QMap
<QString
, CTemplate
*>::const_iterator iter
=
2669 Templates
.constBegin();
2670 iter
!= Templates
.constEnd();
2672 QString Key
= iter
.key();
2673 CTemplate
* pTemplate
= iter
.value();
2676 CTemplate
* qTemplate
= AddTemplate(pTemplate
);
2677 qTemplate
->SetCorpusCount(pTemplate
->GetCorpusCount());
2681 status
.major_operation
.clear();
2682 if (MyLexicon
->LogFileOn())
2683 *MyLexicon
->GetLogFileStream() << endl
<< EndTable
;