CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / TemplateCollection.cpp
bloba4cd9337b934532ecd3631a1f5440336848a53d9
1 // Implementation of CTemplateCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "TemplateCollection.h"
5 #include <memory>
6 #include <Q3TextStream>
7 #include <QMessageBox>
8 #include <Q3ListView>
9 #include <QIODevice>
10 #include <QFile>
11 #include "ui/Status.h"
12 #include "Lexicon.h"
13 #include "Alignment.h"
14 #include "Template.h"
15 #include "WordCollection.h"
16 #include "HTML.h"
17 #include "log2.h"
19 CTemplateCollection::CTemplateCollection()
20 : m_AllWordsAndParses(),
21 m_TotalGlobalStickNess(0.0),
22 m_GlobalStickNess2(),
23 m_GlobalNodeStickNess2(),
24 m_TotalGlobalStickNess2(0.0),
25 m_TotalWord2(0.0),
26 m_NumberOfDeletedTemplates(0) { }
// Destructor.
// NOTE(review): OutputTemplatesForGoldStand() allocates m_AllWordsAndParses
// with new, and UpdateGlobalStickNess2() stores new-allocated StringToFloat
// maps as the values of m_GlobalStickNess2 -- confirm that both are released
// here.
28 CTemplateCollection::~CTemplateCollection(void)
32 void CTemplateCollection::ListDisplay(
33 Q3ListView* pView, linguistica::ui::status_user_agent& status)
35 pView->setRootIsDecorated(false);
36 pView->setSorting(1);
38 // Remove all previous columns.
39 while (pView->columns() != 0)
40 pView->removeColumn(0);
42 // Add Column headers.
43 pView->addColumn("TemplateNumber");
44 pView->addColumn("NumberOfColumns");
45 pView->addColumn("Complexity");
46 pView->addColumn("Layer1");
47 pView->addColumn("Layer2");
48 pView->addColumn("Layer3");
49 pView->addColumn("Layer4");
51 // Column three or four gets really wide, so
52 // limit the width to 180.
53 pView->setColumnWidthMode(2, Q3ListView::Manual);
54 pView->setColumnWidth(2, 180);
56 pView->setColumnWidthMode(3, Q3ListView::Manual);
57 pView->setColumnWidth(3, 180);
59 pView->setColumnWidthMode(4, Q3ListView::Manual);
60 pView->setColumnWidth(4, 180);
62 pView->setColumnWidthMode(5, Q3ListView::Manual);
63 pView->setColumnWidth(5, 180);
65 status.major_operation = "Displaying Templates";
66 status.progress.clear();
67 const int TotalNumber = GetCount();
68 status.progress.set_denominator(TotalNumber);
69 for (int i = 0; i < TotalNumber; ++i) {
70 CTemplate* pTemplate = GetAt(i);
71 pTemplate->ListDisplay(pView);
72 status.progress = i;
74 status.progress.clear();
75 status.major_operation.clear();
78 CTemplate* CTemplateCollection::AddTemplate( CTemplate* pTemplate)
81 CTemplate* qTemplate;
82 CNode* pTerminal;
83 int Result;
85 QString SpelledOutTemplate = pTemplate->Display(); // yuhuask Problematic ?
87 pTerminal = Insert (CStringSurrogate(SpelledOutTemplate), &Result);
88 if ( Result == 1)
90 qTemplate = new CTemplate( * pTemplate);
91 pTerminal->SetPointer (qTemplate);
93 else
95 qTemplate =(CTemplate*) pTerminal->Get_T_Pointer();
98 IncrementCorpusCount(1);
99 qTemplate->IncrementCorpusCount(1);
101 m_SortValidFlag = FALSE;
102 m_HashHasChangedFlag = TRUE;
104 return qTemplate;
110 CTemplate* CTemplateCollection::AddToCollection(CAlignment* pAlign)
112 int Result;
113 CNode* pTerminal = Insert(
114 CStringSurrogate(pAlign->SpellOut()),
115 &Result);
117 std::auto_ptr<CTemplate> pTemplate;
118 if (Result == 1) {
119 pTemplate = std::auto_ptr<CTemplate>(new CTemplate(pAlign));
120 pTemplate->SetIndex(GetCount() - 1);
121 pTerminal->SetPointer(pTemplate.get());
122 } else {
123 pTemplate = std::auto_ptr<CTemplate>(
124 static_cast<CTemplate*>(pTerminal->Get_T_Pointer()));
127 IncrementCorpusCount(1);
128 pTemplate->IncrementCorpusCount(1);
130 m_SortValidFlag = false;
131 m_HashHasChangedFlag = true;
133 return pTemplate.release();
// Conflates the templates of InputCollection with the templates already in
// this collection and with each other.
//
// For every input template: each stored template for which ShouldConflate()
// holds absorbs it through ConflateWith(). If no stored template did, the
// first later input template that should conflate is merged into it and the
// merged template is added to this collection with AddTemplate().
// ShouldConflate() receives Column as an argument and the same variable is
// then passed to ConflateWith().
136 void CTemplateCollection::CheckForConflations
137 ( CTemplateCollection* InputCollection )
139 int Column;
140 bool BreakFlag;
141 CTemplateCollection TemporaryHoldings;
143 QString debugstring1, debugstring2;
144 // const char* CCDebugString1, *CCDebugString2;
145 // int debugint;
147 int i;
148 for (i = 0; i < InputCollection->GetCount(); i++)
150 CTemplate* pTemplate = InputCollection->GetAt(i);
151 BreakFlag = FALSE;
// First pass: try to fold the input template into templates already stored.
153 int j;
154 for ( j = 0; j < GetCount(); j++)
156 CTemplate* qTemplate = GetAt(j);
157 if ( pTemplate->ShouldConflate(qTemplate, Column) )
159 //debugstring1 = pTemplate ->Display();
160 //CCDebugString1 = debugstring1.ascii();
161 //debugstring2 = qTemplate->Display();
162 //CCDebugString2 = debugstring2.ascii();
163 //debugint = 1;
165 qTemplate->ConflateWith( pTemplate, Column ) ;
166 BreakFlag = TRUE;
// An input template that was absorbed above is not compared with its peers.
171 if (BreakFlag) { continue; }
// Second pass: look for a partner among the input templates that follow.
173 for (j = i+1; j < InputCollection->GetCount(); j++)
176 CTemplate* qTemplate = InputCollection->GetAt(j);
177 if ( pTemplate->ShouldConflate(qTemplate, Column) )
181 //debugstring1 = pTemplate ->Display();
182 //CCDebugString1 = debugstring1.ascii();
183 //debugstring2 = qTemplate->Display();
184 //CCDebugString2 = debugstring2.ascii();
185 //debugint = 1;
188 pTemplate->ConflateWith( qTemplate, Column );
189 this->AddTemplate ( pTemplate );
190 break;
196 // Keep in mind, for each column, the morpheme can be repeated
197 // e.g. b#g#_ettingbradybrady_;
201 //################################################
202 // Debug and test, should be no useful for release
// NOTE(review): the loop below uses CCDebugString1 and debugint, whose
// declarations are commented out at the top of the function -- presumably
// this section is disabled; confirm.
206 for (i = 0; i < GetCount(); i++)
208 debugstring1 = GetAt(i)->Display();
209 CCDebugString1 = debugstring1.ascii();
210 debugint = 1;
215 //################################################
216 // ReAdjust - Pre-edit-1
// Repeats Readjust() over all stored templates, adding whatever each round
// puts into TemporaryHoldings, until a round produces nothing new.
// NOTE(review): confirm whether this section is still active.
220 do {
222 TemporaryHoldings.Empty();
223 for (i = 0; i < GetCount(); i++)
225 GetAt(i)->Readjust( &TemporaryHoldings );
227 for (i = 0; i < TemporaryHoldings.GetCount(); i++)
229 this->AddTemplate( TemporaryHoldings[i] );
232 while (TemporaryHoldings.GetCount() > 0 );
235 //*******************************************
236 /* Get PolyMorpheme
237 CWordCollection Words, NewWords, TempWords;
238 CPolyMorpheme* pWord;
240 for (i = 0; i < GetCount(); i++)
242 CTemplate* pTemplate = GetAt(i);
243 Words.Empty();
246 for (int j = 1; j <= pTemplate->GetColumn(0)->Size(); j++)
247 { Words << pTemplate->GetColumn(0)->GetAt_SS(j) ; }
249 for (int col = 1; col < pTemplate->GetNumberOfColumns(); col++)
251 NewWords.Empty();
253 for (int row = 1; row <= (int) pTemplate->GetColumn(col)->Size(); row++)
255 TempWords = Words;
257 CSS ss = pTemplate->GetColumn(col)->GetAt_SS(row);
258 if (ss == CSS("NULL") )
261 } else
263 TempWords.SuffixToAllWords ( pTemplate->GetColumn(col)->GetAt_SS(row) );
265 NewWords.AddWordCollection( TempWords );
267 Words = NewWords;
270 for (int k = 0; k < (int) Words.GetCount(); k++)
272 pWord = *Lexicon->GetPolyWords() << Words.GetAt(k);
273 pWord->CopyParse ( *(CParse*)Words.GetAt(k) );
284 void CTemplateCollection::OutputTemplatesForGoldStand()
286 StringToParse OneWordsAndParses;
287 StringToParse::Iterator StringToParseIt;
288 QString TheWord;
289 CParse* TheParse;
290 CParse* OneParse;
291 QString SSDisplay;
294 if ( m_AllWordsAndParses != NULL) delete m_AllWordsAndParses;
296 m_AllWordsAndParses = new StringToParse();
298 //QMessageBox::information( NULL, "Debug", "Run Here -1", "OK" );
300 for (int i = 0; i < GetCount(); i++)
302 CTemplate* pTemplate = GetAt(i);
304 OneWordsAndParses.clear();
305 pTemplate ->GetWordsAndParses(OneWordsAndParses);
308 for ( StringToParseIt = OneWordsAndParses.begin(); StringToParseIt != OneWordsAndParses.end(); StringToParseIt++)
310 TheWord = StringToParseIt.key();
311 TheParse = StringToParseIt.data();
314 if ( m_AllWordsAndParses ->contains(TheWord))
316 OneParse = (*m_AllWordsAndParses)[TheWord];
317 OneParse ->yuhuMergeParse(TheParse);
319 else
321 m_AllWordsAndParses ->insert(TheWord, TheParse);
// Writes the collection to the text file FileName: first the template count,
// then, in TEMPLATE_SORT order, each template's OutputForFile() text
// followed by four statistics lines (number of columns, complexity, words
// complexity, sorting quantity).
// NOTE(review): the statistics lines use pTemplate, which is declared inside
// the loop -- presumably they belong to the loop body; confirm.
// NOTE(review): ReadTemplateFile() discards 10 lines after each template
// while only the lines below are written here -- confirm the two formats
// still agree.
337 void CTemplateCollection::OutputTemplates ( LPCTSTR FileName)
339 ofstream outf(FileName);
340 outf << setiosflags (ios::left);
341 outf << GetCount() ;
343 Sort (TEMPLATE_SORT);
344 for (int i = 0; i < GetCount(); i++)
346 CTemplate* pTemplate = GetAtSort(i);
347 pTemplate->OutputForFile ( outf );
348 outf << endl;
351 outf << "Number of columns: "<< pTemplate->GetNumberOfColumns() << endl;
352 outf << "Complexity: " << pTemplate->GetComplexity() << endl;
353 outf << "Words complexity: " << pTemplate->GetWordsTotalComplexity() << endl;
354 outf << "Sorting quantity: " << pTemplate->GetSortingQuantity() << endl<< endl;
357 outf.close();
// Reads templates from the text file FileName and adds each of them to this
// collection with AddTemplate().
//
// Layout consumed, as far as the code below shows: the first line holds the
// number of templates (parsed with atoi). Each template starts with a line
// that CParse::Collapse() splits into one piece per column; further lines
// add more pieces to the columns until a line collapses to length 0; then
// 10 lines are read and discarded.
//
// NOTE(review): pParse is allocated with new and no matching delete is
// visible. pTemplate is not freed after AddTemplate() either, and
// AddTemplate() stores its own copy when the template is new -- confirm
// whether both are leaks.
// NOTE(review): the final skip loop reuses `i` outside the for statement
// that declares it (pre-standard for-scope rules) -- confirm this function
// still compiles or is disabled.
360 void CTemplateCollection::ReadTemplateFile( LPCTSTR FileName)
362 const int bufferSize = 256;
363 char buffer[bufferSize];
365 CTemplate* pTemplate;
366 CParse* pParse= new CParse();
367 ifstream inf(FileName);
368 int size = 0,
369 Count = 0,
370 NumberOfColumns = 0;
// Header line: number of templates that follow.
371 inf.getline(buffer, bufferSize, '\n');
372 size = atoi (buffer);
374 while (inf && Count < size)
377 inf.getline(buffer, bufferSize, '\n');
378 pParse->Collapse(buffer);
// A line without pieces is skipped without counting as a template.
379 if ( pParse->Size() <= 0) { continue; }
// The first non-empty line fixes the number of columns and seeds each column
// with one piece.
381 NumberOfColumns = pParse->Size();
382 pTemplate = new CTemplate (NumberOfColumns);
383 for (int i = 1; i<= (int) pParse->Size(); i++)
385 pTemplate->AddToColumn( pParse->GetAt_SS(i), i-1);
// Following lines add further pieces until a line collapses to length 0.
387 while (inf)
389 inf.getline (buffer, bufferSize, '\n');
390 pParse->Collapse(buffer);
391 if ( pParse->Length() == 0 )
393 break;
396 for (int i = 1; i<= (int) pParse->Size(); i++)
// Pieces that start with '-' are not added to their column.
398 if ( pParse->GetAt_SS(i).FirstChar() == '-' ) { continue;}
399 pTemplate->AddToColumn( pParse->GetAt_SS(i), i-1);
402 AddTemplate ( pTemplate );
403 // there are 10 lines with stuff we don't read.
404 for (i = 0; i < 10; i++) { inf.getline(buffer, bufferSize, '\n'); }
407 Count++;
411 int CompareTemplateColumn1 (const void *pA, const void *pB)
413 CTemplate* pS1=*(CTemplate**) pA;
414 CTemplate* pS2=*(CTemplate**) pB;
416 int Value1 = pS1->GetColumn(0)->SpellOut().Compare(pS2->GetColumn(0)->SpellOut() );
417 if ( Value1 != 0 ) return Value1;
419 int Value2 = pS1->GetColumn(1)->SpellOut().Compare(pS2->GetColumn(1)->SpellOut() );
420 return Value2;
422 // int Value3 = pS1->GetColumn(3)->SpellOut().Compare(pS2->GetColumn(3)->SpellOut() );
423 // if ( Value2 != 0 ) return Value2;
426 int CompareTemplateColumn2 (const void *pA, const void *pB)
428 CTemplate* pS1=*(CTemplate**) pA;
429 CTemplate* pS2=*(CTemplate**) pB;
431 if ( pS1->GetColumn(1) == NULL )
433 if ( pS2->GetColumn(1) == NULL )
435 return 0;
437 return -1;
439 if ( pS2->GetColumn(1) == NULL )
441 return -1;
445 int Value1 = pS1->GetColumn(1)->SpellOut().Compare(pS2->GetColumn(1)->SpellOut() );
446 if ( Value1 != 0 ) return Value1;
448 int Value2 = pS1->GetColumn(0)->SpellOut().Compare(pS2->GetColumn(0)->SpellOut() );
450 return Value2;
452 // int Value3 = pS1->GetColumn(3)->SpellOut().Compare(pS2->GetColumn(3)->SpellOut() );
453 // if ( Value2 != 0 ) return Value2;
458 void CTemplateCollection::SortTemplates (eSortStyle SortStyle, int ColumnNumber )
460 if (SortStyle != TEMPLATE_SORT_COLUMN) { return; }
462 int Size = GetCount();
464 if (SortArray) { delete [] SortArray; }
466 SortArray = new CTemplate*[ Size ];
468 double* ValueArray = new double [ Size ];
471 for (int i = 0; i < Size; i++)
473 SortArray[i] = GetAt( i );
476 // Fix this -- make it general, not case by case
477 switch ( ColumnNumber )
479 case 1:
481 qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn1);
482 break;
484 case 2:
486 qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn2);
487 break;
489 case 3:
491 // qsort((void*) SortArray, Size, sizeof(CTemplate*), CompareTemplateColumn3);
492 break;
496 m_SortValidFlag = true;
498 m_SortStyle = SortStyle;
506 /* This function looks for templates with 3 columns in which most of the letters --
507 that is, the lexical heads -- are in the 3rd column.
511 void CTemplateCollection::FindPrefixingTemplates( CTemplateCollection* PrefixingTemplates )
513 CTemplate* pTemplate;
514 double Threshold = 0.2,
515 NumberOfLetters,
516 FirstColumnNL = 0,
517 SecondColumnNL = 0;
519 for (int i = 0; i < (int) GetCount(); i++)
521 pTemplate = GetAt(i);
522 if ( pTemplate->GetNumberOfColumns() < 3 ) { continue; }
523 NumberOfLetters = pTemplate->GetNumberOfLetters();
524 if ( (double) pTemplate->GetColumn(0)->GetLength() / NumberOfLetters < Threshold &&
525 (double) pTemplate->GetColumn(1)->GetLength() / NumberOfLetters < Threshold
528 PrefixingTemplates->AddTemplate( pTemplate );
541 eAffixationType CTemplateCollection::FindAffixationSide()
543 int HowManyTemplates = 10;
544 double nPrefix = 0,
545 nSuffix = 0,
546 nUnknown = 0;
547 double Threshold = 0.6;
549 Sort (TEMPLATE_SORT );
550 for (int i = 0; i < HowManyTemplates && i < GetCount(); i++)
552 switch (GetAtSort(i)->DetermineAffixationSide() )
554 case (TYPE_Suffix):
556 nSuffix++;
557 break;
559 case (TYPE_Prefix):
561 nPrefix++;
562 break;
564 case (TYPE_Unknown):
566 nUnknown++;
567 break;
573 int Total = int( nPrefix + nSuffix + nUnknown );
574 if (nPrefix / Total > Threshold )
575 return TYPE_Prefix;
576 if (nSuffix / Total > Threshold )
577 return TYPE_Suffix;
578 return TYPE_Unknown;
586 void CTemplateCollection::AddToCollections( CStemCollection* SuffixStems,
587 CStemCollection* PrefixStems,
588 CSuffixCollection* Suffixes,
589 CPrefixCollection* Prefixes,
590 CSignatureCollection* PrefixSignatures,
591 CSignatureCollection* SuffixSignatures )
594 eAffixationType Type;
595 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << endl << endl << "Adding templates to collections" << endl ; }
596 for (int i = 0; i < GetCount(); i++)
599 Type = GetAt(i)->DetermineAffixationSide();
600 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << endl << GetAt(i)->SpellOut(); }
601 if ( Type == TYPE_Suffix )
603 GetAt(i)->AddToCollections ( Type, SuffixStems, (CAffixCollection*) Suffixes, SuffixSignatures );
604 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << "\t Suffix"; }
607 if ( Type == TYPE_Prefix )
609 GetAt(i)->AddToCollections ( Type, PrefixStems, (CAffixCollection*) Prefixes, PrefixSignatures );
610 if ( Lexicon->GetLogFile() ) { *Lexicon->GetLogFile() << "\t Prefix"; }
620 void CTemplateCollection::UpdateGlobalStickNess2()
623 int i, column;
624 int piecei, piecej;
625 QString Morphemei, Morphemej;
626 QString HostMorpheme, SlaveMorpheme;
627 QString PreviousMorpheme, CurrentMorpheme;
628 QString oneMorpheme;
629 CParse* oneColumn;
630 float myComplexity;
631 StringToFloat* oneCollection;
632 StringToFloat::Iterator StringToFloatIt;
633 StringToStringToFloat::Iterator StringToStringToFloatIt;
634 StringToInt AllMorphemes;
635 StringToInt MorphemesInThisTemplate;
636 StringToInt::Iterator StringToIntIt;
637 float oneFloat;
638 int NumberOfWordsInThisTemplate;
639 int TotalWords;
640 QString dummystring;
643 QFile file( "paragmatic.txt" );
645 if ( !file.open( QIODevice::WriteOnly ) )
647 return;
650 Q3TextStream logf( &file );
652 Sort (TEMPLATE_SORT);
655 // Clear m_GlobalStickNess2;
656 for (StringToStringToFloatIt = m_GlobalStickNess2.begin(); StringToStringToFloatIt != m_GlobalStickNess2.end(); StringToStringToFloatIt++)
658 oneCollection = StringToStringToFloatIt.data();
659 delete oneCollection;
663 // Clear m_GlobalStickNess2 and m_GlobalNodeStickNess2
664 m_GlobalStickNess2.clear();
665 m_GlobalNodeStickNess2.clear();
669 AllMorphemes.clear();
670 m_TotalGlobalStickNess2 = 0.0;
672 TotalWords = 0;
674 for (i = 0; i < GetCount(); i++) {
675 CTemplate* pTemplate = GetAtSort(i);
677 if ( pTemplate ->m_IsDeleted) continue;
680 //myComplexity = pTemplate ->GetSortingQuantity(); in this function, we take number of words as robustness
682 MorphemesInThisTemplate.clear();
684 NumberOfWordsInThisTemplate = 1;
685 for ( column =0; column < pTemplate ->m_NumberOfColumns; column++)
687 oneColumn = pTemplate -> m_Columns[column];
688 NumberOfWordsInThisTemplate = NumberOfWordsInThisTemplate * oneColumn ->Size();
691 TotalWords += NumberOfWordsInThisTemplate;
692 myComplexity = (float) NumberOfWordsInThisTemplate;
695 for ( column =0; column < pTemplate ->m_NumberOfColumns; column++)
697 oneColumn = pTemplate -> m_Columns[column];
699 if ( oneColumn ->Size() <= 1)
701 Morphemei = oneColumn ->GetPiece(1).Display();
703 if ( column ==0)
705 Morphemei = Morphemei.right(Morphemei.length() -1);
707 if ( Morphemei.length() ==0)
709 Morphemei = TheStringNULL;
714 AllMorphemes.insert(Morphemei, 1);
715 MorphemesInThisTemplate.insert(Morphemei, 1);
716 continue;
719 for ( piecei = 1; piecei <= oneColumn ->Size() -1; piecei++)
721 Morphemei = oneColumn ->GetPiece(piecei).Display();
723 if ( column ==0)
725 Morphemei = Morphemei.right(Morphemei.length() -1);
727 if ( Morphemei.length() ==0)
729 Morphemei = TheStringNULL;
733 AllMorphemes.insert(Morphemei, 1);
734 MorphemesInThisTemplate.insert(Morphemei, 1);
736 for ( piecej = piecei + 1; piecej <= oneColumn ->Size(); piecej++)
738 Morphemej = oneColumn ->GetPiece(piecej).Display();
740 if ( column ==0)
742 Morphemej = Morphemej.right(Morphemej.length() -1);
744 if ( Morphemej.length() ==0)
746 Morphemej = TheStringNULL;
751 AllMorphemes.insert(Morphemej, 1);
752 MorphemesInThisTemplate.insert(Morphemej, 1);
755 if ( Morphemei > Morphemej)
757 HostMorpheme = Morphemei;
758 SlaveMorpheme = Morphemej;
760 else
762 HostMorpheme = Morphemej;
763 SlaveMorpheme = Morphemei;
766 if ( m_GlobalStickNess2.contains(HostMorpheme))
768 oneCollection = m_GlobalStickNess2[HostMorpheme];
770 if ( oneCollection ->contains(SlaveMorpheme))
772 (*oneCollection)[SlaveMorpheme] += myComplexity;
773 m_TotalGlobalStickNess2 += myComplexity;
776 else
778 oneCollection ->insert(SlaveMorpheme, (float)(myComplexity));
779 m_TotalGlobalStickNess2 += myComplexity;
783 else
785 oneCollection = new StringToFloat();
786 m_GlobalStickNess2.insert(HostMorpheme, oneCollection);
788 oneCollection ->insert(SlaveMorpheme, (float)(myComplexity));
789 m_TotalGlobalStickNess2 += myComplexity;
808 // Get the Vertex stickness for this template
810 for ( StringToIntIt = MorphemesInThisTemplate.begin(); StringToIntIt != MorphemesInThisTemplate.end(); StringToIntIt++)
812 oneMorpheme = StringToIntIt.key();
814 if ( m_GlobalNodeStickNess2.contains(oneMorpheme))
816 m_GlobalNodeStickNess2[oneMorpheme] += myComplexity;
818 else
820 m_GlobalNodeStickNess2.insert(oneMorpheme,(float)(myComplexity));
829 // Finally, Calculate the -log(prob) of edge stickness and vertex stickness
830 float LogValue;
832 logf << "********The Paragmatic Edge StickNess******"<<endl<<endl;
834 for ( StringToStringToFloatIt = m_GlobalStickNess2.begin(); StringToStringToFloatIt != m_GlobalStickNess2.end(); StringToStringToFloatIt++)
836 HostMorpheme = StringToStringToFloatIt.key();
837 oneCollection = StringToStringToFloatIt.data();
840 for ( StringToFloatIt = oneCollection ->begin(); StringToFloatIt != oneCollection ->end(); StringToFloatIt++)
842 SlaveMorpheme = StringToFloatIt.key();
843 oneFloat = StringToFloatIt.data();
845 LogValue = -base2log((oneFloat)/m_TotalGlobalStickNess2);
846 (*oneCollection)[SlaveMorpheme] = LogValue;
848 // Log Info
849 logf << HostMorpheme << "-" << SlaveMorpheme << " : "<<LogValue << endl;
857 logf <<endl<<endl<<endl;
860 logf << "********The Paragmatic Node StickNess******"<<endl<<endl;
863 for ( StringToFloatIt = m_GlobalNodeStickNess2.begin(); StringToFloatIt != m_GlobalNodeStickNess2.end(); StringToFloatIt++)
865 oneMorpheme = StringToFloatIt.key();
866 oneFloat = StringToFloatIt.data();
868 LogValue = -base2log((oneFloat)/(float)TotalWords);
869 m_GlobalNodeStickNess2[oneMorpheme] = LogValue;
871 // Log Info
872 logf << oneMorpheme << " : " << LogValue << endl;
875 m_TotalWord2 = (float)TotalWords;
877 logf <<endl;
880 file.close();
886 void CTemplateCollection::AdjustTemplatesByMovingCommonTailOrHead2(int Loopi)
888 // Parameter
889 const int MaximumSizeOfEachColumn = 3;
890 const int MaximumNumberOfMovingLetters = 3;
891 bool PrintChangedTemplates = true;
893 Sort (TEMPLATE_SORT);
896 // Check Through each Template Once
897 for (int i = 0; i < GetCount(); i++) {
898 CTemplate* pTemplate = GetAtSort(i);
900 if (pTemplate ->m_IsDeleted) continue;
903 if ( i == GetCount() -1)
905 QMessageBox::information (NULL, "Debug", QString("Run Template %1").arg(i), "OK");
909 pTemplate ->AdjustMeByMovingCommonTailOrHead2(m_GlobalNodeStickNess2,MaximumSizeOfEachColumn, \
910 MaximumNumberOfMovingLetters,PrintChangedTemplates, Loopi, true, m_GlobalStickNess2, m_TotalGlobalStickNess2, m_TotalWord2);
916 bool CTemplateCollection::CollapseAlgorithm1(int loopnumber)
921 // Parameters
922 int MinimumSizeOfStemColumn = 5;
923 int OnlyConsiderTemplatesWithSlotNumber =3;
924 int MaximumSymmetricError = 2;
925 int MinimumCommonStem =2;
926 int MaximumOutputMorphemesInOneColumn=10000;
927 bool PrintCollapsedTemplates = true;
928 bool PrintCreatedWords = true;
929 bool DisplayOldDeletedTemplates = false;
931 int i,j;
932 CTemplate* qTemplate;
933 int column;
934 CParse* pLeftColumn, *pRightColumn, *pStemColumn;
935 CParse* qLeftColumn, *qRightColumn, *qStemColumn;
936 int pIndexOfFirstOfNonStemColumns,pIndexOfStemColumn;
937 int qIndexOfFirstOfNonStemColumns,qIndexOfStemColumn;
938 bool pShouldConsiderMe;
939 bool qShouldConsiderMe;
940 bool CollapsedAnyOne;
941 bool AnyChanged ;
943 AnyChanged = false;
946 CollapsedAnyOne = true;
948 // Loop until no more templates are collpased
949 while ( CollapsedAnyOne)
952 // Check Through each Template for possible collapsing
953 for (i = 0; i < GetCount()-1; i++) {
954 CTemplate* pTemplate = GetAtSort(i);
955 pShouldConsiderMe = false;
956 CollapsedAnyOne = false;
958 if (pTemplate ->m_IsDeleted)
960 continue;
964 if (pTemplate ->m_NumberOfColumns != OnlyConsiderTemplatesWithSlotNumber)
966 continue;
970 for ( column =0; column < pTemplate ->m_NumberOfColumns -1; column++)
972 pLeftColumn = pTemplate ->m_Columns[column];
973 pRightColumn = pTemplate ->m_Columns[column+1];
974 pIndexOfFirstOfNonStemColumns = column;
976 if ( column ==0)
978 pStemColumn = pTemplate ->m_Columns[column+2];
979 pIndexOfStemColumn = column + 2;
981 else
983 pStemColumn = pTemplate ->m_Columns[0];
984 pIndexOfStemColumn =0;
987 if (( pLeftColumn ->Size() < MinimumSizeOfStemColumn) \
988 && ( pRightColumn ->Size() < MinimumSizeOfStemColumn)\
989 && ( pStemColumn ->Size() >= MinimumSizeOfStemColumn))
991 pShouldConsiderMe = true;
992 break;
996 if ( !pShouldConsiderMe)
998 continue;
1002 for (j = i+1; j < GetCount(); j++)
1004 qTemplate = GetAtSort(j);
1005 qShouldConsiderMe = false;
1007 if (qTemplate ->m_IsDeleted)
1009 continue;
1012 if (qTemplate ->m_NumberOfColumns != OnlyConsiderTemplatesWithSlotNumber)
1014 continue;
1018 for ( column =0; column < qTemplate ->m_NumberOfColumns -1; column++)
1021 if ( column != pIndexOfFirstOfNonStemColumns)
1023 continue;
1026 qLeftColumn = qTemplate ->m_Columns[column];
1027 qRightColumn = qTemplate ->m_Columns[column+1];
1028 qIndexOfFirstOfNonStemColumns = column;
1030 if ( column ==0)
1032 qStemColumn = qTemplate ->m_Columns[column+2];
1033 qIndexOfStemColumn = column + 2;
1035 else
1037 qStemColumn = qTemplate ->m_Columns[0];
1038 qIndexOfStemColumn =0;
1041 if (( qLeftColumn ->Size() < MinimumSizeOfStemColumn) \
1042 && ( qRightColumn ->Size() < MinimumSizeOfStemColumn)\
1043 && ( qStemColumn ->Size() >= MinimumSizeOfStemColumn))
1045 qShouldConsiderMe = true;
1046 break;
1050 if ( !qShouldConsiderMe)
1052 continue;
1055 //*******************************************************
1056 // Now the pTemplate and qTemplate are ready to compare
1057 // Creteria:
1058 // 1. One of the two non-stem columns is indentical
1059 // 2. Another non-stem column's symmetric error less than MaximumSymmetricError
1060 // 3. The # of common stems is bigger than MinimumCommonStem
1061 //********************************************************
1063 bool PassNonStemColumnTest;
1064 bool PassStemColumnTest;
1065 int NumberOfSymmetricError;
1066 int NumberOfCommonStems;
1067 int piecei;
1068 int outputi, outputj;
1069 QString TheMorpheme;
1070 int EqualNonStemColumn;
1071 int MergeNonStemColumn;
1072 CParse* pOneColumn, *qOneColumn;
1073 CStringSurrogate TempCSS;
1074 //ofstream outf;
1079 pLeftColumn ->Alphabetize();
1080 pRightColumn ->Alphabetize();
1082 qLeftColumn ->Alphabetize();
1083 qRightColumn ->Alphabetize();
1085 PassNonStemColumnTest = false;
1086 PassStemColumnTest = false;
1088 // Check whether LeftColumn is equal
1090 if ( (*pLeftColumn) == qLeftColumn )
1092 NumberOfSymmetricError =0;
1094 for (piecei =1; piecei<= pRightColumn ->Size(); piecei++)
1096 TheMorpheme = pRightColumn->GetPiece(piecei).Display();
1098 TempCSS = CStringSurrogate(TheMorpheme);
1099 if ( !qRightColumn ->Contains(TempCSS))
1101 NumberOfSymmetricError++;
1105 for (piecei =1; piecei<= qRightColumn ->Size(); piecei++)
1107 TheMorpheme = qRightColumn->GetPiece(piecei).Display();
1109 TempCSS = CStringSurrogate(TheMorpheme);
1110 if ( !pRightColumn ->Contains(TempCSS))
1112 NumberOfSymmetricError++;
1117 if (NumberOfSymmetricError <= MaximumSymmetricError)
1119 PassNonStemColumnTest = true;
1120 EqualNonStemColumn = pIndexOfFirstOfNonStemColumns;
1121 MergeNonStemColumn = pIndexOfFirstOfNonStemColumns + 1;
1123 else
1125 PassNonStemColumnTest = false;
1128 }else if ((*pRightColumn) == qRightColumn)
1130 NumberOfSymmetricError =0;
1132 for (piecei =1; piecei<= pLeftColumn ->Size(); piecei++)
1134 TheMorpheme = pLeftColumn->GetPiece(piecei).Display();
1136 TempCSS = CStringSurrogate(TheMorpheme);
1137 if ( !qLeftColumn ->Contains(TempCSS) )
1139 NumberOfSymmetricError++;
1143 for (piecei =1; piecei<= qLeftColumn ->Size(); piecei++)
1145 TheMorpheme = qLeftColumn->GetPiece(piecei).Display();
1147 TempCSS = CStringSurrogate(TheMorpheme);
1148 if ( !pLeftColumn ->Contains(TempCSS))
1150 NumberOfSymmetricError++;
1155 if (NumberOfSymmetricError <= MaximumSymmetricError)
1157 PassNonStemColumnTest = true;
1158 EqualNonStemColumn = pIndexOfFirstOfNonStemColumns +1;
1159 MergeNonStemColumn = pIndexOfFirstOfNonStemColumns;
1161 else
1163 PassNonStemColumnTest = false;
1168 if ( !PassNonStemColumnTest)
1170 continue;
1174 NumberOfCommonStems =0;
1176 for (piecei =1; piecei<= pStemColumn ->Size(); piecei++)
1178 TheMorpheme = pStemColumn->GetPiece(piecei).Display();
1180 TempCSS = CStringSurrogate(TheMorpheme);
1181 if ( qStemColumn ->Contains(TempCSS) )
1183 NumberOfCommonStems++;
1187 if ( NumberOfCommonStems >= MinimumCommonStem)
1189 PassStemColumnTest = true;
1191 else
1193 PassStemColumnTest = false;
1194 continue;
1198 // Now, we collapse these two template together
1199 CollapsedAnyOne = true;
1200 AnyChanged = true;
1201 if (!DisplayOldDeletedTemplates)
1203 m_NumberOfDeletedTemplates++;
1206 qTemplate ->m_IsDeleted = true;
1207 // Print the attempted templates
1208 if ( PrintCollapsedTemplates)
1210 QString DisplayOfOneColumn;
1211 int RealTrimSize;
1213 QFile file( "CollapseWithAlgorithm1.txt" );
1215 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append) )
1217 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1218 return false;
1221 Q3TextStream outf( &file );
1225 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1227 outf << "***************"<<loopnumber<<"********************" <<endl;
1228 outf << " Original P-Template:" << endl;
1229 outf << " ";
1231 for ( outputi = 0; outputi < pTemplate ->m_NumberOfColumns; outputi++)
1233 pOneColumn = pTemplate ->m_Columns[outputi];
1235 DisplayOfOneColumn = QString("{ ");
1237 RealTrimSize = MaximumOutputMorphemesInOneColumn;
1238 if ( pOneColumn ->Size() < RealTrimSize)
1240 RealTrimSize = pOneColumn ->Size();
1243 for (outputj = 1; outputj <= RealTrimSize;outputj++)
1245 DisplayOfOneColumn += pOneColumn ->GetPiece(outputj).Display();
1246 if ( outputj != RealTrimSize )
1248 DisplayOfOneColumn += QString(" , ");
1252 DisplayOfOneColumn += QString(" }");
1253 outf << DisplayOfOneColumn;
1254 if ( outputi != pTemplate ->m_NumberOfColumns -1)
1256 outf << " --> ";
1260 outf << endl << endl;
1262 outf << " Original q-Template:" << endl;
1263 outf << " ";
1265 for ( outputi = 0; outputi < qTemplate ->m_NumberOfColumns; outputi++)
1267 qOneColumn = qTemplate ->m_Columns[outputi];
1269 DisplayOfOneColumn = QString("{ ");
1271 RealTrimSize = MaximumOutputMorphemesInOneColumn;
1272 if ( qOneColumn ->Size() < RealTrimSize)
1274 RealTrimSize = qOneColumn ->Size();
1277 for (outputj = 1; outputj <= RealTrimSize;outputj++)
1279 DisplayOfOneColumn += qOneColumn ->GetPiece(outputj).Display();
1280 if ( outputj != RealTrimSize )
1282 DisplayOfOneColumn += QString(" , ");
1286 DisplayOfOneColumn += QString(" }");
1287 outf << DisplayOfOneColumn;
1288 if ( outputi != qTemplate ->m_NumberOfColumns -1)
1290 outf << " --> ";
1294 outf << endl << endl;
1296 file.close();
1300 // To Print out the new created words, Keep old words
1301 StringToParse OneWordsAndParses;
1302 StringToParse OldAllWordsAndParses;
1303 StringToParse NewAllWordsAndParses;
1304 StringToParse::iterator StringToParseIt;
1305 QString TheWord;
1306 CParse* DumpParse;
1309 // Populate the old word list
1310 if ( PrintCreatedWords)
1312 OneWordsAndParses.clear();
1313 pTemplate ->GetWordsAndParses(OneWordsAndParses);
1315 for ( StringToParseIt = OneWordsAndParses.begin(); StringToParseIt != OneWordsAndParses.end(); StringToParseIt++)
1317 TheWord = StringToParseIt.key();
1318 DumpParse = StringToParseIt.data();
1320 delete DumpParse;
1322 if ( !OldAllWordsAndParses.contains(TheWord))
1324 OldAllWordsAndParses.insert(TheWord, NULL);
1330 OneWordsAndParses.clear();
1331 qTemplate ->GetWordsAndParses(OneWordsAndParses);
1334 for ( StringToParseIt = OneWordsAndParses.begin(); StringToParseIt != OneWordsAndParses.end(); StringToParseIt++)
1337 TheWord = StringToParseIt.key();
1338 DumpParse = StringToParseIt.data();
1339 delete DumpParse;
1341 if ( !OldAllWordsAndParses.contains(TheWord))
1343 OldAllWordsAndParses.insert(TheWord, NULL);
1350 // First, merge not-equal non-stem column from qTemplate into pTemplate
1351 pOneColumn = pTemplate ->m_Columns[MergeNonStemColumn];
1352 qOneColumn = qTemplate ->m_Columns[MergeNonStemColumn];
1354 for (piecei =1; piecei<= qOneColumn ->Size(); piecei++)
1356 TheMorpheme = qOneColumn->GetPiece(piecei).Display();
1358 TempCSS = CStringSurrogate(TheMorpheme);
1359 if ( !pOneColumn ->Contains(TempCSS) )
1361 // pOneColumn ->AppendInAlphabeticalOrder(TempCSS);
1362 pOneColumn ->Append (TempCSS);
1369 // Second, merge the stem column from qTemplate into pTemplate
1370 pOneColumn = pTemplate ->m_Columns[pIndexOfStemColumn];
1371 qOneColumn = qTemplate ->m_Columns[qIndexOfStemColumn];
1373 for (piecei =1; piecei<= qOneColumn ->Size(); piecei++)
1375 TheMorpheme = qOneColumn->GetPiece(piecei).Display();
1377 TempCSS = CStringSurrogate(TheMorpheme);
1378 if ( !pOneColumn ->Contains(TempCSS) )
1380 // pOneColumn ->AppendInAlphabeticalOrder(TempCSS);
1381 pOneColumn ->Append (TempCSS);
1385 // Change the flag m_IsNewAfterCollapse1 and m_StemColumnInCollapse1
1386 pTemplate ->m_IsNewAfterCollapse1 = true;
1387 pTemplate ->m_StemColumnInCollapse1 = pIndexOfStemColumn;
1390 // Print the attempted templates
1391 if ( PrintCollapsedTemplates)
1393 QString DisplayOfOneColumn;
1394 int RealTrimSize;
1396 QFile file( "CollapseWithAlgorithm1.txt" );
1398 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append) )
1400 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1401 return false;
1404 Q3TextStream outf( &file );
1408 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1409 outf << " Collapsed -Template:" << endl;
1410 outf << " ";
1412 for ( outputi = 0; outputi < pTemplate ->m_NumberOfColumns; outputi++)
1414 pOneColumn = pTemplate ->m_Columns[outputi];
1416 DisplayOfOneColumn = QString("{ ");
1418 RealTrimSize = MaximumOutputMorphemesInOneColumn;
1419 if ( pOneColumn ->Size() < RealTrimSize)
1421 RealTrimSize = pOneColumn ->Size();
1424 for (outputj = 1; outputj <= RealTrimSize;outputj++)
1426 DisplayOfOneColumn += pOneColumn ->GetPiece(outputj).Display();
1427 if ( outputj != RealTrimSize )
1429 DisplayOfOneColumn += QString(" , ");
1433 DisplayOfOneColumn += QString(" }");
1434 outf << DisplayOfOneColumn;
1435 if ( outputi != pTemplate ->m_NumberOfColumns -1)
1437 outf << " --> ";
1441 outf << endl << endl;
1442 file.close();
1445 // Populate the new word list
1446 if ( PrintCreatedWords)
1448 OneWordsAndParses.clear();
1449 pTemplate ->GetWordsAndParses(OneWordsAndParses);
1452 for ( StringToParseIt = OneWordsAndParses.begin(); StringToParseIt != OneWordsAndParses.end(); StringToParseIt++)
1454 TheWord = StringToParseIt.key();
1455 DumpParse = StringToParseIt.data();
1456 delete DumpParse;
1458 if ( !NewAllWordsAndParses.contains(TheWord))
1460 NewAllWordsAndParses.insert(TheWord, NULL);
1468 // Print out the new created words
1469 if ( PrintCreatedWords)
1471 QFile file( "CollapseWithAlgorithm1.txt" );
1473 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append) )
1475 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1476 return false;
1479 Q3TextStream outf( &file );
1481 //outf.open ("CollapseWithAlgorithm1.txt", ofstream::out | ofstream::app);
1482 outf << " New-Created Words:" << endl;
1483 outf << " " << "{";
1485 for ( StringToParseIt = NewAllWordsAndParses.begin(); StringToParseIt != NewAllWordsAndParses.end(); StringToParseIt++)
1488 TheWord = StringToParseIt.key();
1490 if ( !OldAllWordsAndParses.contains(TheWord))
1492 outf << " "<< TheWord <<" ,";
1497 outf << "}" << endl << endl <<endl;
1499 file.close();
1503 break;
1507 if ( CollapsedAnyOne)
1509 break;
1515 return AnyChanged;
1520 void CTemplateCollection::SetSwitchOfSortingValue(bool value)
1522 for (int i = 0; i < GetCount(); ++i) {
1523 CTemplate* pTemplate = GetAtSort(i);
1524 pTemplate ->SetSwitchOfSortingValue(value);
1530 void CTemplateCollection::AbsorbWords1(int Loopi)
1534 bool Conservative = false;
1535 int MiniMumPrefixOrSuffixNeedToBeAbsorbed = 2;
1536 int UnTouchedTopTemplate = 5;
1537 int i, j;
1538 int TemplateIndex;
1539 CTemplate* qTemplate;
1540 StringToParse pWordsAndParses;
1541 StringToParse qWordsAndParses;
1542 StringToParse::iterator StringToParseIt;
1543 StringToInt CommonWords;
1544 QString oneWord;
1545 CParse* pParse, *qParse;
1546 bool Contained;
1549 TemplateIndex =0;
1550 for (i = 1; i < GetCount()-1; i++) {
1551 CTemplate* pTemplate = GetAtSort(i);
1553 if (pTemplate ->m_IsDeleted)
1555 continue;
1558 TemplateIndex++;
1559 if ( TemplateIndex <= UnTouchedTopTemplate)
1561 continue;
1564 pWordsAndParses.clear();
1565 pTemplate ->GetWordsAndParses(pWordsAndParses);
1567 CommonWords.clear();
1569 for ( j=0; j< i; j++)
1571 qTemplate = GetAtSort(j);
1573 if (qTemplate ->m_IsDeleted)
1575 continue;
1578 // Check the common word between pTemplate and qTemplate
1579 qWordsAndParses.clear();
1580 qTemplate ->GetWordsAndParses(qWordsAndParses);
1583 for ( StringToParseIt= qWordsAndParses.begin(); StringToParseIt != qWordsAndParses.end(); StringToParseIt++)
1585 oneWord = StringToParseIt.key();
1586 qParse = StringToParseIt.data();
1588 if ( pWordsAndParses.contains(oneWord))
1590 pParse = pWordsAndParses[oneWord];
1591 Contained = OneParseContainAnother(qParse, pParse);
1593 if (!Contained) continue;
1595 // We found this word in pParse can be absorbed to higher ranking template
1596 CommonWords.insert(oneWord, 1);
1599 else
1601 continue;
1607 // Clean the qWordsAndParse
1609 for ( StringToParseIt= qWordsAndParses.begin(); StringToParseIt != qWordsAndParses.end(); StringToParseIt++)
1611 qParse = StringToParseIt.data();
1612 delete qParse;
1617 // Now, for the template p, we have got the Common Words between pTemplate and all qTemplates
1619 if ( CommonWords.count() == 0)
1621 // Clean the pWordsAndParses
1622 for ( StringToParseIt= pWordsAndParses.begin(); StringToParseIt != pWordsAndParses.end(); StringToParseIt++)
1624 pParse = StringToParseIt.data();
1625 delete pParse;
1628 continue;
1632 // Next, we deal with these common words for pTemplate
1634 pTemplate ->AdjustMeAfterAbsorb1(CommonWords,Conservative, MiniMumPrefixOrSuffixNeedToBeAbsorbed, Loopi );
1636 // Clean the pWordsAndParses
1637 for ( StringToParseIt= pWordsAndParses.begin(); StringToParseIt != pWordsAndParses.end(); StringToParseIt++)
1639 pParse = StringToParseIt.data();
1640 delete pParse;
1651 // Whether qParse contains/Covers pParse
1652 bool CTemplateCollection::OneParseContainAnother(CParse* qParse, CParse* pParse)
1654 bool Success, FoundOneCut;
1655 int oneCutLoc, PossibleLoc;
1657 Success = true;
1658 for (int i = 1; i<= pParse ->Size(); i++)
1660 oneCutLoc = pParse ->GetPieceLoc(i);
1662 FoundOneCut = false;
1663 for ( int j=1; j<= qParse ->Size(); j++)
1665 PossibleLoc = qParse ->GetPieceLoc(j);
1667 if ( oneCutLoc == PossibleLoc)
1669 FoundOneCut = true;
1670 break;
1675 if ( !FoundOneCut)
1677 Success = false;
1678 break;
1683 return Success;
1688 void CTemplateCollection::FindMorphemePrefixOrSuffixWithParadigmaticGraph(int Loopi)
1690 int MaximumSizeOfStemColumn = 5;
1692 for (int i = 0; i < GetCount(); i++) {
1693 CTemplate* pTemplate = GetAtSort(i);
1695 if ( pTemplate ->m_IsDeleted) continue;
1697 pTemplate ->FindMorphemePrefixOrSuffixWithParadigmaticGraph(Loopi, MaximumSizeOfStemColumn, m_GlobalNodeStickNess2, m_GlobalStickNess2, m_TotalGlobalStickNess2, m_TotalWord2);
1704 void CTemplateCollection::CutMtCorpusWithMorphologyAnalyses(QString inFileName, QString outFileName, StringToPtrCStem& MorphologyCuts, int Strategy)
1707 if ( Strategy == 1) // Just take the morphemes, don't care its layers or location
1709 QFile inFile( inFileName );
1710 if ( inFile.open( QIODevice::ReadOnly ) )
1714 else
1716 return;
1720 QFile oFile( outFileName );
1721 if ( oFile.open( QIODevice::WriteOnly ) )
1725 else
1727 return;
1731 Q3TextStream instream( &inFile );
1732 Q3TextStream ostream( &oFile );
1733 QString oneLine;
1735 // Unicode or ASCII encoding?
1736 instream.setEncoding ( Q3TextStream::Unicode);
1737 ostream.setEncoding ( Q3TextStream::Unicode);
1740 // Read these sentences in...
1741 while( !instream.atEnd() )
1743 oneLine = instream.readLine();
1744 oneLine = oneLine.lower();
1746 QString oneWord;
1747 QString outWord;
1748 bool endOfLine;
1749 int Pos;
1750 CStem* theCStem;
1752 // Strip the start and end white space
1754 oneLine = oneLine.simplifyWhiteSpace ();
1756 if ( oneLine.length() ==0)
1758 ostream <<" " <<endl;
1759 continue;
1762 endOfLine = false;
1763 while ( !endOfLine)
1765 Pos = oneLine.find(QString(" "));
1767 if ( Pos != -1)
1769 oneWord = oneLine.left(Pos);
1771 if ( MorphologyCuts.contains(oneWord))
1773 theCStem = MorphologyCuts[oneWord];
1774 outWord = theCStem ->Display(QChar(' '));
1777 else
1779 outWord = oneWord;
1782 ostream << outWord <<" ";
1784 oneLine = oneLine.right(oneLine.length() - Pos-1);
1786 else
1788 oneWord = oneLine;
1790 if ( MorphologyCuts.contains(oneWord))
1792 theCStem = MorphologyCuts[oneWord];
1793 outWord = theCStem ->Display(QChar(' '));
1795 else
1797 outWord = oneWord;
1800 ostream << outWord;
1801 endOfLine = true;
1802 break;
1808 ostream <<endl;
1812 inFile.close();
1813 oFile.close();
1815 return;
1818 if ( Strategy == 2) // Take the morphemes, make difference for begin and end
1820 QFile inFile( inFileName );
1821 if ( inFile.open( QIODevice::ReadOnly ) )
1825 else
1827 return;
1831 QFile oFile( outFileName );
1832 if ( oFile.open( QIODevice::WriteOnly ) )
1836 else
1838 return;
1842 Q3TextStream instream( &inFile );
1843 Q3TextStream ostream( &oFile );
1844 QString oneLine;
1846 // Unicode or ASCII encoding?
1847 instream.setEncoding ( Q3TextStream::Unicode);
1848 ostream.setEncoding ( Q3TextStream::Unicode);
1851 // Read these sentences in...
1852 while( !instream.atEnd() )
1854 oneLine = instream.readLine();
1855 oneLine = oneLine.lower();
1857 QString oneWord;
1858 QString outWord;
1859 QString onePiece;
1860 bool endOfLine;
1861 int Pos;
1862 CStem* theCStem;
1863 int i;
1864 int StemLayer;
1865 int StemSize;
1868 // Strip the start and end white space
1870 oneLine = oneLine.simplifyWhiteSpace ();
1872 if ( oneLine.length() ==0)
1874 ostream <<" " <<endl;
1875 continue;
1878 endOfLine = false;
1879 while ( !endOfLine)
1881 Pos = oneLine.find(QString(" "));
1883 if ( Pos != -1)
1885 oneWord = oneLine.left(Pos);
1887 if ( MorphologyCuts.contains(oneWord))
1889 theCStem = MorphologyCuts[oneWord];
1891 if ( theCStem ->Size() == 1)
1893 outWord = oneWord;
1895 else
1898 // For Stem, we keep it being
1899 StemLayer =1;
1900 StemSize =0;
1902 for ( i=1; i<= theCStem ->Size(); i++)
1904 onePiece = theCStem ->GetPiece(i).Display();
1905 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% complaining signed-unsigned; type cast to fix warning
1907 StemLayer =i;
1908 StemSize =onePiece.length();
1914 outWord = QString("");
1915 for ( i=1; i<= theCStem ->Size(); i++)
1917 onePiece = theCStem ->GetPiece(i).Display();
1919 if ( (i== 1) && (i== theCStem ->Size()))
1921 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
1922 return;
1925 if (( i == 1) && (i != StemLayer))
1927 onePiece = onePiece + "_";
1928 outWord = outWord + onePiece+ " ";
1930 else if ( (i == theCStem ->Size()) && (i != StemLayer))
1932 onePiece = "_" + onePiece;
1933 outWord = outWord + onePiece;
1935 else if ( (i == theCStem ->Size()) && (i == StemLayer))
1937 outWord = outWord + onePiece;
1939 else
1941 outWord = outWord + onePiece + " ";
1950 else
1952 outWord = oneWord;
1955 ostream << outWord <<" ";
1957 oneLine = oneLine.right(oneLine.length() - Pos-1);
1959 else
1961 oneWord = oneLine;
1963 if ( MorphologyCuts.contains(oneWord))
1965 theCStem = MorphologyCuts[oneWord];
1967 if ( theCStem ->Size() == 1)
1969 outWord = oneWord;
1971 else
1974 // For Stem, we keep it being
1975 StemLayer =1;
1976 StemSize =0;
1978 for ( i=1; i<= theCStem ->Size(); i++)
1980 onePiece = theCStem ->GetPiece(i).Display();
1981 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% length(); gives unsigned
1983 StemLayer =i;
1984 StemSize =onePiece.length();
1990 outWord = QString("");
1991 for ( i=1; i<= theCStem ->Size(); i++)
1993 onePiece = theCStem ->GetPiece(i).Display();
1995 if ( (i== 1) && (i== theCStem ->Size()))
1997 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
1998 return;
2001 if (( i == 1) && (i != StemLayer))
2003 onePiece = onePiece + "_";
2004 outWord = outWord + onePiece+ " ";
2006 else if ( (i == theCStem ->Size()) && (i != StemLayer))
2008 onePiece = "_" + onePiece;
2009 outWord = outWord + onePiece;
2011 else if ( (i == theCStem ->Size()) && (i == StemLayer))
2013 outWord = outWord + onePiece;
2015 else
2017 outWord = outWord + onePiece + " ";
2024 else
2026 outWord = oneWord;
2029 ostream << outWord;
2030 endOfLine = true;
2031 break;
2037 ostream <<endl;
2041 inFile.close();
2042 oFile.close();
2044 return;
2047 if ( Strategy == 3) // Take the morphemes, make difference for its location
2049 QFile inFile( inFileName );
2050 if ( inFile.open( QIODevice::ReadOnly ) )
2054 else
2056 return;
2060 QFile oFile( outFileName );
2061 if ( oFile.open( QIODevice::WriteOnly ) )
2065 else
2067 return;
2071 Q3TextStream instream( &inFile );
2072 Q3TextStream ostream( &oFile );
2073 QString oneLine;
2075 // Unicode or ASCII encoding?
2076 instream.setEncoding ( Q3TextStream::Unicode);
2077 ostream.setEncoding ( Q3TextStream::Unicode);
2080 // Read these sentences in...
2081 while( !instream.atEnd() )
2083 oneLine = instream.readLine();
2084 oneLine = oneLine.lower();
2086 QString oneWord;
2087 QString outWord;
2088 QString onePiece;
2089 bool endOfLine;
2090 int Pos;
2091 CStem* theCStem;
2092 int i;
2093 int StemLayer;
2094 int StemSize;
2097 // Strip the start and end white space
2099 oneLine = oneLine.simplifyWhiteSpace ();
2101 if ( oneLine.length() ==0)
2103 ostream <<" " <<endl;
2104 continue;
2107 endOfLine = false;
2108 while ( !endOfLine)
2110 Pos = oneLine.find(QString(" "));
2112 if ( Pos != -1)
2114 oneWord = oneLine.left(Pos);
2116 if ( MorphologyCuts.contains(oneWord))
2118 theCStem = MorphologyCuts[oneWord];
2120 if ( theCStem ->Size() == 1)
2122 outWord = oneWord;
2124 else
2127 // For Stem, we keep it being
2128 StemLayer =1;
2129 StemSize =0;
2131 for ( i=1; i<= theCStem ->Size(); i++)
2133 onePiece = theCStem ->GetPiece(i).Display();
2134 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% length(); gives unsigned
2136 StemLayer =i;
2137 StemSize =onePiece.length();
2143 outWord = QString("");
2144 for ( i=1; i<= theCStem ->Size(); i++)
2146 onePiece = theCStem ->GetPiece(i).Display();
2148 if ( (i== 1) && (i== theCStem ->Size()))
2150 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
2151 return;
2154 if (( i == 1) && (i != StemLayer))
2156 onePiece = onePiece + "_" + QString("%1").arg(i);
2157 outWord = outWord + onePiece+ " ";
2159 else if ( (i == theCStem ->Size()) && (i != StemLayer))
2161 onePiece = onePiece + "_" + QString("%1").arg(i);
2162 outWord = outWord + onePiece;
2164 else if ( (i == theCStem ->Size()) && (i == StemLayer))
2166 outWord = outWord + onePiece;
2168 else
2170 if ( i == StemLayer)
2172 outWord = outWord + onePiece + " ";
2174 else
2176 onePiece = onePiece + "_" + QString("%1").arg(i);
2177 outWord = outWord + onePiece + " ";
2187 else
2189 outWord = oneWord;
2192 ostream << outWord <<" ";
2194 oneLine = oneLine.right(oneLine.length() - Pos-1);
2196 else
2198 oneWord = oneLine;
2200 if ( MorphologyCuts.contains(oneWord))
2202 theCStem = MorphologyCuts[oneWord];
2204 if ( theCStem ->Size() == 1)
2206 outWord = oneWord;
2208 else
2211 // For Stem, we keep it being
2212 StemLayer =1;
2213 StemSize =0;
2215 for ( i=1; i<= theCStem ->Size(); i++)
2217 onePiece = theCStem ->GetPiece(i).Display();
2218 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% length(); gives unsigned
2220 StemLayer =i;
2221 StemSize =onePiece.length();
2227 outWord = QString("");
2228 for ( i=1; i<= theCStem ->Size(); i++)
2230 onePiece = theCStem ->GetPiece(i).Display();
2232 if ( (i== 1) && (i== theCStem ->Size()))
2234 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
2235 return;
2238 if (( i == 1) && (i != StemLayer))
2240 onePiece = onePiece + "_" + QString("%1").arg(i);
2241 outWord = outWord + onePiece+ " ";
2243 else if ( (i == theCStem ->Size()) && (i != StemLayer))
2245 onePiece = onePiece + "_" + QString("%1").arg(i);
2246 outWord = outWord + onePiece;
2248 else if ( (i == theCStem ->Size()) && (i == StemLayer))
2250 outWord = outWord + onePiece;
2252 else
2254 if ( i == StemLayer)
2256 outWord = outWord + onePiece + " ";
2258 else
2260 onePiece = onePiece + "_" + QString("%1").arg(i);
2261 outWord = outWord + onePiece + " ";
2271 else
2273 outWord = oneWord;
2276 ostream << outWord;
2277 endOfLine = true;
2278 break;
2284 ostream <<endl;
2288 inFile.close();
2289 oFile.close();
2291 return;
2294 if ( Strategy == 4) // Take the morphemes, make difference for its relative location to Stemlayer
2296 QFile inFile( inFileName );
2297 if ( inFile.open( QIODevice::ReadOnly ) )
2301 else
2303 return;
2307 QFile oFile( outFileName );
2308 if ( oFile.open( QIODevice::WriteOnly ) )
2312 else
2314 return;
2318 Q3TextStream instream( &inFile );
2319 Q3TextStream ostream( &oFile );
2320 QString oneLine;
2322 // Unicode or ASCII encoding?
2323 instream.setEncoding ( Q3TextStream::Unicode);
2324 ostream.setEncoding ( Q3TextStream::Unicode);
2327 // Read these sentences in...
2328 while( !instream.atEnd() )
2330 oneLine = instream.readLine();
2331 oneLine = oneLine.lower();
2333 QString oneWord;
2334 QString outWord;
2335 QString onePiece;
2336 bool endOfLine;
2337 int Pos;
2338 CStem* theCStem;
2339 int i;
2340 int StemLayer;
2341 int StemSize;
2344 // Strip the start and end white space
2346 oneLine = oneLine.simplifyWhiteSpace ();
2348 if ( oneLine.length() ==0)
2350 ostream <<" " <<endl;
2351 continue;
2354 endOfLine = false;
2355 while ( !endOfLine)
2357 Pos = oneLine.find(QString(" "));
2359 if ( Pos != -1)
2361 oneWord = oneLine.left(Pos);
2363 if ( MorphologyCuts.contains(oneWord))
2365 theCStem = MorphologyCuts[oneWord];
2367 if ( theCStem ->Size() == 1)
2369 outWord = oneWord;
2371 else
2374 // For Stem, we keep it being
2375 StemLayer =1;
2376 StemSize =0;
2378 for ( i=1; i<= theCStem ->Size(); i++)
2380 onePiece = theCStem ->GetPiece(i).Display();
2381 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% length(); gives unsigned
2383 StemLayer =i;
2384 StemSize =onePiece.length();
2390 outWord = QString("");
2391 for ( i=1; i<= theCStem ->Size(); i++)
2393 onePiece = theCStem ->GetPiece(i).Display();
2395 if ( (i== 1) && (i== theCStem ->Size()))
2397 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
2398 return;
2401 if (( i == 1) && (i != StemLayer))
2403 onePiece = onePiece + "_-" + QString("%1").arg(StemLayer -i);
2404 outWord = outWord + onePiece+ " ";
2406 else if ( (i == theCStem ->Size()) && (i != StemLayer))
2408 onePiece = onePiece + "_+" + QString("%1").arg(i-StemLayer);
2409 outWord = outWord + onePiece;
2411 else if ( (i == theCStem ->Size()) && (i == StemLayer))
2413 outWord = outWord + onePiece;
2415 else
2417 if ( i == StemLayer)
2419 outWord = outWord + onePiece + " ";
2421 else
2423 if ( i > StemLayer )
2425 onePiece = onePiece + "_+" + QString("%1").arg(i- StemLayer);
2426 outWord = outWord + onePiece + " ";
2428 else
2430 onePiece = onePiece + "_-" + QString("%1").arg(StemLayer -i);
2431 outWord = outWord + onePiece + " ";
2442 else
2444 outWord = oneWord;
2447 ostream << outWord <<" ";
2449 oneLine = oneLine.right(oneLine.length() - Pos-1);
2451 else
2453 oneWord = oneLine;
2455 if ( MorphologyCuts.contains(oneWord))
2457 theCStem = MorphologyCuts[oneWord];
2459 if ( theCStem ->Size() == 1)
2461 outWord = oneWord;
2463 else
2466 // For Stem, we keep it being
2467 StemLayer =1;
2468 StemSize =0;
2470 for ( i=1; i<= theCStem ->Size(); i++)
2472 onePiece = theCStem ->GetPiece(i).Display();
2473 if ( static_cast <int> (onePiece.length()) >= StemSize ) //%%% length(); gives unsigned
2475 StemLayer =i;
2476 StemSize =onePiece.length();
2482 outWord = QString("");
2483 for ( i=1; i<= theCStem ->Size(); i++)
2485 onePiece = theCStem ->GetPiece(i).Display();
2487 if ( (i== 1) && (i== theCStem ->Size()))
2489 QMessageBox::information ( NULL, "Debug", "Impossible Here!", "OK" );
2490 return;
2493 if (( i == 1) && (i != StemLayer))
2495 onePiece = onePiece + "_-" + QString("%1").arg(StemLayer -i);
2496 outWord = outWord + onePiece+ " ";
2498 else if ( (i == theCStem ->Size()) && (i != StemLayer))
2500 onePiece = onePiece + "_+" + QString("%1").arg(i-StemLayer);
2501 outWord = outWord + onePiece;
2503 else if ( (i == theCStem ->Size()) && (i == StemLayer))
2505 outWord = outWord + onePiece;
2507 else
2509 if ( i == StemLayer)
2511 outWord = outWord + onePiece + " ";
2513 else
2515 if ( i > StemLayer )
2517 onePiece = onePiece + "_+" + QString("%1").arg(i- StemLayer);
2518 outWord = outWord + onePiece + " ";
2520 else
2522 onePiece = onePiece + "_-" + QString("%1").arg(StemLayer -i);
2523 outWord = outWord + onePiece + " ";
2534 else
2536 outWord = oneWord;
2539 ostream << outWord;
2540 endOfLine = true;
2541 break;
2547 ostream <<endl;
2551 inFile.close();
2552 oFile.close();
2554 return;
2559 // FindStringEditDistance
2560 void CTemplateCollection::FindAllEditDistances(
2561 CLexicon* MyLexicon, CWordCollection* MyWords)
2563 linguistica::ui::status_user_agent& status = MyLexicon->status_display();
2565 int MinimumSize = 5;
2566 int ScoreThreshold = 8;
2567 int MaximumNumberOfLetterDifferences = 9;
2568 int MinimumNumberOfCommonLetters = 5;
2569 CStem* pWord;
2570 CStem* qWord;
2571 CParse Substitution;
2572 CParse Context;
2573 double Score;
2574 QMap<QString, CTemplate*> Templates;
2575 int DebugPair = 0;
2577 if (MyLexicon->LogFileOn() && MyLexicon->GetLogFileStream())
2578 *MyLexicon->GetLogFileStream() << endl <<
2579 "<h3 class=blue>" << "String comparisons" <<
2580 "</h3>" << endl <<
2581 StartTable << StartTableRow <<
2582 MakeTableHeader("String 1") <<
2583 MakeTableHeader("String 2") <<
2584 MakeTableHeader("something else") <<
2585 EndTableRow;
2587 const int NumberOfWords = MyWords->GetCount();
2588 const int TotalNumber = NumberOfWords - 1;
2589 status.major_operation = "StringEdit:FindAllEditDistances";
2590 status.progress.clear();
2591 status.progress.set_denominator(TotalNumber);
2592 // loop through all members of the collection.
2593 for (int i = 0; i < NumberOfWords - 1; i++) {
2594 status.progress = i;
2595 pWord = MyWords ->GetAt(i);
2597 if (pWord->GetKeyLength() < MinimumSize)
2598 continue;
2600 for (int j = i+1; j < NumberOfWords; j++) {
2601 qWord = MyWords ->GetAt(j);
2602 if (qWord->GetKeyLength() < MinimumSize)
2603 continue;
2604 // Our tests to see if these two words,
2605 // pWord and qWord,
2606 // are similar enough
2607 // to be worth testing with string edit distance
2608 int Overlap = OverlapOfTwoAlphabetizedLists(
2609 pWord->GetAlphabetizedForm(),
2610 qWord->GetAlphabetizedForm());
2611 if (Overlap < MinimumNumberOfCommonLetters)
2612 continue;
2613 int Diff = DifferencesOfTwoAlphabetizedLists(
2614 pWord->GetAlphabetizedForm(),
2615 qWord->GetAlphabetizedForm());
2616 if (Diff > MaximumNumberOfLetterDifferences)
2617 continue;
2618 // end of tests
2620 std::auto_ptr<CAlignment> pAlignment(
2621 new CAlignment(pWord, qWord));
2622 Score = pAlignment->FindStringEditDistance();
2624 if (Score < ScoreThreshold && pAlignment->m_Slips == 1) {
2625 Substitution = pAlignment->FindSubstitution();
2626 Context = pAlignment->FindContext();
2627 // Substitution = pAlignment->FindSubstitution();
2628 if (MyLexicon->LogFileOn())
2629 *MyLexicon->GetLogFileStream() << endl <<
2630 StartTableRow <<
2631 TableData(Substitution.GetPiece(1).Display()) <<
2632 TableData(Context) <<
2633 EndTableRow <<
2634 StartTableRow <<
2635 TableData(Substitution.GetPiece(2).Display()) <<
2636 EndTableRow;
2637 QMap<QString, CTemplate*>::iterator iter =
2638 Templates.find(Context.GetKey().Display());
2639 if (iter != Templates.end()) {
2640 CTemplate* pTemplate = *iter;
2641 pTemplate->AddToColumn(Substitution, pTemplate->GetVerticalColumn());
2642 pTemplate->IncrementCorpusCount(1);
2643 pTemplate->AddAlignment(pAlignment.get());
2644 #if 0
2645 if (Lexicon->LogFileOn())
2646 *Lexicon->GetLogFile() <<
2647 endl <<
2648 "Already present " <<
2649 Context.GetKey();
2650 #endif
2651 } else {
2652 std::auto_ptr<CTemplate> new_template(
2653 new CTemplate(pAlignment.get()));
2654 CTemplate* pTemplate = new_template.get();
2656 Templates.insert(Context.GetKey().Display(),
2657 new_template.release());
2658 pTemplate->SetCorpusCount(1);
2659 pTemplate->AddAlignment(pAlignment.get());
2664 status.progress.clear();
2666 // Now go through the templates, and add them to the real collection.
2667 QMap<QString, CTemplate*>::Iterator QStringToTemplateIt;
2668 for (QMap<QString, CTemplate*>::const_iterator iter =
2669 Templates.constBegin();
2670 iter != Templates.constEnd();
2671 ++iter) {
2672 QString Key = iter.key();
2673 CTemplate* pTemplate = iter.value();
2675 ++DebugPair;
2676 CTemplate* qTemplate = AddTemplate(pTemplate);
2677 qTemplate->SetCorpusCount(pTemplate->GetCorpusCount());
2678 delete pTemplate;
2681 status.major_operation.clear();
2682 if (MyLexicon->LogFileOn())
2683 *MyLexicon->GetLogFileStream() << endl << EndTable;