HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / MiniLexicon.cpp
blobc8efd5dd316c22e2f52d39d5fcc47058cc0beba6
1 // Implementation of core CMiniLexicon methods
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <iostream>
6 #include <Q3TextStream>
7 #include <QTime>
8 #include "Lexicon.h"
9 #include "GUIclasses.h"
10 #include "FSA.h"
11 #include "DescriptionLength.h"
12 #include "Datum.h"
13 #include "Stem.h"
14 #include "SignatureCollection.h"
15 #include "PrefixCollection.h"
16 #include "SuffixCollection.h"
17 #include "WordCollection.h"
18 #include "StemCollection.h"
19 #include "POSCollection.h"
20 #include "AffixLocation.h"
21 #include "log2.h"
22 #include "Suffix.h"
23 #include "Prefix.h"
24 #include "HTML.h"
26 CMiniLexicon::CMiniLexicon(CLexicon* lexicon, int index,
27 enum eAffixLocation affixLocation)
28 : m_pLexicon(lexicon),
29 m_Index(index),
30 m_AffixLocation(affixLocation),
31 m_pWords(new CWordCollection(this)),
32 m_pSuffixes(is_initial(affixLocation) ?
33 0 :
34 new CSuffixCollection(this)),
35 m_pPrefixes(is_initial(affixLocation) ?
36 new CPrefixCollection(this) :
37 0),
38 m_pStems(new CStemCollection(this)),
39 m_pSignatures(is_initial(affixLocation) ?
40 new CSignatureCollection(this, m_pPrefixes, affixLocation) :
41 new CSignatureCollection(this, m_pSuffixes, affixLocation)),
42 m_pPOS(),
43 m_DescriptionLength(),
44 m_CorpusCountOfUnanalyzedWords(0.0),
45 m_PhonologicalInformationOfUnanalyzedWords(0.0),
46 m_GUIWords(new GUIWordCollection(this, m_pWords)),
47 m_DataMap(),
48 m_pFSA(0) { }
50 CMiniLexicon::~CMiniLexicon()
52 // Update corpus words when deleting mini-lexicon
53 for (int i = 0; i < m_pWords->GetCount(); ++i) {
54 CStem* word = m_pWords->GetAt(i);
56 word->SimplifyParseStructure();
57 m_pLexicon->UpdateWord(word);
59 m_pLexicon->DoWordUpdates();
61 delete m_pWords;
62 delete m_pSuffixes;
63 delete m_pPrefixes;
64 delete m_pStems;
65 delete m_pSignatures;
66 delete m_pPOS;
67 delete m_DescriptionLength;
68 delete m_GUIWords;
69 delete m_pFSA;
72 void CMiniLexicon::AddToScreen( QString msg )
74 m_pLexicon->AddToScreen( msg );
77 GUIWordCollection* CMiniLexicon::GetGUIWords()
79 return m_GUIWords;
82 int CMiniLexicon::GetCorpusCount()
84 return m_pLexicon->GetCorpusCount();
88 int CMiniLexicon::GetNumberOfCharacterTypes()
90 return m_pLexicon->GetNumberOfCharacterTypes();
94 CDLHistory* CMiniLexicon::GetDLHistory()
96 return m_pLexicon->GetDLHistory();
99 CStem* CMiniLexicon::GetWordFromStemSuffix(CStem* pStem, CSuffix* pSuffix)
101 if (pSuffix->Display() == TheStringNULL)
103 return *m_pWords ^= pStem->Display();
105 else
107 return *m_pWords ^= pStem->Display() + pSuffix->Display();
110 CStem* CMiniLexicon::GetWordFromStemPrefix(CStem* pStem, CPrefix* pPrefix)
112 if (pPrefix->Display() == TheStringNULL)
114 return *m_pWords ^= pStem->Display();
116 else
118 return *m_pWords ^= pPrefix->Display() + pStem->Display();
121 void CMiniLexicon::AddToWordCollection(CWordCollection* pWords,
122 enum which_words subset)
124 CStem* pWord, * qWord;
125 int w;
126 CStringSurrogate css;
128 for( w = 0; w < (int) pWords->GetCount(); w++ )
130 pWord = pWords->GetAt(w);
132 switch (subset) {
133 case WW_All:
134 qWord = (*m_pWords) << pWord;
135 qWord->SetWordType( pWord->GetWordType() );
136 break;
137 case WW_AnalyzedOnly:
138 if( pWord->Size() > 1 )
140 qWord = (*m_pWords) << pWord;
141 qWord->SetWordType( pWord->GetWordType() );
143 break;
144 case WW_UnanalyzedOnly:
145 default:
146 if( pWord->Size() <= 1 )
148 qWord = (*m_pWords) << pWord;
149 qWord->SetWordType( pWord->GetWordType() );
151 break;
157 void CMiniLexicon::AddToWordCollection( CStemCollection* pWords )
159 for (int w = 0; w < pWords->GetCount(); ++w) {
160 CStem* pWord = pWords->GetAt(w);
161 CStem* qWord = (*m_pWords) << pWord;
163 const bool compound =
164 pWord->GetStemType() == CStem::BIWORD_COMPOUND ||
165 pWord->GetStemType() == CStem::MULTIPLE_COMPOUND;
166 qWord->SetWordType(compound ?
167 CStem::STEM_COMPOUND : CStem::STEM_NORMAL);
172 void CMiniLexicon::ClearAll()
174 if( m_pStems ) m_pStems->Empty();
175 if( m_pWords ) m_pWords->Empty();
177 if( m_pSuffixes ) m_pSuffixes->Empty();
178 if( m_pPrefixes ) m_pPrefixes->Empty();
180 if( m_pSignatures ) m_pSignatures->Empty();
184 CCorpusWord* CMiniLexicon::FindAWord(CStem* pStem, CSuffix* pSuffix)
186 return m_pLexicon->FindAWord(pStem, pSuffix);
190 CSuffixCollection* CMiniLexicon::FindSuffixes() //Suffixes/Run all
192 QTime t;
193 t.start();
195 QString mini_name( "Mini-Lexicon %1" );
196 mini_name = mini_name.arg( m_Index );
198 QString remark;
200 if( m_AffixLocation == STEM_FINAL || m_AffixLocation == WORD_FINAL )
202 m_pWords->SuccessorFreq1(GetStems(),
203 GetSuffixes(), GetSignatures(), SF1,
204 CStem::NUMBER | CStem::UNKNOWN);
207 CheckSignatures();
210 ExtendKnownStemsToKnownAffixes();
211 TakeSignaturesFindStems();
212 TakeSignaturesFindStems();
213 ExtendKnownStemsToKnownAffixes();
214 FromStemsFindAffixes(); // problem here @@@ oct 2008 jg
216 LooseFit();
218 CheckSignatures();
220 FindSingletonSignatures(); //problem here jan 2010
222 CheckSignatures();
224 FindMajorSignatures();
225 m_pWords->m_DisplayMode = CWordListViewItem::MiniLexicon_MorphologyStuffFirst;
226 CalculateDescriptionLength();
227 FindAllomorphy();
228 m_pFSA = new FSA(this);
235 std::cout << "Find Suffixes: Time elapsed: " <<
236 t.elapsed() << "ms." << std::endl;
238 return m_pSuffixes;
242 CPrefixCollection* CMiniLexicon::FindPrefixes()
244 if( m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL )
246 m_pWords->CreateReverseTrie();
248 m_pWords->PredecessorFreq1(GetStems(),
249 GetPrefixes(), GetSignatures(), SF1,
250 CStem::NUMBER | CStem::UNKNOWN);
252 CheckSignatures();
254 ExtendKnownStemsToKnownAffixes();
256 TakeSignaturesFindStems();
258 ExtendKnownStemsToKnownAffixes();
260 FromStemsFindAffixes();
262 ExtendKnownStemsToKnownAffixes();
264 LooseFit();
266 CheckSignatures();
268 FindSingletonSignatures();
270 return m_pPrefixes;
272 else return NULL;
276 LinguisticaMainWindow* CMiniLexicon::GetDocument()
278 return m_pLexicon->GetDocument();
281 int CMiniLexicon::GetIntParameter( QString strParam, int iDefault )
283 return m_pLexicon->GetIntParameter( strParam, iDefault );
287 QTextStream* CMiniLexicon::GetLogFile()
289 return m_pLexicon->GetLogFileStream();
293 int CMiniLexicon::GetMiniCount()
295 return m_pLexicon->GetMiniCount();
299 int CMiniLexicon::GetMiniSize()
301 return m_pLexicon->GetMiniSize();
305 CMiniLexicon* CMiniLexicon::GetMiniLexicon( int index )
307 return m_pLexicon->GetMiniLexicon( index );
311 StringToString* CMiniLexicon::GetOutFilter()
313 return m_pLexicon->GetOutFilter();
317 bool CMiniLexicon::LogFileOn()
319 return (CLexicon*)m_pLexicon->LogFileOn();
323 bool CMiniLexicon::SetAffixLocation(enum eAffixLocation affixLoc)
325 if (m_AffixLocation == affixLoc)
326 // done, without lifting a finger!
327 return true;
329 if (is_initial(m_AffixLocation) == is_initial(affixLoc)) {
330 m_AffixLocation = affixLoc;
331 return true;
334 // Affix types differ: throw away discoveries.
336 if (m_pStems->GetCount() != 0)
337 // someone else could be using our discovered stems
338 return false;
340 if (is_initial(affixLoc)) {
341 Q_ASSERT(m_pPrefixes == 0);
342 m_pPrefixes = new CPrefixCollection(this);
344 delete m_pSuffixes;
345 m_pSuffixes = 0;
346 } else {
347 delete m_pPrefixes;
348 m_pPrefixes = 0;
350 Q_ASSERT(m_pSuffixes == 0);
351 m_pSuffixes = new CSuffixCollection(this);
354 m_AffixLocation = affixLoc;
356 delete m_pSignatures;
357 m_pSignatures = is_initial(affixLoc) ?
358 new CSignatureCollection(this, m_pPrefixes, affixLoc) :
359 new CSignatureCollection(this, m_pSuffixes, affixLoc);
360 return true;
363 int CMiniLexicon::GetCorpusCountOfUnanalyzedWords ( )
365 int i = 0;
368 m_CorpusCountOfUnanalyzedWords = 0;
370 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
372 for (i = 0; i < m_pWords->GetCount(); i++)
374 if ( 0 == m_pWords->GetAt(i)->GetSuffixSignature() )
376 m_CorpusCountOfUnanalyzedWords += m_pWords->GetAt(i)->GetCorpusCount();
380 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
383 for (i = 0; i < m_pWords->GetCount(); i++)
385 if ( 0 == m_pWords->GetAt(i)->GetPrefixSignature() )
387 m_CorpusCountOfUnanalyzedWords += m_pWords->GetAt(i)->GetCorpusCount();
392 return (int) m_CorpusCountOfUnanalyzedWords;
394 double CMiniLexicon::CalculateSumOfPointersToMyUnanalyzedWords ( eMDL_STYLE MDLflag)
396 int i = 0;
397 double total = 0;
398 double denominator = 0;
400 switch (MDLflag)
402 case CorpusCount:
405 denominator = GetCorpusCountOfUnanalyzedWords ( ) +
406 m_pStems->GetCorpusCount();
409 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
411 for (i = 0; i < m_pWords->GetCount(); i++)
413 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
415 total += base2log ( denominator / (double) m_pWords->GetAt(i)->GetCorpusCount() ) ;
419 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
422 for (i = 0; i < m_pWords->GetCount(); i++)
424 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
426 total += base2log ( denominator / m_pWords->GetAt(i)->GetCorpusCount() ) ;
430 break;
432 case GrammarCount:
433 int NumberOfUnanalyzedWords;
434 GetNumberOfAnalyzedWords(NumberOfUnanalyzedWords);
436 denominator = NumberOfUnanalyzedWords +
437 m_pStems->GetTotalUseCount ( );
439 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
441 for (i = 0; i < m_pWords->GetCount(); i++)
443 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
445 total += base2log ( denominator ) ;
449 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
452 for (i = 0; i < m_pWords->GetCount(); i++)
454 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
456 total += base2log ( denominator ) ;
462 break;
465 return total;
468 double CMiniLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent( )
470 CLexicon* MotherLexicon = GetLexicon();
471 int i;
472 if ( m_PhonologicalInformationOfUnanalyzedWords == 0)
474 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
476 for (i = 0; i < m_pWords->GetCount(); i++)
478 if ( 0 == m_pWords->GetAt(i)->GetSuffixLoc() )
480 m_PhonologicalInformationOfUnanalyzedWords += m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon );
484 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
486 for (i = 0; i < m_pWords->GetCount(); i++)
488 if ( 0 == m_pWords->GetAt(i)->GetPrefixLoc() )
490 m_PhonologicalInformationOfUnanalyzedWords += m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon );
497 return m_PhonologicalInformationOfUnanalyzedWords;
502 double CMiniLexicon::CalculateCompressedLengthOfUnanalyzedWords( )
504 double CompressedLengthOfUnanalyzedWords = 0;
505 CLexicon* MotherLexicon = GetLexicon();
506 int i;
508 if ( GetAffixLocation() == WORD_FINAL || GetAffixLocation() == STEM_FINAL )
510 for (i = 0; i < GetWords()->GetCount(); i++)
512 if ( NULL == GetWords()->GetAt(i)->GetSuffixSignature() )
514 CompressedLengthOfUnanalyzedWords +=
515 GetWords()->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon )
516 * GetWords()->GetAt(i)->GetCorpusCount();
520 else if ( GetAffixLocation() == WORD_INITIAL || GetAffixLocation() == STEM_INITIAL )
522 for (i = 0; i < m_pWords->GetCount(); i++)
524 if ( 0 == m_pWords->GetAt(i)->GetPrefixSignature() )
526 CompressedLengthOfUnanalyzedWords +=
527 m_pWords->GetAt(i)->CalculatePhonologicalInformationContent( MotherLexicon )
528 * m_pWords->GetAt(i)->GetCorpusCount();
535 return CompressedLengthOfUnanalyzedWords;
538 int CMiniLexicon::GetNumberOfAnalyzedWords (int& NumberOfUnanalyzedWords)
540 CStem* pWord;
541 int NumberOfAnalyzedWords = 0;
542 NumberOfUnanalyzedWords = 0;
543 for (int i = 0;i < m_pWords->GetCount(); i++)
545 pWord = m_pWords->GetAt(i);
546 if (pWord->IsAnalyzed() )
548 NumberOfAnalyzedWords++;
549 } else
551 NumberOfUnanalyzedWords++;
554 return NumberOfAnalyzedWords;
558 // Log File functions
562 void CMiniLexicon::LogFileHeader(QString s1, QString s2, QString s3)
563 { if (LogFileOn()) *GetLogFile() <<
564 StartTable <<
565 StartTableRow << MakeTableHeader(s1) << MakeTableHeader(s2) << MakeTableHeader(s3) <<
566 EndTableRow;
569 void CMiniLexicon::LogFileSmallTitle(QString s1, QString s2, QString s3)
570 { if (LogFileOn()) { *GetLogFile() << SmallTitle( s1) <<
571 StartTable <<
572 StartTableRow << MakeTableHeader(s2) << MakeTableHeader(s3) ;
575 void CMiniLexicon::LogFileSmallTitle(QString s) { if (LogFileOn()) *GetLogFile() << SmallTitle( s );}
576 void CMiniLexicon::LogFileSmallTitle(QString s, QString t) { if (LogFileOn()) *GetLogFile() << SmallTitle( s ) << StartTableRow << MakeTableHeader(t) << EndTableRow; }
577 void CMiniLexicon::LogFileLargeTitle(QString title) { if (LogFileOn()) *GetLogFile() << LargeTitle(title) << endl; }
578 void CMiniLexicon::LogFileStartTable() { if (LogFileOn()) *GetLogFile() << StartTable;}
579 void CMiniLexicon::LogFileEndTable() { if (LogFileOn()) *GetLogFile() << EndTable;}
580 void CMiniLexicon::LogFileStartRow() { if (LogFileOn()) *GetLogFile() << StartTableRow; }
581 void CMiniLexicon::LogFileEndRow() { if (LogFileOn()) *GetLogFile() << EndTableRow; }
582 void CMiniLexicon::LogFileStartRow(QString str) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(str); }
583 void CMiniLexicon::LogFile1SimpleString(QString s) { if (LogFileOn()) *GetLogFile() << TableData(s); }
584 void CMiniLexicon::LogFile (double d) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(d) << EndTableRow; }
585 void CMiniLexicon::LogFile (QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << EndTableRow; }
586 void CMiniLexicon::LogFile (QString s, int n) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData (n) << EndTableRow; }
587 void CMiniLexicon::LogFile (int n, QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(n) << TableData(s) << EndTableRow; }
588 void CMiniLexicon::LogFile (QString s1, QString s2) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s1) << TableData(s2) << EndTableRow; }
589 void CMiniLexicon::LogFile (QString s1, double d) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s1) << TableData( d) << EndTableRow; }
590 void CMiniLexicon::LogFile (QString s,int i,double d){ if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData(i) << TableData( d) << EndTableRow; }
591 void CMiniLexicon::LogFile (QString s, int n, int m, double d, double e, double f) { if (LogFileOn()) *GetLogFile() << StartTableRow << TableData(s) << TableData(n) << TableData(m) << TableData(d) << TableData(e) << TableData(f) << EndTableRow; }
592 void CMiniLexicon::LogFile (QString s, QString t, QString u) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << EndTableRow;}
593 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << EndTableRow;}
594 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v, QString w) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << TableData(w) << EndTableRow;}
595 void CMiniLexicon::LogFile (QString s, QString t, QString u, QString v, QString w, QString x) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(s) << TableData(t) << TableData(u) << TableData(v) << TableData(w) << TableData (x) << EndTableRow;}
596 void CMiniLexicon::LogFileHeader( QString s) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << EndTableRow; }
597 void CMiniLexicon::LogFileHeader( QString s, QString t) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << MakeTableHeader(t) << EndTableRow; }
598 void CMiniLexicon::LogFileHeader (QString s, QString t, QString u, QString v, QString w, QString x) { if (LogFileOn()) *GetLogFile() << StartTableRow << MakeTableHeader(s) << MakeTableHeader(t) << MakeTableHeader(u) << MakeTableHeader(v) << MakeTableHeader(w) << MakeTableHeader (x) << EndTableRow;}
599 void CMiniLexicon::LogFile (int n, double d, QString s) { if (LogFileOn()) *GetLogFile()<< StartTableRow << TableData(n) << TableData(d) << TableData(s) << EndTableRow;}