1 // Implementation of core CMiniLexicon methods
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
6 #include <Q3TextStream>
9 #include "GUIclasses.h"
11 #include "DescriptionLength.h"
14 #include "SignatureCollection.h"
15 #include "PrefixCollection.h"
16 #include "SuffixCollection.h"
17 #include "WordCollection.h"
18 #include "StemCollection.h"
19 #include "POSCollection.h"
20 #include "AffixLocation.h"
// Constructor: attaches this mini-lexicon to its parent `lexicon`, records the
// affix location, and allocates the word, stem, affix, signature and GUI word
// collections.
// When is_initial(affixLocation) holds, a prefix collection is created and the
// signature collection is built over m_pPrefixes; otherwise the signature
// collection is built over m_pSuffixes.
// NOTE(review): this excerpt is missing original lines. No use of `index` is
// visible, the m_pSuffixes conditional has no ':' arm, the m_pPrefixes
// conditional has nothing after its ':', and the end of the initializer list
// and the constructor body are absent. Restore them from version control.
26 CMiniLexicon::CMiniLexicon(CLexicon
* lexicon
, int index
,
27 enum eAffixLocation affixLocation
)
28 : m_pLexicon(lexicon
),
30 m_AffixLocation(affixLocation
),
31 m_pWords(new CWordCollection(this)),
32 m_pSuffixes(is_initial(affixLocation
) ?
34 new CSuffixCollection(this)),
35 m_pPrefixes(is_initial(affixLocation
) ?
36 new CPrefixCollection(this) :
38 m_pStems(new CStemCollection(this)),
39 m_pSignatures(is_initial(affixLocation
) ?
40 new CSignatureCollection(this, m_pPrefixes
, affixLocation
) :
41 new CSignatureCollection(this, m_pSuffixes
, affixLocation
)),
43 m_DescriptionLength(),
44 m_CorpusCountOfUnanalyzedWords(0.0),
45 m_PhonologicalInformationOfUnanalyzedWords(0.0),
46 m_GUIWords(new GUIWordCollection(this, m_pWords
)),
// Destructor. For every word in m_pWords it simplifies the word's parse
// structure and passes the word to the parent lexicon's UpdateWord(); it then
// calls DoWordUpdates() on the lexicon and deletes m_DescriptionLength.
// NOTE(review): braces and several original lines are missing from this
// excerpt, so DoWordUpdates() only appears to run after the loop, and any
// other member clean-up is not visible here.
50 CMiniLexicon::~CMiniLexicon()
52 // Update corpus words when deleting mini-lexicon
53 for (int i
= 0; i
< m_pWords
->GetCount(); ++i
) {
54 CStem
* word
= m_pWords
->GetAt(i
);
56 word
->SimplifyParseStructure();
57 m_pLexicon
->UpdateWord(word
);
59 m_pLexicon
->DoWordUpdates();
67 delete m_DescriptionLength
;
72 void CMiniLexicon::AddToScreen( QString msg
)
74 m_pLexicon
->AddToScreen( msg
);
// Accessor returning a GUIWordCollection pointer.
// NOTE(review): the body is not visible in this excerpt; presumably it returns
// the m_GUIWords member — confirm.
77 GUIWordCollection
* CMiniLexicon::GetGUIWords()
82 int CMiniLexicon::GetCorpusCount()
84 return m_pLexicon
->GetCorpusCount();
88 int CMiniLexicon::GetNumberOfCharacterTypes()
90 return m_pLexicon
->GetNumberOfCharacterTypes();
94 CDLHistory
* CMiniLexicon::GetDLHistory()
96 return m_pLexicon
->GetDLHistory();
99 CStem
* CMiniLexicon::GetWordFromStemSuffix(CStem
* pStem
, CSuffix
* pSuffix
)
101 if (pSuffix
->Display() == TheStringNULL
)
103 return *m_pWords
^= pStem
->Display();
107 return *m_pWords
^= pStem
->Display() + pSuffix
->Display();
110 CStem
* CMiniLexicon::GetWordFromStemPrefix(CStem
* pStem
, CPrefix
* pPrefix
)
112 if (pPrefix
->Display() == TheStringNULL
)
114 return *m_pWords
^= pStem
->Display();
118 return *m_pWords
^= pPrefix
->Display() + pStem
->Display();
// Copies words from pWords into this mini-lexicon's word collection (via
// operator<<) and gives each inserted word the source word's word type.
// `subset` selects which words are taken: under WW_AnalyzedOnly only words
// with Size() > 1, under WW_UnanalyzedOnly only words with Size() <= 1.
// NOTE(review): the declaration of `w`, the switch header, the first case
// label and the break statements are missing from this excerpt. The
// unconditional insertion right after GetAt() is presumably the "all words"
// case — confirm against the original file.
121 void CMiniLexicon::AddToWordCollection(CWordCollection
* pWords
,
122 enum which_words subset
)
124 CStem
* pWord
, * qWord
;
126 CStringSurrogate css
;
128 for( w
= 0; w
< (int) pWords
->GetCount(); w
++ )
130 pWord
= pWords
->GetAt(w
);
134 qWord
= (*m_pWords
) << pWord
;
135 qWord
->SetWordType( pWord
->GetWordType() );
// Only words with more than one piece count as analyzed.
137 case WW_AnalyzedOnly
:
138 if( pWord
->Size() > 1 )
140 qWord
= (*m_pWords
) << pWord
;
141 qWord
->SetWordType( pWord
->GetWordType() );
// Words with at most one piece count as unanalyzed.
144 case WW_UnanalyzedOnly
:
146 if( pWord
->Size() <= 1 )
148 qWord
= (*m_pWords
) << pWord
;
149 qWord
->SetWordType( pWord
->GetWordType() );
157 void CMiniLexicon::AddToWordCollection( CStemCollection
* pWords
)
159 for (int w
= 0; w
< pWords
->GetCount(); ++w
) {
160 CStem
* pWord
= pWords
->GetAt(w
);
161 CStem
* qWord
= (*m_pWords
) << pWord
;
163 const bool compound
=
164 pWord
->GetStemType() == CStem::BIWORD_COMPOUND
||
165 pWord
->GetStemType() == CStem::MULTIPLE_COMPOUND
;
166 qWord
->SetWordType(compound
?
167 CStem::STEM_COMPOUND
: CStem::STEM_NORMAL
);
172 void CMiniLexicon::ClearAll()
174 if( m_pStems
) m_pStems
->Empty();
175 if( m_pWords
) m_pWords
->Empty();
177 if( m_pSuffixes
) m_pSuffixes
->Empty();
178 if( m_pPrefixes
) m_pPrefixes
->Empty();
180 if( m_pSignatures
) m_pSignatures
->Empty();
184 CCorpusWord
* CMiniLexicon::FindAWord(CStem
* pStem
, CSuffix
* pSuffix
)
186 return m_pLexicon
->FindAWord(pStem
, pSuffix
);
// Runs the suffix-discovery sequence for this mini-lexicon ("Suffixes / Run all").
// Visible steps: build a "Mini-Lexicon <m_Index>" label; for STEM_FINAL or
// WORD_FINAL affix locations run SuccessorFreq1 over the words; then run the
// stem/affix/signature heuristics below, switch the word display mode,
// recompute the description length, allocate a new FSA, and print the elapsed
// time to std::cout.
// NOTE(review): many original lines are missing from this excerpt: the
// declaration and start of timer `t`, all braces (so it is unclear which steps
// sit inside the affix-location check), and the return statement.
// NOTE(review): TakeSignaturesFindStems() is called twice in a row — confirm
// that is intentional.
// NOTE(review): m_pFSA is overwritten with a fresh allocation; no delete of a
// previous FSA is visible here — check for a leak on repeated calls.
190 CSuffixCollection
* CMiniLexicon::FindSuffixes() //Suffixes/Run all
195 QString
mini_name( "Mini-Lexicon %1" );
196 mini_name
= mini_name
.arg( m_Index
);
200 if( m_AffixLocation
== STEM_FINAL
|| m_AffixLocation
== WORD_FINAL
)
202 m_pWords
->SuccessorFreq1(GetStems(),
203 GetSuffixes(), GetSignatures(), SF1
,
204 CStem::NUMBER
| CStem::UNKNOWN
);
210 ExtendKnownStemsToKnownAffixes();
211 TakeSignaturesFindStems();
212 TakeSignaturesFindStems();
213 ExtendKnownStemsToKnownAffixes();
214 FromStemsFindAffixes(); // problem here @@@ oct 2008 jg
220 FindSingletonSignatures(); //problem here jan 2010
224 FindMajorSignatures();
225 m_pWords
->m_DisplayMode
= CWordListViewItem::MiniLexicon_MorphologyStuffFirst
;
226 CalculateDescriptionLength();
228 m_pFSA
= new FSA(this);
235 std::cout
<< "Find Suffixes: Time elapsed: " <<
236 t
.elapsed() << "ms." << std::endl
;
// Runs the prefix-discovery sequence for this mini-lexicon.
// Visible steps: for STEM_INITIAL or WORD_INITIAL affix locations, build the
// reverse trie over the words and run PredecessorFreq1; then alternate
// ExtendKnownStemsToKnownAffixes() with TakeSignaturesFindStems() and
// FromStemsFindAffixes(), and finish with FindSingletonSignatures().
// NOTE(review): braces and the return statement are missing from this excerpt,
// so it is unclear which steps are guarded by the affix-location check.
242 CPrefixCollection
* CMiniLexicon::FindPrefixes()
244 if( m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
246 m_pWords
->CreateReverseTrie();
248 m_pWords
->PredecessorFreq1(GetStems(),
249 GetPrefixes(), GetSignatures(), SF1
,
250 CStem::NUMBER
| CStem::UNKNOWN
);
254 ExtendKnownStemsToKnownAffixes();
256 TakeSignaturesFindStems();
258 ExtendKnownStemsToKnownAffixes();
260 FromStemsFindAffixes();
262 ExtendKnownStemsToKnownAffixes();
268 FindSingletonSignatures();
276 LinguisticaMainWindow
* CMiniLexicon::GetDocument()
278 return m_pLexicon
->GetDocument();
281 int CMiniLexicon::GetIntParameter( QString strParam
, int iDefault
)
283 return m_pLexicon
->GetIntParameter( strParam
, iDefault
);
287 QTextStream
* CMiniLexicon::GetLogFile()
289 return m_pLexicon
->GetLogFileStream();
293 int CMiniLexicon::GetMiniCount()
295 return m_pLexicon
->GetMiniCount();
299 int CMiniLexicon::GetMiniSize()
301 return m_pLexicon
->GetMiniSize();
305 CMiniLexicon
* CMiniLexicon::GetMiniLexicon( int index
)
307 return m_pLexicon
->GetMiniLexicon( index
);
311 StringToString
* CMiniLexicon::GetOutFilter()
313 return m_pLexicon
->GetOutFilter();
317 bool CMiniLexicon::LogFileOn()
319 return (CLexicon
*)m_pLexicon
->LogFileOn();
// Changes the affix location handled by this mini-lexicon.
// Visible logic:
//   - same location as before: nothing to do;
//   - same side (is_initial() agrees for old and new): just store the new value;
//   - otherwise the affix type changes: the affix collection for the new side
//     is allocated (asserting that it was null), the location is stored, and
//     m_pSignatures is deleted and rebuilt over the new affix collection.
// NOTE(review): the return statements, braces, the action taken when stems
// already exist, and any clean-up of the old affix collection are missing from
// this excerpt — restore them from version control before relying on this.
323 bool CMiniLexicon::SetAffixLocation(enum eAffixLocation affixLoc
)
325 if (m_AffixLocation
== affixLoc
)
326 // done, without lifting a finger!
329 if (is_initial(m_AffixLocation
) == is_initial(affixLoc
)) {
330 m_AffixLocation
= affixLoc
;
334 // Affix types differ: throw away discoveries.
336 if (m_pStems
->GetCount() != 0)
337 // someone else could be using our discovered stems
340 if (is_initial(affixLoc
)) {
341 Q_ASSERT(m_pPrefixes
== 0);
342 m_pPrefixes
= new CPrefixCollection(this);
350 Q_ASSERT(m_pSuffixes
== 0);
351 m_pSuffixes
= new CSuffixCollection(this);
354 m_AffixLocation
= affixLoc
;
// Rebuild the signature collection over whichever affix collection now applies.
356 delete m_pSignatures
;
357 m_pSignatures
= is_initial(affixLoc
) ?
358 new CSignatureCollection(this, m_pPrefixes
, affixLoc
) :
359 new CSignatureCollection(this, m_pSuffixes
, affixLoc
);
363 int CMiniLexicon::GetCorpusCountOfUnanalyzedWords ( )
368 m_CorpusCountOfUnanalyzedWords
= 0;
370 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
372 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
374 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixSignature() )
376 m_CorpusCountOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->GetCorpusCount();
380 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
383 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
385 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixSignature() )
387 m_CorpusCountOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->GetCorpusCount();
392 return (int) m_CorpusCountOfUnanalyzedWords
;
// Sums base-2 log "pointer" costs over the words that have no affix location
// on this mini-lexicon's side (GetSuffixLoc() == 0 for final affixes,
// GetPrefixLoc() == 0 for initial ones).
// Two computations are visible:
//   1. denominator = corpus count of unanalyzed words + corpus count of the
//      stems; each such word adds base2log(denominator / word corpus count).
//   2. denominator = number of unanalyzed words + total stem use count; each
//      such word adds base2log(denominator).
// NOTE(review): the declarations of `total` and `i`, the test that chooses
// between the two computations (presumably on MDLflag), the braces and the
// return statement are missing from this excerpt.
// NOTE(review): computation 1 divides by a word's corpus count; a zero count
// would divide by zero — confirm that cannot occur.
394 double CMiniLexicon::CalculateSumOfPointersToMyUnanalyzedWords ( eMDL_STYLE MDLflag
)
398 double denominator
= 0;
405 denominator
= GetCorpusCountOfUnanalyzedWords ( ) +
406 m_pStems
->GetCorpusCount();
409 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
411 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
413 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
415 total
+= base2log ( denominator
/ (double) m_pWords
->GetAt(i
)->GetCorpusCount() ) ;
419 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
422 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
424 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
426 total
+= base2log ( denominator
/ m_pWords
->GetAt(i
)->GetCorpusCount() ) ;
// GetNumberOfAnalyzedWords() fills its reference argument with the unanalyzed
// count; its return value (the analyzed count) is discarded here.
433 int NumberOfUnanalyzedWords
;
434 GetNumberOfAnalyzedWords(NumberOfUnanalyzedWords
);
436 denominator
= NumberOfUnanalyzedWords
+
437 m_pStems
->GetTotalUseCount ( );
439 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
441 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
443 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
445 total
+= base2log ( denominator
) ;
449 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
452 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
454 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
456 total
+= base2log ( denominator
) ;
468 double CMiniLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent( )
470 CLexicon
* MotherLexicon
= GetLexicon();
472 if ( m_PhonologicalInformationOfUnanalyzedWords
== 0)
474 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
476 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
478 if ( 0 == m_pWords
->GetAt(i
)->GetSuffixLoc() )
480 m_PhonologicalInformationOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
);
484 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
486 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
488 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixLoc() )
490 m_PhonologicalInformationOfUnanalyzedWords
+= m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
);
497 return m_PhonologicalInformationOfUnanalyzedWords
;
502 double CMiniLexicon::CalculateCompressedLengthOfUnanalyzedWords( )
504 double CompressedLengthOfUnanalyzedWords
= 0;
505 CLexicon
* MotherLexicon
= GetLexicon();
508 if ( GetAffixLocation() == WORD_FINAL
|| GetAffixLocation() == STEM_FINAL
)
510 for (i
= 0; i
< GetWords()->GetCount(); i
++)
512 if ( NULL
== GetWords()->GetAt(i
)->GetSuffixSignature() )
514 CompressedLengthOfUnanalyzedWords
+=
515 GetWords()->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
)
516 * GetWords()->GetAt(i
)->GetCorpusCount();
520 else if ( GetAffixLocation() == WORD_INITIAL
|| GetAffixLocation() == STEM_INITIAL
)
522 for (i
= 0; i
< m_pWords
->GetCount(); i
++)
524 if ( 0 == m_pWords
->GetAt(i
)->GetPrefixSignature() )
526 CompressedLengthOfUnanalyzedWords
+=
527 m_pWords
->GetAt(i
)->CalculatePhonologicalInformationContent( MotherLexicon
)
528 * m_pWords
->GetAt(i
)->GetCorpusCount();
535 return CompressedLengthOfUnanalyzedWords
;
538 int CMiniLexicon::GetNumberOfAnalyzedWords (int& NumberOfUnanalyzedWords
)
541 int NumberOfAnalyzedWords
= 0;
542 NumberOfUnanalyzedWords
= 0;
543 for (int i
= 0;i
< m_pWords
->GetCount(); i
++)
545 pWord
= m_pWords
->GetAt(i
);
546 if (pWord
->IsAnalyzed() )
548 NumberOfAnalyzedWords
++;
551 NumberOfUnanalyzedWords
++;
554 return NumberOfAnalyzedWords
;
558 // Log File functions
// When logging is on, writes a table row to the log stream whose cells are the
// three strings rendered with MakeTableHeader().
// NOTE(review): this excerpt is missing original lines. One line is absent
// between the first `<<` and StartTableRow, and the statement ends on a
// dangling `<<` with no terminator or closing brace visible (presumably
// EndTableRow follows — confirm).
562 void CMiniLexicon::LogFileHeader(QString s1
, QString s2
, QString s3
)
563 { if (LogFileOn()) *GetLogFile() <<
565 StartTableRow
<< MakeTableHeader(s1
) << MakeTableHeader(s2
) << MakeTableHeader(s3
) <<
// When logging is on, writes SmallTitle(s1) to the log stream, then starts a
// table row whose cells are s2 and s3 rendered with MakeTableHeader().
// NOTE(review): one original line is missing between `SmallTitle( s1 ) <<` and
// StartTableRow, and the closing braces are not visible.
// NOTE(review): unlike the two-argument overload, no EndTableRow is written
// after the header cells here — confirm whether the row is meant to stay open.
569 void CMiniLexicon::LogFileSmallTitle(QString s1
, QString s2
, QString s3
)
570 { if (LogFileOn()) { *GetLogFile() << SmallTitle( s1
) <<
572 StartTableRow
<< MakeTableHeader(s2
) << MakeTableHeader(s3
) ;
575 void CMiniLexicon::LogFileSmallTitle(QString s
) { if (LogFileOn()) *GetLogFile() << SmallTitle( s
);}
576 void CMiniLexicon::LogFileSmallTitle(QString s
, QString t
) { if (LogFileOn()) *GetLogFile() << SmallTitle( s
) << StartTableRow
<< MakeTableHeader(t
) << EndTableRow
; }
577 void CMiniLexicon::LogFileLargeTitle(QString title
) { if (LogFileOn()) *GetLogFile() << LargeTitle(title
) << endl
; }
578 void CMiniLexicon::LogFileStartTable() { if (LogFileOn()) *GetLogFile() << StartTable
;}
579 void CMiniLexicon::LogFileEndTable() { if (LogFileOn()) *GetLogFile() << EndTable
;}
580 void CMiniLexicon::LogFileStartRow() { if (LogFileOn()) *GetLogFile() << StartTableRow
; }
581 void CMiniLexicon::LogFileEndRow() { if (LogFileOn()) *GetLogFile() << EndTableRow
; }
582 void CMiniLexicon::LogFileStartRow(QString str
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(str
); }
583 void CMiniLexicon::LogFile1SimpleString(QString s
) { if (LogFileOn()) *GetLogFile() << TableData(s
); }
584 void CMiniLexicon::LogFile (double d
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(d
) << EndTableRow
; }
585 void CMiniLexicon::LogFile (QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << EndTableRow
; }
586 void CMiniLexicon::LogFile (QString s
, int n
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData (n
) << EndTableRow
; }
587 void CMiniLexicon::LogFile (int n
, QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(n
) << TableData(s
) << EndTableRow
; }
588 void CMiniLexicon::LogFile (QString s1
, QString s2
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s1
) << TableData(s2
) << EndTableRow
; }
589 void CMiniLexicon::LogFile (QString s1
, double d
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s1
) << TableData( d
) << EndTableRow
; }
590 void CMiniLexicon::LogFile (QString s
,int i
,double d
){ if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData(i
) << TableData( d
) << EndTableRow
; }
591 void CMiniLexicon::LogFile (QString s
, int n
, int m
, double d
, double e
, double f
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< TableData(s
) << TableData(n
) << TableData(m
) << TableData(d
) << TableData(e
) << TableData(f
) << EndTableRow
; }
592 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << EndTableRow
;}
593 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << EndTableRow
;}
594 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
, QString w
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << TableData(w
) << EndTableRow
;}
595 void CMiniLexicon::LogFile (QString s
, QString t
, QString u
, QString v
, QString w
, QString x
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(s
) << TableData(t
) << TableData(u
) << TableData(v
) << TableData(w
) << TableData (x
) << EndTableRow
;}
596 void CMiniLexicon::LogFileHeader( QString s
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << EndTableRow
; }
597 void CMiniLexicon::LogFileHeader( QString s
, QString t
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << MakeTableHeader(t
) << EndTableRow
; }
598 void CMiniLexicon::LogFileHeader (QString s
, QString t
, QString u
, QString v
, QString w
, QString x
) { if (LogFileOn()) *GetLogFile() << StartTableRow
<< MakeTableHeader(s
) << MakeTableHeader(t
) << MakeTableHeader(u
) << MakeTableHeader(v
) << MakeTableHeader(w
) << MakeTableHeader (x
) << EndTableRow
;}
599 void CMiniLexicon::LogFile (int n
, double d
, QString s
) { if (LogFileOn()) *GetLogFile()<< StartTableRow
<< TableData(n
) << TableData(d
) << TableData(s
) << EndTableRow
;}