CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / DescriptionLength.cpp
blob96a51feb729a8252665650c7d0964d3db0a1fc4f
1 // Description length calculations, CDescriptionLength methods
2 // Copyright © 2009 The University of Chicago
3 #include "DescriptionLength.h"
4 #include "MiniLexicon.h"
6 #include "Lexicon.h"
7 #include "Signature.h"
8 #include "Suffix.h"
9 #include "Compound.h"
10 #include "Stem.h"
11 #include "SignatureCollection.h"
12 #include "SuffixCollection.h"
13 #include "StemCollection.h"
14 #include "StringFunc.h"
15 #include "log2.h"
17 // The CDLHistory class is the main user for these functions.
19 //-----------------------------------------------------------------
20 // Mini Lexicon function
21 //-----------------------------------------------------------------
24 double CMiniLexicon::CalculateDescriptionLength()
26 if (m_DescriptionLength) delete m_DescriptionLength;
27 m_DescriptionLength = new CDescriptionLength (this);
28 m_DescriptionLength->GetStemsTotalPhonologicalInformationContent ( );
29 m_DescriptionLength->GetSuffixesTotalPhonologicalInformationContent ( ); // Should be affixes, deal with prefixes case
31 return 0;
35 //-----------------------------------------------------------------
36 // Constructors/Destructor
37 //-----------------------------------------------------------------
39 CDescriptionLength::CDescriptionLength( CMiniLexicon* mini)
41 m_MyMiniLexicon = mini;
42 m_MyLexicon = mini->GetLexicon();
43 m_TotalDL = 0;
44 m_MDLStyle = CorpusCount;
46 // Stems
47 m_NumberOfStems = mini->GetStems()->GetCount();
48 m_StemsTotalPhonologicalInformationContent = 0;
49 m_SumOfPointersToMyStems_CorpusCount = 0;
50 m_SumOfPointersToMyStems_GrammarCount = 0;
52 m_NumberOfAnalyzedWords = mini->GetNumberOfAnalyzedWords(m_NumberOfUnanalyzedWords);
54 // Unanalyzed words
55 m_UnanalyzedWordsTotalPhonologicalInformationContent = 0;
56 m_SumOfPointersToMyUnanalyzedWords_CorpusCount = 0;
57 m_SumOfPointersToMyUnanalyzedWords_GrammarCount = 0;
60 // Suffixes
61 m_SuffixesTotalPhonologicalInformationContent = 0;
62 m_SumOfPointersToMySuffixes_CorpusCount = 0;
63 m_SumOfPointersToMySuffixes_GrammarCount = 0;
65 // Suffix signatures
66 m_SumOfInformationContentOfPointersInMySuffixSignatures = 0;
67 m_SumOfPointersToMySuffixSignatures_CorpusCount = 0;
68 m_SumOfPointersToMySuffixSignatures_GrammarCount = 0;
70 // Corpus
71 m_CompressedLengthOfCorpus = 0;
76 CDescriptionLength::CDescriptionLength( CLexicon* lex)
78 m_MyMiniLexicon = NULL;
79 m_MyLexicon = lex;
80 m_TotalDL = 0;
81 m_MDLStyle = CorpusCount;
83 // Stems
84 m_NumberOfStems = lex->GetAllStems()->count();
85 m_StemsTotalPhonologicalInformationContent = 0;
86 m_SumOfPointersToMyStems_CorpusCount = 0;
87 m_SumOfPointersToMyStems_GrammarCount = 0;
89 // Unanalyzed words
90 m_NumberOfUnanalyzedWords = 0;
91 m_UnanalyzedWordsTotalPhonologicalInformationContent = 0;
92 m_SumOfPointersToMyUnanalyzedWords_CorpusCount = 0;
93 m_SumOfPointersToMyUnanalyzedWords_GrammarCount = 0;
95 m_NumberOfAnalyzedWords = 0;
97 // Suffixes
98 m_SuffixesTotalPhonologicalInformationContent = 0;
99 m_SumOfPointersToMySuffixes_CorpusCount = 0;
100 m_SumOfPointersToMySuffixes_GrammarCount = 0;
102 // Suffix signatures
103 m_SumOfInformationContentOfPointersInMySuffixSignatures = 0;
104 m_SumOfPointersToMySuffixSignatures_CorpusCount = 0;
105 m_SumOfPointersToMySuffixSignatures_GrammarCount = 0;
107 // Corpus
108 m_CompressedLengthOfCorpus = 0;
113 CDescriptionLength::~CDescriptionLength()
117 //-----------------------------------------------------------------
118 // DescriptionLength ListViewItem
119 //-----------------------------------------------------------------
122 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListView *parent )
123 : Q3ListViewItem( parent )
126 // DL f2
127 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListView *parent,
128 eDocumentType type,
129 QString label,
130 CDescriptionLength* pDL,
131 int index )
132 : Q3ListViewItem( parent, label )
134 m_DL = pDL;
135 m_index = index;
136 m_SpellOutType = type;
137 // m_label = label;
141 // DL f3
142 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListViewItem *parent,
143 eDocumentType type,
144 QString mini_name,
145 CDescriptionLength* pDL,
146 int index )
147 : Q3ListViewItem( parent, mini_name )
149 m_DL = pDL;
150 m_index = index;
151 m_SpellOutType = type;
154 // DL f5
155 QString CDescriptionLengthListViewItem::SpellOutType() const
157 switch (m_SpellOutType)
159 case STEMS:
160 return QString ("Stems"); break;
161 case SUFFIXES:
162 return QString ("Suffixes"); break;
163 case SUFFIX_SIGNATURES:
164 return QString ("Suffix signatures"); break;
165 case CORPUS_DOC:
166 return QString ("Corpus"); break;
167 case UNANALYZED_WORDS:
168 return QString ("Unanalyzed words"); break;
169 default:
170 return QString ("");
173 // DL f6
174 QString CDescriptionLengthListViewItem::GetSubstance() const
176 switch (m_SpellOutType)
178 case STEMS:
179 return DblToStringWithCommas( m_DL->GetStemsTotalPhonologicalInformationContent( ) );
180 break;
181 case UNANALYZED_WORDS:
182 return DblToStringWithCommas( m_DL->GetUnanalyzedWordsTotalPhonologicalInformationContent( ) );
183 break;
184 case SUFFIXES:
185 return DblToStringWithCommas( m_DL->GetSuffixesTotalPhonologicalInformationContent( ) );
186 break;
187 case SUFFIX_SIGNATURES:
188 return QString ("0.0"); break;
189 default:
190 return QString ("");
194 // DL f7
195 QString CDescriptionLengthListViewItem::GetLengthOfInternalPointers() const
197 switch (m_SpellOutType)
199 case STEMS:
200 case SUFFIXES:
201 case UNANALYZED_WORDS:
202 return DblToStringWithCommas( 0.0 );
203 break;
204 case SUFFIX_SIGNATURES:
205 return DblToStringWithCommas( m_DL->GetSumOfInformationContentOfPointersInMySuffixSignatures() );; break;
206 case CORPUS_DOC:
207 return DblToStringWithCommas(m_DL-> CalculateCompressedLengthOfCorpus() );
209 default:
210 return QString ("");
214 // DL f8
215 QString CDescriptionLengthListViewItem::GetLengthOfPointersToMe () const
217 switch (m_SpellOutType)
219 case STEMS:
220 return DblToStringWithCommas(m_DL->GetSumOfPointersToMyStems ( ) );
221 break;
222 case UNANALYZED_WORDS:
223 return DblToStringWithCommas(m_DL->GetSumOfPointersToMyUnanalyzedWords ( ) );
224 break;
225 case SUFFIXES:
226 return DblToStringWithCommas(m_DL-> GetSumOfPointersToMySuffixes ( ) );
227 break;
228 case SUFFIX_SIGNATURES:
229 return DblToStringWithCommas(m_DL-> GetSumOfPointersToMySuffixSignatures ( ) ); break;
230 default:
231 return QString ("");
238 // DL f4.1
239 QString CDescriptionLengthListViewItem::key(int column, bool) const
240 { switch( column )
242 case 0:
243 return SpellOutType();
244 case 1:
245 return GetSubstance();
246 // case 2:
247 // return m_dlhistory->getRemark( m_index );
248 // case 3:
249 // total_dl += m_dlhistory->getStemDL(m_index);
250 // total_dl += m_dlhistory->getAffixDL(m_index);
251 // total_dl += m_dlhistory->getSigDL(m_index);
253 // return QString("%1").arg( (int)(1000 * total_dl), 10 );
254 // case 4:
255 // return QString("%1").arg( (int)(1000 * m_dlhistory->getUnanalyzedStemDL(m_index)), 10 );
256 // case 5:
257 // return QString("%1").arg( (int)(1000 * m_dlhistory->getAnalyzedStemDL(m_index)), 10 );
258 // case 6:
259 // return QString("%1").arg( (int)(1000 * m_dlhistory->getStemDL(m_index)), 10 );
260 // case 7:
261 // return QString("%1").arg( (int)(1000 * m_dlhistory->getAffixDL(m_index)), 10 );
262 // case 8:
263 // return QString("%1").arg( (int)(1000 * m_dlhistory->getSigDL(m_index)), 10 );
264 default:
265 return QString ("x");
266 // return QListViewItem::key( column, ascending );
269 // DL f4.2
270 QString CDescriptionLengthListViewItem::text( int column ) const
272 // double total_dl = 0.0; LK 7.18.08: unused varible
274 switch( column )
276 case 0:
277 return SpellOutType();
278 case 1:
279 return GetSubstance() ;
280 case 2:
281 return GetLengthOfInternalPointers();
282 // case 3:
283 case 4:
284 return GetLengthOfPointersToMe() ;
285 // case 5:
286 // return DblToStringWithCommas( m_dlhistory->getAnalyzedStemDL(m_index) );
287 // case 6:
288 // return DblToStringWithCommas( m_dlhistory->getStemDL(m_index) );
289 // case 7:
290 // return DblToStringWithCommas( m_dlhistory->getAffixDL(m_index) );
291 // case 8:
292 // return DblToStringWithCommas( m_dlhistory->getSigDL(m_index) );
293 default:
294 return QString (" ");
295 // return QListViewItem::text( column );
303 //-----------------------------------------------------------------
304 // DescriptionLength ListDisplay
305 //-----------------------------------------------------------------
307 // DL f1
308 void CDescriptionLength::DescriptionLengthListDisplay( Q3ListView* list )
310 // Remove all previous columns
311 while( list->columns() ) list->removeColumn( 0 );
313 // Add Column headers
314 list->addColumn( "Component" );
315 list->addColumn( "Phonological content" );
316 list->addColumn( "Internal pointers" );
317 list->addColumn( "External pointers" );
318 list->addColumn( "Pointers to me" );
319 list->addColumn( "Unanalyzed words");
320 list->addColumn( "Stems");
322 list->setColumnAlignment( 0, Qt::AlignLeft );
323 list->setColumnAlignment( 1, Qt::AlignRight ); //Phonological content
324 list->setColumnAlignment( 2, Qt::AlignRight );
325 list->setColumnAlignment( 3, Qt::AlignRight );
326 list->setColumnAlignment( 4, Qt::AlignRight );
327 list->setColumnAlignment( 5, Qt::AlignRight );
328 list->setColumnAlignment( 6, Qt::AlignRight );
329 list->setColumnAlignment( 7, Qt::AlignRight );
330 list->setColumnAlignment( 8, Qt::AlignRight );
332 int LineNumber = 0;
333 (void) new CDescriptionLengthListViewItem ( list, CORPUS_DOC, QString ("Corpus"), this, ++LineNumber );
334 (void) new CDescriptionLengthListViewItem ( list, SUFFIX_SIGNATURES, QString ("Suffix signatures"), this, ++LineNumber );
335 (void) new CDescriptionLengthListViewItem ( list, SUFFIXES, QString ("Suffixes"), this, ++LineNumber );
336 (void) new CDescriptionLengthListViewItem ( list, STEMS, QString ("Stems"), this, ++LineNumber );
337 (void) new CDescriptionLengthListViewItem ( list, UNANALYZED_WORDS, QString ("UnanalyzedWords"), this, ++LineNumber );
341 // list->setSorting (1); //go back to sorting
344 // CDescriptionLengthListViewItem* item = new CDescriptionLengthListViewItem( list,
345 // QString("Stems"),
346 // this, 0);
350 //-----------------------------------------------------------------
351 // Phonological content
352 //-----------------------------------------------------------------
354 double CDescriptionLength::GetStemsTotalPhonologicalInformationContent ( )
357 if ( m_StemsTotalPhonologicalInformationContent > 0)
359 return m_StemsTotalPhonologicalInformationContent;
362 if (m_MyMiniLexicon)
364 m_StemsTotalPhonologicalInformationContent = m_MyMiniLexicon->GetStems()->CalculateTotalPhonologicalInformationContent(m_MyLexicon);
366 else if (m_MyLexicon)
368 m_StemsTotalPhonologicalInformationContent = m_MyLexicon->CalculateTotalPhonologicalInformationContentOfStems();
371 return m_StemsTotalPhonologicalInformationContent;
375 double CLexicon::CalculateTotalPhonologicalInformationContentOfStems()
377 double total = 0.0;
378 for (Q3DictIterator<StemSet> iter(m_AllStems);
379 iter.current() != 0; ++iter) {
380 StemSet& stems = *iter.current();
382 foreach (CStem* pStem, stems)
383 total += pStem->GetPhonologicalInformationContent(this);
385 return total;
389 double CDescriptionLength::GetUnanalyzedWordsTotalPhonologicalInformationContent ( )
390 //double CDescriptionLength::GetUnanalyzedWordsTotalPhonologicalContent ( eMDL_STYLE CorpusCount) LK 7.18.08: unused parameter 'CorpusCount'
392 if ( m_UnanalyzedWordsTotalPhonologicalInformationContent > 0)
394 return m_UnanalyzedWordsTotalPhonologicalInformationContent;
397 if (m_MyMiniLexicon)
399 m_UnanalyzedWordsTotalPhonologicalInformationContent = m_MyMiniLexicon-> CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
401 else if (m_MyLexicon)
403 m_StemsTotalPhonologicalInformationContent = m_MyLexicon->CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
405 return m_UnanalyzedWordsTotalPhonologicalInformationContent;
409 double CLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent()
411 double total = 0;
412 CStem* pWord;
414 Q3DictIterator<StemSet> it( m_AllWords );
415 for( ; it.current(); ++it )
417 //for( pWord = it.current()->first(); pWord; pWord = it.current()->next() )
418 for (int z = 0; z < it.current()->size(); z++)
419 { pWord = it.current()->at(z);
420 total += pWord->CalculatePhonologicalInformationContent( this);
423 return total;
426 double CDescriptionLength::GetSuffixesTotalPhonologicalInformationContent ( )
428 int i;
429 CSuffix* pSuffix;
430 if ( m_SuffixesTotalPhonologicalInformationContent > 0)
432 return m_SuffixesTotalPhonologicalInformationContent;
435 if (m_MyMiniLexicon)
437 m_SuffixesTotalPhonologicalInformationContent = 0;
438 CSuffixCollection& suffixes = *m_MyMiniLexicon->GetSuffixes();
439 for (i = 0; i < suffixes.GetCount(); ++i) {
440 pSuffix = suffixes.GetAt(i);
442 m_SuffixesTotalPhonologicalInformationContent +=
443 pSuffix->GetPhonologicalInformationContent();
446 else if (m_MyLexicon)
448 m_SuffixesTotalPhonologicalInformationContent = 0;
449 m_SuffixesTotalPhonologicalInformationContent = m_MyLexicon->GetPhonologicalInformationContentOfSuffixes( );
452 return m_SuffixesTotalPhonologicalInformationContent;
456 double CLexicon::GetPhonologicalInformationContentOfSuffixes( ) //double check this. This is not used for MiniLexicons -- only for full CLexicon.
458 double total = 0.0;
459 for (Q3DictIterator<SuffixSet> iter(m_AllSuffixes);
460 iter.current() != 0; ++iter) {
461 SuffixSet& suffixes = *iter.current();
463 foreach (CSuffix* pSuffix, suffixes)
464 total += pSuffix->GetPhonologicalInformationContent();
466 return total;
469 //-----------------------------------------------------------------
470 // Pointer information: Stems
471 //-----------------------------------------------------------------
473 double CDescriptionLength::GetSumOfPointersToMyStems( eMDL_STYLE MDLStyle )
475 double result;
477 switch ( MDLStyle )
479 case CorpusCount:
480 if ( m_SumOfPointersToMyStems_CorpusCount == 0 )
482 result = m_MyMiniLexicon->GetStems()->CalculateSumOfPointersToMyStems (CorpusCount);
483 m_SumOfPointersToMyStems_CorpusCount = result;
485 else
487 return m_SumOfPointersToMyStems_CorpusCount;
489 break;
490 case GrammarCount:
491 if ( m_SumOfPointersToMyStems_CorpusCount == 0 )
493 result = m_MyMiniLexicon->GetStems()->CalculateSumOfPointersToMyStems (GrammarCount);
494 m_SumOfPointersToMyStems_GrammarCount = result;
496 else
498 return m_SumOfPointersToMyStems_GrammarCount;
500 break;
501 default:
502 return 0;
505 return 0;
508 double CDescriptionLength::GetSumOfPointersToMyUnanalyzedWords( eMDL_STYLE MDLStyle )
510 double result;
512 switch ( MDLStyle )
514 case CorpusCount:
515 if ( m_SumOfPointersToMyUnanalyzedWords_CorpusCount == 0 )
517 result = m_MyMiniLexicon->CalculateSumOfPointersToMyUnanalyzedWords (CorpusCount);
518 m_SumOfPointersToMyUnanalyzedWords_CorpusCount = result;
520 else
522 return m_SumOfPointersToMyUnanalyzedWords_CorpusCount;
524 break;
525 case GrammarCount:
526 if ( m_SumOfPointersToMyUnanalyzedWords_CorpusCount == 0 )
528 result = m_MyMiniLexicon->CalculateSumOfPointersToMyUnanalyzedWords(GrammarCount);
529 m_SumOfPointersToMyUnanalyzedWords_GrammarCount = result;
531 else
533 return m_SumOfPointersToMyUnanalyzedWords_GrammarCount;
535 break;
536 default:
537 return 0;
540 return 0;
544 //-----------------------------------------------------------------
545 // Pointer information: Suffixes
546 //-----------------------------------------------------------------
548 double CDescriptionLength::GetSumOfPointersToMySuffixes( eMDL_STYLE MDLStyle )
550 double result;
552 switch ( MDLStyle )
554 case CorpusCount:
555 if ( m_SumOfPointersToMySuffixes_CorpusCount == 0 )
557 result = m_MyMiniLexicon->GetSuffixes()->CalculatePointersToMySuffixes (CorpusCount);
558 m_SumOfPointersToMySuffixes_CorpusCount = result;
560 return m_SumOfPointersToMySuffixes_CorpusCount;
561 break;
562 case GrammarCount:
563 if ( m_SumOfPointersToMySuffixes_CorpusCount == 0 )
565 result = m_MyMiniLexicon->GetSuffixes()->CalculatePointersToMySuffixes (GrammarCount);
566 m_SumOfPointersToMySuffixes_GrammarCount = result;
568 else
570 return m_SumOfPointersToMySuffixes_GrammarCount;
572 break;
573 default:
574 return 0;
577 return 0;
581 //-----------------------------------------------------------------
582 // Pointer information: Suffix Signatures
583 //-----------------------------------------------------------------
584 //double CDescriptionLength::GetSumOfInformationContentOfPointersInMySuffixSignatures ( ) unused parameter 'RecomputeFlag
585 double CDescriptionLength::GetSumOfInformationContentOfPointersInMySuffixSignatures ( )
588 m_SumOfInformationContentOfPointersInMySuffixSignatures =
589 m_MyMiniLexicon->GetSignatures()->ComputeDLofInternalPointersOfEachMember( GrammarCount );
591 return m_SumOfInformationContentOfPointersInMySuffixSignatures;
594 double CDescriptionLength::GetSumOfPointersToMySuffixSignatures( eMDL_STYLE MDLStyle )
596 double result = 0;
598 switch ( MDLStyle )
600 case CorpusCount:
601 if ( m_SumOfPointersToMySuffixSignatures_CorpusCount == 0 )
603 result = m_MyMiniLexicon->GetSignatures()->ComputeLengthOfPointersToEachOfMyMembers (CorpusCount);
604 m_SumOfPointersToMySuffixSignatures_CorpusCount = result;
607 return m_SumOfPointersToMySuffixSignatures_CorpusCount;
609 break;
610 case GrammarCount:
611 if ( m_SumOfPointersToMySuffixSignatures_CorpusCount == 0 )
613 result = m_MyMiniLexicon->GetSignatures()->ComputeLengthOfPointersToEachOfMyMembers(GrammarCount);
614 m_SumOfPointersToMySuffixSignatures_GrammarCount = result;
616 else
618 return m_SumOfPointersToMySuffixSignatures_GrammarCount;
620 break;
621 default:
622 return 0;
625 return 0;
632 //-----------------------------------------------------------------
633 // Corpus
634 //-----------------------------------------------------------------
635 double CDescriptionLength::CalculateCompressedLengthOfCorpus()
637 int i;
638 CSignature * pSig;
639 m_CompressedLengthOfCorpus = 0;
641 for (i= 0; i < m_MyMiniLexicon->GetSignatures()->GetCount(); i++)
643 pSig = m_MyMiniLexicon->GetSignatures()->GetAt(i);
644 m_CompressedLengthOfCorpus += pSig->ComputeDLofMyCorpus();
647 m_CompressedLengthOfCorpus += m_MyMiniLexicon->CalculateCompressedLengthOfUnanalyzedWords( );
649 return m_CompressedLengthOfCorpus ;
654 double CalculateTotalPhonologicalContentOfStems();
655 double CalculateUnanalyzedWordsTotalPhonologicalContent();
656 double GetPhonologicalInformationContentOfSuffixes( );
659 //////////////////////////////////////////////////////////////////////////////
661 /// Description Length of Stem Collection
663 //////////////////////////////////////////////////////////////////////////////
665 double CStemCollection::CalculateTotalPhonologicalInformationContent(CLexicon* MotherLexicon)
667 double StemsTotalPhonologicalContent = 0;
668 CStem* pStem;
670 for (int i = 0; i < m_pMiniLex->GetStems()->GetCount(); ++i) {
671 pStem = m_pMiniLex->GetStems()->GetAt(i);
672 StemsTotalPhonologicalContent += pStem->CalculatePhonologicalInformationContent( MotherLexicon );
674 return StemsTotalPhonologicalContent;
676 double CStemCollection::CalculateSumOfPointersToMyStems ( eMDL_STYLE style)
678 int i;
679 double TotalLength = 0,
680 length = 0,
681 denominator = 0;
682 switch (style)
684 case CorpusCount:
685 for (i = 0; i < GetCount();i++)
687 denominator += GetAt(i)->GetCorpusCount();
689 //////////////
690 denominator += GetMiniLexicon()->GetCorpusCountOfUnanalyzedWords();
691 /////////////
692 for (i = 0; i < GetCount();i++)
694 length = base2log ( denominator / GetAt(i)->GetCorpusCount() ) ;
695 GetAt(i)->SetLengthOfPointerToMe ( length );
696 TotalLength += length;
698 break;
701 case GrammarCount:
702 for (i = 0; i < GetCount();i++)
704 denominator += GetAt(i)->GetNumberOfSuffixes();
706 int NumberOfUnanalyzedWords;
707 GetMiniLexicon()->GetNumberOfAnalyzedWords ( NumberOfUnanalyzedWords);
708 //////////////
709 denominator += NumberOfUnanalyzedWords;
710 /////////////
712 for (i = 0; i < GetCount();i++)
714 length = base2log ( denominator / GetAt(i)->GetNumberOfSuffixes() );
715 GetAt(i)->SetLengthOfPointerToMe ( length );
716 TotalLength += length;
718 break;
723 return TotalLength;