1 // Description length calculations, CDescriptionLength methods
2 // Copyright © 2009 The University of Chicago
3 #include "DescriptionLength.h"
4 #include "MiniLexicon.h"
11 #include "SignatureCollection.h"
12 #include "SuffixCollection.h"
13 #include "StemCollection.h"
14 #include "StringFunc.h"
17 // The CDLHistory class is the main user for these functions.
19 //-----------------------------------------------------------------
20 // Mini Lexicon function
21 //-----------------------------------------------------------------
// Rebuilds the description-length record of this mini-lexicon: any previous
// CDescriptionLength is deleted, a new one bound to `this` is allocated, and
// its stem and suffix phonological-information getters are called (their
// return values are ignored here).
// NOTE(review): the statement that produces the double return value is not
// visible in this excerpt -- confirm against the full source.
24 double CMiniLexicon::CalculateDescriptionLength()
// `delete` on a null pointer is a no-op, so the guard is redundant but harmless.
26 if (m_DescriptionLength
) delete m_DescriptionLength
;
27 m_DescriptionLength
= new CDescriptionLength (this);
28 m_DescriptionLength
->GetStemsTotalPhonologicalInformationContent ( );
29 m_DescriptionLength
->GetSuffixesTotalPhonologicalInformationContent ( ); // TODO: should cover affixes in general; the prefix case is not handled yet
35 //-----------------------------------------------------------------
36 // Constructors/Destructor
37 //-----------------------------------------------------------------
39 CDescriptionLength::CDescriptionLength( CMiniLexicon
* mini
)
41 m_MyMiniLexicon
= mini
;
42 m_MyLexicon
= mini
->GetLexicon();
44 m_MDLStyle
= CorpusCount
;
47 m_NumberOfStems
= mini
->GetStems()->GetCount();
48 m_StemsTotalPhonologicalInformationContent
= 0;
49 m_SumOfPointersToMyStems_CorpusCount
= 0;
50 m_SumOfPointersToMyStems_GrammarCount
= 0;
52 m_NumberOfAnalyzedWords
= mini
->GetNumberOfAnalyzedWords(m_NumberOfUnanalyzedWords
);
55 m_UnanalyzedWordsTotalPhonologicalInformationContent
= 0;
56 m_SumOfPointersToMyUnanalyzedWords_CorpusCount
= 0;
57 m_SumOfPointersToMyUnanalyzedWords_GrammarCount
= 0;
61 m_SuffixesTotalPhonologicalInformationContent
= 0;
62 m_SumOfPointersToMySuffixes_CorpusCount
= 0;
63 m_SumOfPointersToMySuffixes_GrammarCount
= 0;
66 m_SumOfInformationContentOfPointersInMySuffixSignatures
= 0;
67 m_SumOfPointersToMySuffixSignatures_CorpusCount
= 0;
68 m_SumOfPointersToMySuffixSignatures_GrammarCount
= 0;
71 m_CompressedLengthOfCorpus
= 0;
76 CDescriptionLength::CDescriptionLength( CLexicon
* lex
)
78 m_MyMiniLexicon
= NULL
;
81 m_MDLStyle
= CorpusCount
;
84 m_NumberOfStems
= lex
->GetAllStems()->count();
85 m_StemsTotalPhonologicalInformationContent
= 0;
86 m_SumOfPointersToMyStems_CorpusCount
= 0;
87 m_SumOfPointersToMyStems_GrammarCount
= 0;
90 m_NumberOfUnanalyzedWords
= 0;
91 m_UnanalyzedWordsTotalPhonologicalInformationContent
= 0;
92 m_SumOfPointersToMyUnanalyzedWords_CorpusCount
= 0;
93 m_SumOfPointersToMyUnanalyzedWords_GrammarCount
= 0;
95 m_NumberOfAnalyzedWords
= 0;
98 m_SuffixesTotalPhonologicalInformationContent
= 0;
99 m_SumOfPointersToMySuffixes_CorpusCount
= 0;
100 m_SumOfPointersToMySuffixes_GrammarCount
= 0;
103 m_SumOfInformationContentOfPointersInMySuffixSignatures
= 0;
104 m_SumOfPointersToMySuffixSignatures_CorpusCount
= 0;
105 m_SumOfPointersToMySuffixSignatures_GrammarCount
= 0;
108 m_CompressedLengthOfCorpus
= 0;
// Destructor. NOTE(review): its body is not visible in this excerpt.
113 CDescriptionLength::~CDescriptionLength()
117 //-----------------------------------------------------------------
118 // DescriptionLength ListViewItem
119 //-----------------------------------------------------------------
// Creates an item in the list view `parent`, forwarding `parent` to the
// Q3ListViewItem base class.
// NOTE(review): the constructor body is not visible in this excerpt, so which
// members (if any) it initializes cannot be stated here.
122 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListView
*parent
)
123 : Q3ListViewItem( parent
)
// Creates a top-level row in the list view `parent`. The Q3ListViewItem base
// is built from `parent` and `label`, and `type` is stored in m_SpellOutType.
// NOTE(review): the declarations of `type` and `label`, any parameter after
// `pDL`, and the rest of the body (e.g. what is done with `pDL`) are not
// visible in this excerpt. The call sites in DescriptionLengthListDisplay pass
// (list, a document-type constant, a QString label, `this`, an int).
127 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListView
*parent
,
130 CDescriptionLength
* pDL
,
132 : Q3ListViewItem( parent
, label
)
136 m_SpellOutType
= type
;
// Creates a child row underneath the list-view item `parent`. The
// Q3ListViewItem base is built from `parent` and `mini_name`, and `type` is
// stored in m_SpellOutType.
// NOTE(review): the declarations of `type` and `mini_name`, any parameter
// after `pDL`, and the rest of the body are not visible in this excerpt.
142 CDescriptionLengthListViewItem::CDescriptionLengthListViewItem( Q3ListViewItem
*parent
,
145 CDescriptionLength
* pDL
,
147 : Q3ListViewItem( parent
, mini_name
)
151 m_SpellOutType
= type
;
// Returns the display name of the grammar component this row stands for,
// chosen by switching on m_SpellOutType.
// NOTE(review): only the SUFFIX_SIGNATURES and UNANALYZED_WORDS labels are
// visible in this excerpt; the labels guarding "Stems", "Suffixes" and
// "Corpus" (presumably STEMS, SUFFIXES and CORPUS_DOC) and any default branch
// are missing -- confirm against the full source.
// Each `break` that follows a `return` is unreachable.
155 QString
CDescriptionLengthListViewItem::SpellOutType() const
157 switch (m_SpellOutType
)
160 return QString ("Stems"); break;
162 return QString ("Suffixes"); break;
163 case SUFFIX_SIGNATURES
:
164 return QString ("Suffix signatures"); break;
166 return QString ("Corpus"); break;
167 case UNANALYZED_WORDS
:
168 return QString ("Unanalyzed words"); break;
// Returns the text for this row's phonological-content value: the matching
// total is fetched from m_DL and formatted with DblToStringWithCommas.
// Suffix signatures report the fixed string "0.0".
// NOTE(review): the labels guarding the stems and suffixes branches
// (presumably STEMS and SUFFIXES) and whatever handles the remaining
// m_SpellOutType values are not visible in this excerpt.
174 QString
CDescriptionLengthListViewItem::GetSubstance() const
176 switch (m_SpellOutType
)
// Stems total (case label not visible).
179 return DblToStringWithCommas( m_DL
->GetStemsTotalPhonologicalInformationContent( ) );
181 case UNANALYZED_WORDS
:
182 return DblToStringWithCommas( m_DL
->GetUnanalyzedWordsTotalPhonologicalInformationContent( ) );
// Suffixes total (case label not visible).
185 return DblToStringWithCommas( m_DL
->GetSuffixesTotalPhonologicalInformationContent( ) );
187 case SUFFIX_SIGNATURES
:
// The `break` after the `return` is unreachable.
188 return QString ("0.0"); break;
// Returns the text for this row's internal-pointers value, formatted with
// DblToStringWithCommas. Unanalyzed words always report 0.0; suffix signatures
// report the signature-internal pointer total from m_DL.
// NOTE(review): the label guarding the CalculateCompressedLengthOfCorpus()
// branch (presumably CORPUS_DOC) and the handling of the other m_SpellOutType
// values are not visible in this excerpt.
195 QString
CDescriptionLengthListViewItem::GetLengthOfInternalPointers() const
197 switch (m_SpellOutType
)
201 case UNANALYZED_WORDS
:
202 return DblToStringWithCommas( 0.0 );
204 case SUFFIX_SIGNATURES
:
// The stray second ';' is an empty statement; it and the `break` are unreachable.
205 return DblToStringWithCommas( m_DL
->GetSumOfInformationContentOfPointersInMySuffixSignatures() );; break;
// Compressed corpus length; recalculated by m_DL on every call.
207 return DblToStringWithCommas(m_DL
-> CalculateCompressedLengthOfCorpus() );
// Returns the text for this row's pointers-to-me value: the matching
// GetSumOfPointersToMy...() total is fetched from m_DL (using that getter's
// default argument) and formatted with DblToStringWithCommas.
// NOTE(review): the labels guarding the stems and suffixes branches
// (presumably STEMS and SUFFIXES) and the handling of the remaining
// m_SpellOutType values are not visible in this excerpt.
215 QString
CDescriptionLengthListViewItem::GetLengthOfPointersToMe () const
217 switch (m_SpellOutType
)
// Stems (case label not visible).
220 return DblToStringWithCommas(m_DL
->GetSumOfPointersToMyStems ( ) );
222 case UNANALYZED_WORDS
:
223 return DblToStringWithCommas(m_DL
->GetSumOfPointersToMyUnanalyzedWords ( ) );
// Suffixes (case label not visible).
226 return DblToStringWithCommas(m_DL
-> GetSumOfPointersToMySuffixes ( ) );
228 case SUFFIX_SIGNATURES
:
// The `break` after the `return` is unreachable.
229 return DblToStringWithCommas(m_DL
-> GetSumOfPointersToMySuffixSignatures ( ) ); break;
// Returns the sort key for `column`; the bool argument is unnamed and unused.
// The visible branches return SpellOutType(), GetSubstance(), and the constant
// "x" as the last return.
// NOTE(review): the switch statement and its case labels are not visible in
// this excerpt, so the column-to-branch mapping cannot be confirmed here.
// The commented-out lines below are leftovers from an earlier
// m_dlhistory-based implementation.
239 QString
CDescriptionLengthListViewItem::key(int column
, bool) const
243 return SpellOutType();
245 return GetSubstance();
247 // return m_dlhistory->getRemark( m_index );
249 // total_dl += m_dlhistory->getStemDL(m_index);
250 // total_dl += m_dlhistory->getAffixDL(m_index);
251 // total_dl += m_dlhistory->getSigDL(m_index);
253 // return QString("%1").arg( (int)(1000 * total_dl), 10 );
255 // return QString("%1").arg( (int)(1000 * m_dlhistory->getUnanalyzedStemDL(m_index)), 10 );
257 // return QString("%1").arg( (int)(1000 * m_dlhistory->getAnalyzedStemDL(m_index)), 10 );
259 // return QString("%1").arg( (int)(1000 * m_dlhistory->getStemDL(m_index)), 10 );
261 // return QString("%1").arg( (int)(1000 * m_dlhistory->getAffixDL(m_index)), 10 );
263 // return QString("%1").arg( (int)(1000 * m_dlhistory->getSigDL(m_index)), 10 );
265 return QString ("x");
266 // return QListViewItem::key( column, ascending );
// Returns the text shown in `column` for this row. The visible branches
// return, in source order: the component name, the phonological content, the
// internal-pointer length and the pointers-to-me length; the last return is a
// single space.
// NOTE(review): the switch statement and its case labels are not visible in
// this excerpt, so the column-to-branch mapping cannot be confirmed here.
// The commented-out lines are leftovers from an earlier m_dlhistory-based
// implementation.
270 QString
CDescriptionLengthListViewItem::text( int column
) const
272 // double total_dl = 0.0; LK 7.18.08: unused variable
277 return SpellOutType();
279 return GetSubstance() ;
281 return GetLengthOfInternalPointers();
284 return GetLengthOfPointersToMe() ;
286 // return DblToStringWithCommas( m_dlhistory->getAnalyzedStemDL(m_index) );
288 // return DblToStringWithCommas( m_dlhistory->getStemDL(m_index) );
290 // return DblToStringWithCommas( m_dlhistory->getAffixDL(m_index) );
292 // return DblToStringWithCommas( m_dlhistory->getSigDL(m_index) );
294 return QString (" ");
295 // return QListViewItem::text( column );
303 //-----------------------------------------------------------------
304 // DescriptionLength ListDisplay
305 //-----------------------------------------------------------------
// Rebuilds `list` as the description-length table: all existing columns are
// removed, the column headers are added, alignments are set, and one row per
// grammar component (corpus, suffix signatures, suffixes, stems, unanalyzed
// words) is created, each bound to this CDescriptionLength.
// NOTE(review): the declaration and initial value of `LineNumber` are not
// visible in this excerpt.
308 void CDescriptionLength::DescriptionLengthListDisplay( Q3ListView
* list
)
310 // Remove all previous columns
311 while( list
->columns() ) list
->removeColumn( 0 );
313 // Add Column headers
314 list
->addColumn( "Component" );
315 list
->addColumn( "Phonological content" );
316 list
->addColumn( "Internal pointers" );
317 list
->addColumn( "External pointers" );
318 list
->addColumn( "Pointers to me" );
319 list
->addColumn( "Unanalyzed words");
320 list
->addColumn( "Stems");
// Column 0 is left-aligned, all value columns right-aligned.
// NOTE(review): seven columns are added above (indices 0-6), yet alignment is
// also set for indices 7 and 8 -- confirm whether those two calls are needed.
322 list
->setColumnAlignment( 0, Qt::AlignLeft
);
323 list
->setColumnAlignment( 1, Qt::AlignRight
); //Phonological content
324 list
->setColumnAlignment( 2, Qt::AlignRight
);
325 list
->setColumnAlignment( 3, Qt::AlignRight
);
326 list
->setColumnAlignment( 4, Qt::AlignRight
);
327 list
->setColumnAlignment( 5, Qt::AlignRight
);
328 list
->setColumnAlignment( 6, Qt::AlignRight
);
329 list
->setColumnAlignment( 7, Qt::AlignRight
);
330 list
->setColumnAlignment( 8, Qt::AlignRight
);
// One row per component. The result of `new` is deliberately discarded;
// presumably the list view takes ownership of items created with it as parent
// (TODO confirm against the Q3ListViewItem documentation).
333 (void) new CDescriptionLengthListViewItem ( list
, CORPUS_DOC
, QString ("Corpus"), this, ++LineNumber
);
334 (void) new CDescriptionLengthListViewItem ( list
, SUFFIX_SIGNATURES
, QString ("Suffix signatures"), this, ++LineNumber
);
335 (void) new CDescriptionLengthListViewItem ( list
, SUFFIXES
, QString ("Suffixes"), this, ++LineNumber
);
336 (void) new CDescriptionLengthListViewItem ( list
, STEMS
, QString ("Stems"), this, ++LineNumber
);
337 (void) new CDescriptionLengthListViewItem ( list
, UNANALYZED_WORDS
, QString ("UnanalyzedWords"), this, ++LineNumber
);
341 // list->setSorting (1); //go back to sorting
344 // CDescriptionLengthListViewItem* item = new CDescriptionLengthListViewItem( list,
350 //-----------------------------------------------------------------
351 // Phonological content
352 //-----------------------------------------------------------------
354 double CDescriptionLength::GetStemsTotalPhonologicalInformationContent ( )
357 if ( m_StemsTotalPhonologicalInformationContent
> 0)
359 return m_StemsTotalPhonologicalInformationContent
;
364 m_StemsTotalPhonologicalInformationContent
= m_MyMiniLexicon
->GetStems()->CalculateTotalPhonologicalInformationContent(m_MyLexicon
);
366 else if (m_MyLexicon
)
368 m_StemsTotalPhonologicalInformationContent
= m_MyLexicon
->CalculateTotalPhonologicalInformationContentOfStems();
371 return m_StemsTotalPhonologicalInformationContent
;
375 double CLexicon::CalculateTotalPhonologicalInformationContentOfStems()
378 for (Q3DictIterator
<StemSet
> iter(m_AllStems
);
379 iter
.current() != 0; ++iter
) {
380 StemSet
& stems
= *iter
.current();
382 foreach (CStem
* pStem
, stems
)
383 total
+= pStem
->GetPhonologicalInformationContent(this);
389 double CDescriptionLength::GetUnanalyzedWordsTotalPhonologicalInformationContent ( )
390 //double CDescriptionLength::GetUnanalyzedWordsTotalPhonologicalContent ( eMDL_STYLE CorpusCount) LK 7.18.08: unused parameter 'CorpusCount'
392 if ( m_UnanalyzedWordsTotalPhonologicalInformationContent
> 0)
394 return m_UnanalyzedWordsTotalPhonologicalInformationContent
;
399 m_UnanalyzedWordsTotalPhonologicalInformationContent
= m_MyMiniLexicon
-> CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
401 else if (m_MyLexicon
)
403 m_StemsTotalPhonologicalInformationContent
= m_MyLexicon
->CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
405 return m_UnanalyzedWordsTotalPhonologicalInformationContent
;
409 double CLexicon::CalculateUnanalyzedWordsTotalPhonologicalInformationContent()
414 Q3DictIterator
<StemSet
> it( m_AllWords
);
415 for( ; it
.current(); ++it
)
417 //for( pWord = it.current()->first(); pWord; pWord = it.current()->next() )
418 for (int z
= 0; z
< it
.current()->size(); z
++)
419 { pWord
= it
.current()->at(z
);
420 total
+= pWord
->CalculatePhonologicalInformationContent( this);
426 double CDescriptionLength::GetSuffixesTotalPhonologicalInformationContent ( )
430 if ( m_SuffixesTotalPhonologicalInformationContent
> 0)
432 return m_SuffixesTotalPhonologicalInformationContent
;
437 m_SuffixesTotalPhonologicalInformationContent
= 0;
438 CSuffixCollection
& suffixes
= *m_MyMiniLexicon
->GetSuffixes();
439 for (i
= 0; i
< suffixes
.GetCount(); ++i
) {
440 pSuffix
= suffixes
.GetAt(i
);
442 m_SuffixesTotalPhonologicalInformationContent
+=
443 pSuffix
->GetPhonologicalInformationContent();
446 else if (m_MyLexicon
)
448 m_SuffixesTotalPhonologicalInformationContent
= 0;
449 m_SuffixesTotalPhonologicalInformationContent
= m_MyLexicon
->GetPhonologicalInformationContentOfSuffixes( );
452 return m_SuffixesTotalPhonologicalInformationContent
;
456 double CLexicon::GetPhonologicalInformationContentOfSuffixes( ) //double check this. This is not used for MiniLexicons -- only for full CLexicon.
459 for (Q3DictIterator
<SuffixSet
> iter(m_AllSuffixes
);
460 iter
.current() != 0; ++iter
) {
461 SuffixSet
& suffixes
= *iter
.current();
463 foreach (CSuffix
* pSuffix
, suffixes
)
464 total
+= pSuffix
->GetPhonologicalInformationContent();
469 //-----------------------------------------------------------------
470 // Pointer information: Stems
471 //-----------------------------------------------------------------
473 double CDescriptionLength::GetSumOfPointersToMyStems( eMDL_STYLE MDLStyle
)
480 if ( m_SumOfPointersToMyStems_CorpusCount
== 0 )
482 result
= m_MyMiniLexicon
->GetStems()->CalculateSumOfPointersToMyStems (CorpusCount
);
483 m_SumOfPointersToMyStems_CorpusCount
= result
;
487 return m_SumOfPointersToMyStems_CorpusCount
;
491 if ( m_SumOfPointersToMyStems_CorpusCount
== 0 )
493 result
= m_MyMiniLexicon
->GetStems()->CalculateSumOfPointersToMyStems (GrammarCount
);
494 m_SumOfPointersToMyStems_GrammarCount
= result
;
498 return m_SumOfPointersToMyStems_GrammarCount
;
508 double CDescriptionLength::GetSumOfPointersToMyUnanalyzedWords( eMDL_STYLE MDLStyle
)
515 if ( m_SumOfPointersToMyUnanalyzedWords_CorpusCount
== 0 )
517 result
= m_MyMiniLexicon
->CalculateSumOfPointersToMyUnanalyzedWords (CorpusCount
);
518 m_SumOfPointersToMyUnanalyzedWords_CorpusCount
= result
;
522 return m_SumOfPointersToMyUnanalyzedWords_CorpusCount
;
526 if ( m_SumOfPointersToMyUnanalyzedWords_CorpusCount
== 0 )
528 result
= m_MyMiniLexicon
->CalculateSumOfPointersToMyUnanalyzedWords(GrammarCount
);
529 m_SumOfPointersToMyUnanalyzedWords_GrammarCount
= result
;
533 return m_SumOfPointersToMyUnanalyzedWords_GrammarCount
;
544 //-----------------------------------------------------------------
545 // Pointer information: Suffixes
546 //-----------------------------------------------------------------
548 double CDescriptionLength::GetSumOfPointersToMySuffixes( eMDL_STYLE MDLStyle
)
555 if ( m_SumOfPointersToMySuffixes_CorpusCount
== 0 )
557 result
= m_MyMiniLexicon
->GetSuffixes()->CalculatePointersToMySuffixes (CorpusCount
);
558 m_SumOfPointersToMySuffixes_CorpusCount
= result
;
560 return m_SumOfPointersToMySuffixes_CorpusCount
;
563 if ( m_SumOfPointersToMySuffixes_CorpusCount
== 0 )
565 result
= m_MyMiniLexicon
->GetSuffixes()->CalculatePointersToMySuffixes (GrammarCount
);
566 m_SumOfPointersToMySuffixes_GrammarCount
= result
;
570 return m_SumOfPointersToMySuffixes_GrammarCount
;
581 //-----------------------------------------------------------------
582 // Pointer information: Suffix Signatures
583 //-----------------------------------------------------------------
584 //double CDescriptionLength::GetSumOfInformationContentOfPointersInMySuffixSignatures ( ) unused parameter 'RecomputeFlag
585 double CDescriptionLength::GetSumOfInformationContentOfPointersInMySuffixSignatures ( )
588 m_SumOfInformationContentOfPointersInMySuffixSignatures
=
589 m_MyMiniLexicon
->GetSignatures()->ComputeDLofInternalPointersOfEachMember( GrammarCount
);
591 return m_SumOfInformationContentOfPointersInMySuffixSignatures
;
594 double CDescriptionLength::GetSumOfPointersToMySuffixSignatures( eMDL_STYLE MDLStyle
)
601 if ( m_SumOfPointersToMySuffixSignatures_CorpusCount
== 0 )
603 result
= m_MyMiniLexicon
->GetSignatures()->ComputeLengthOfPointersToEachOfMyMembers (CorpusCount
);
604 m_SumOfPointersToMySuffixSignatures_CorpusCount
= result
;
607 return m_SumOfPointersToMySuffixSignatures_CorpusCount
;
611 if ( m_SumOfPointersToMySuffixSignatures_CorpusCount
== 0 )
613 result
= m_MyMiniLexicon
->GetSignatures()->ComputeLengthOfPointersToEachOfMyMembers(GrammarCount
);
614 m_SumOfPointersToMySuffixSignatures_GrammarCount
= result
;
618 return m_SumOfPointersToMySuffixSignatures_GrammarCount
;
632 //-----------------------------------------------------------------
634 //-----------------------------------------------------------------
635 double CDescriptionLength::CalculateCompressedLengthOfCorpus()
639 m_CompressedLengthOfCorpus
= 0;
641 for (i
= 0; i
< m_MyMiniLexicon
->GetSignatures()->GetCount(); i
++)
643 pSig
= m_MyMiniLexicon
->GetSignatures()->GetAt(i
);
644 m_CompressedLengthOfCorpus
+= pSig
->ComputeDLofMyCorpus();
647 m_CompressedLengthOfCorpus
+= m_MyMiniLexicon
->CalculateCompressedLengthOfUnanalyzedWords( );
649 return m_CompressedLengthOfCorpus
;
// NOTE(review): these three prototypes look like a leftover list of CLexicon
// helpers. The first two names do not match any function defined in this
// excerpt ("...PhonologicalContent..." vs. "...PhonologicalInformationContent...").
// They may sit inside a comment block whose delimiters are not visible here --
// confirm before relying on or removing them.
654 double CalculateTotalPhonologicalContentOfStems();
655 double CalculateUnanalyzedWordsTotalPhonologicalContent();
656 double GetPhonologicalInformationContentOfSuffixes( );
659 //////////////////////////////////////////////////////////////////////////////
661 /// Description Length of Stem Collection
663 //////////////////////////////////////////////////////////////////////////////
665 double CStemCollection::CalculateTotalPhonologicalInformationContent(CLexicon
* MotherLexicon
)
667 double StemsTotalPhonologicalContent
= 0;
670 for (int i
= 0; i
< m_pMiniLex
->GetStems()->GetCount(); ++i
) {
671 pStem
= m_pMiniLex
->GetStems()->GetAt(i
);
672 StemsTotalPhonologicalContent
+= pStem
->CalculatePhonologicalInformationContent( MotherLexicon
);
674 return StemsTotalPhonologicalContent
;
676 double CStemCollection::CalculateSumOfPointersToMyStems ( eMDL_STYLE style
)
679 double TotalLength
= 0,
685 for (i
= 0; i
< GetCount();i
++)
687 denominator
+= GetAt(i
)->GetCorpusCount();
690 denominator
+= GetMiniLexicon()->GetCorpusCountOfUnanalyzedWords();
692 for (i
= 0; i
< GetCount();i
++)
694 length
= base2log ( denominator
/ GetAt(i
)->GetCorpusCount() ) ;
695 GetAt(i
)->SetLengthOfPointerToMe ( length
);
696 TotalLength
+= length
;
702 for (i
= 0; i
< GetCount();i
++)
704 denominator
+= GetAt(i
)->GetNumberOfSuffixes();
706 int NumberOfUnanalyzedWords
;
707 GetMiniLexicon()->GetNumberOfAnalyzedWords ( NumberOfUnanalyzedWords
);
709 denominator
+= NumberOfUnanalyzedWords
;
712 for (i
= 0; i
< GetCount();i
++)
714 length
= base2log ( denominator
/ GetAt(i
)->GetNumberOfSuffixes() );
715 GetAt(i
)->SetLengthOfPointerToMe ( length
);
716 TotalLength
+= length
;