CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / CompoundCollection.cpp
blob1e945d0e6c5182186e75522dc03d8d2fce041ab4
1 // Implementation of CCompoundCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "CompoundCollection.h"
5 #include <QList>
6 #include "linguisticamainwindow.h"
7 #include "ui/Status.h"
8 #include "MiniLexicon.h"
9 #include "Lexicon.h"
10 #include "Linker.h"
11 #include "Edge.h"
12 #include "Stem.h"
13 #include "LinkerCollection.h"
14 #include "WordCollection.h"
15 #include "log2.h"
17 class CPhoneCollection;
19 //////////////////////////////////////////////////////////////////////
20 // Construction/Destruction
21 //////////////////////////////////////////////////////////////////////
23 CCompoundCollection::CCompoundCollection( CMiniLexicon* Lex )
25 m_pMiniLex = Lex;
26 if( m_pMiniLex ) m_pLexicon = m_pMiniLex->GetLexicon();
27 m_ComponentCount = 0.0;
28 m_LinkerCount = 0.0;
31 CCompoundCollection::CCompoundCollection( CLexicon* Lex )
33 m_pMiniLex = NULL;
34 m_pLexicon = Lex;
35 m_ComponentCount = 0.0;
36 m_LinkerCount = 0.0;
39 CCompoundCollection::~CCompoundCollection()
41 m_pMiniLex = NULL;
42 m_pLexicon = NULL;
43 m_ComponentCount = 0.0;
44 m_LinkerCount = 0.0;
47 void CCompoundCollection::FindMostProbableParse()
49 QList<CStem*>* stemSet;
50 CMiniLexicon* mini;
51 mini = NULL;
53 CCompound* pCompound;
54 CLinker* pLinker;
55 CEdge* pEdge;
56 CParse oneParse;
58 int j;
60 linguistica::ui::status_user_agent status = m_pLexicon->status_display();
62 status.major_operation = "Counting Components";
63 status.progress.clear();
64 status.progress.set_denominator(GetCount());
65 Sort(KEY);
66 for (int i = 0; i < GetCount(); i++) {
67 status.progress = i;
68 pCompound = GetAtSort(i);
70 double component_count,
71 parse_log_prob,
72 smallest_plog = 0.0;
74 uint most_probable_parse = 0;
76 for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
77 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
78 { // pEdge = pCompound->GetParses()->at(z);
79 pEdge->GetParse( &oneParse );
80 parse_log_prob = 0.0;
81 for( j = 1; j <= oneParse.Size(); j++ )
83 stemSet = m_pLexicon->GetStemSet( oneParse.GetPiece(j) );
84 if( stemSet )
86 // TODO: we might be adding the same stem more than once, check and fix
87 component_count = 0.0;
88 if (stemSet->size() > 0) {
89 for (int y = 0; y < stemSet->size(); y++) {
90 CStem* pStem = stemSet->at(y);
91 component_count += pStem->GetCompoundCount();
94 CStem* pStem = stemSet->at(stemSet->size() - 1);
95 parse_log_prob -= base2log( pStem->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
98 else
100 // Since this is not in the list of stems, it could be an
101 // unanalyzed word. We should only look at the last mini-
102 // lexicon
103 for( i = m_pLexicon->GetMiniSize(); i >= 0; i++ )
105 mini = m_pLexicon->GetMiniLexicon(i);
106 if( mini )
108 if (CStem* pStem = *mini->GetWords() ^= oneParse.GetPiece(j)) {
109 parse_log_prob -= base2log( pStem->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
111 break;
115 // Otherwise must be a linker
116 if( !mini )
118 pLinker = *m_pLexicon->GetLinkers() ^= oneParse.GetPiece(j);
119 Q_ASSERT( pLinker );
120 if( pLinker )
122 parse_log_prob -= base2log( pLinker->GetCompoundCount() / m_ComponentCount + m_LinkerCount );
128 if( parse_log_prob < smallest_plog || smallest_plog == 0.0 )
130 smallest_plog = parse_log_prob;
131 most_probable_parse = pCompound->GetParses()->find( pEdge );
134 pCompound->SetBestParse( most_probable_parse );
136 m_pLexicon->UpdateCompound( pCompound->Display() );
138 status.progress.clear();
139 status.major_operation.clear();
142 void CCompoundCollection::CheckAndRecount()
144 CCompound* pCompound;
145 CEdge* pParse;
147 CParse parse;
149 StemSet* pStemSet;
150 CStem* pStem;
152 QList<CCompound*> invalidCmpds;
154 // Reset all stem's compound counts to 0.0
155 Q3DictIterator<StemSet> it( *m_pLexicon->GetAllStems() );
156 for( ; it.current(); ++it )
158 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
159 for (int z= 0; z < it.current()->size(); z++)
161 pStem = it.current()->at(z);
162 pStem->SetCompoundCount( 0.0 );
166 it = Q3DictIterator<StemSet>( *m_pLexicon->GetAllWords() );
167 for( ; it.current(); ++it )
169 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
170 for (int z = 0; z < it.current()->size(); z++)
171 { pStem = it.current()->at(z);
172 pStem->SetCompoundCount( 0.0 );
176 // Count components and remove compounds with missing components
177 for (int i = 0; i < GetCount(); ++i) {
178 pCompound = GetAt(i);
180 QList<CEdge*> invalidParses;
182 for( pParse = pCompound->GetParses()->first(); pParse; pParse = pCompound->GetParses()->next() )
183 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
184 { // pParse = pCompound->GetParses()->at(z);
185 pParse->GetParse( &parse );
187 for( int j = 1; j <= parse.Size(); j++ )
189 pStemSet = (*m_pLexicon->GetAllStems())[ parse.GetPiece(j).Display() ];
190 if( !pStemSet ) pStemSet = (*m_pLexicon->GetAllWords())[ parse.GetPiece(j).Display() ];
191 if( !pStemSet )
193 invalidParses.append( pParse );
194 break;
197 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
198 for (int y = 0; y < pStemSet->size(); y++)
199 { pStem = pStemSet->at(y);
200 pStem->IncrementCompoundCount(
201 double(pCompound->GetCorpusCount()) /
202 pCompound->GetParses()->count());
207 // Delete invalid parses
208 //for( pParse = invalidParses.first(); pParse; pParse = invalidParses.next() )
209 for (int x=0; x < invalidParses.size(); x++)
210 { pParse=invalidParses.at(x);
211 pCompound->RemoveParse( pParse );
212 m_pLexicon->UpdateCompound( pCompound->Display() );
215 if( pCompound->GetParses()->count() == 0 ) invalidCmpds.append( pCompound );
218 // Delete invalid compounds
219 //for( pCompound = invalidCmpds.first(); pCompound; pCompound = invalidCmpds.next() )
220 for (int w = 0; w < invalidCmpds.size(); w++)
221 { pCompound = invalidCmpds.at(w);
222 m_pLexicon->UpdateCompound( pCompound->Display() );
223 RemoveMember( pCompound );