1 // Implementation of CCompoundCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "CompoundCollection.h"
6 #include "linguisticamainwindow.h"
8 #include "MiniLexicon.h"
13 #include "LinkerCollection.h"
14 #include "WordCollection.h"
// Forward declaration only; nothing in the visible part of this file uses
// CPhoneCollection.
17 class CPhoneCollection
;
19 //////////////////////////////////////////////////////////////////////
20 // Construction/Destruction
21 //////////////////////////////////////////////////////////////////////
// Constructor taking a mini-lexicon pointer.
// NOTE(review): the listing numbers jump from 23 to 26, so the member
// initializer list and the opening brace are not visible here. Presumably
// m_pMiniLex is initialised from Lex in the missing part - confirm against
// the complete file.
23 CCompoundCollection::CCompoundCollection( CMiniLexicon
* Lex
)
// If a mini-lexicon is attached, fetch the lexicon pointer from it via
// GetLexicon(); otherwise m_pLexicon keeps whatever the (not visible)
// initializer list gave it.
26 if( m_pMiniLex
) m_pLexicon
= m_pMiniLex
->GetLexicon();
// The component count starts at zero.
27 m_ComponentCount
= 0.0;
// Constructor taking a lexicon pointer directly.
// NOTE(review): the listing numbers jump from 31 to 35, so the member
// initializer list and braces are not visible; how Lex is stored cannot be
// confirmed from this listing.
31 CCompoundCollection::CCompoundCollection( CLexicon
* Lex
)
// The component count starts at zero.
35 m_ComponentCount
= 0.0;
// Destructor. The only statement visible in this listing resets the
// component count; any other cleanup (listing lines 40-42, 44-46) is not
// visible here.
39 CCompoundCollection::~CCompoundCollection()
43 m_ComponentCount
= 0.0;
// For every compound in this collection (visited through GetAtSort), walks
// the compound's parses, accumulates a score in parse_log_prob by
// subtracting base2log(...) once per parse piece, remembers the parse with
// the smallest score and hands its index to SetBestParse(). The lexicon is
// notified through UpdateCompound().
//
// NOTE(review): this listing is missing many lines (braces, the
// declarations of pCompound / pEdge / pLinker / oneParse / mini / j, the
// initial values of parse_log_prob and smallest_plog, and the if/else tests
// that choose between the stem, word and linker branches). The comments
// below describe only what the visible statements do.
47 void CCompoundCollection::FindMostProbableParse()
49 QList
<CStem
*>* stemSet
;
// Status display: label the operation, reset the progress indicator and
// set its denominator to the number of compounds.
60 linguistica::ui::status_user_agent status
= m_pLexicon
->status_display();
62 status
.major_operation
= "Counting Components";
63 status
.progress
.clear();
64 status
.progress
.set_denominator(GetCount());
// Outer loop: one iteration per compound.
66 for (int i
= 0; i
< GetCount(); i
++) {
68 pCompound
= GetAtSort(i
);
// NOTE(review): the rest of this declaration list (listing lines 71-73) is
// not visible; parse_log_prob and smallest_plog are presumably declared
// there - confirm their initial values.
70 double component_count
,
74 uint most_probable_parse
= 0;
// Iterate the parse list with its first()/next() cursor. The commented-out
// lines are an index-based alternative that was left in place.
76 for( pEdge
= pCompound
->GetParses()->first(); pEdge
; pEdge
= pCompound
->GetParses()->next() )
77 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
78 { // pEdge = pCompound->GetParses()->at(z);
// Copy this edge's parse into oneParse.
79 pEdge
->GetParse( &oneParse
);
// Pieces are addressed from 1 through Size() inclusive.
81 for( j
= 1; j
<= oneParse
.Size(); j
++ )
// Look the piece up among the lexicon's stems.
// NOTE(review): stemSet is dereferenced below; a null check is presumably
// on the missing listing lines 84-85 - confirm.
83 stemSet
= m_pLexicon
->GetStemSet( oneParse
.GetPiece(j
) );
86 // TODO: we might be adding the same stem more than once, check and fix
// Sum the compound counts of every stem in the set.
// NOTE(review): no later read of component_count is visible in this
// listing; it may be unused.
87 component_count
= 0.0;
88 if (stemSet
->size() > 0) {
89 for (int y
= 0; y
< stemSet
->size(); y
++) {
90 CStem
* pStem
= stemSet
->at(y
);
91 component_count
+= pStem
->GetCompoundCount();
// Score the piece using the last stem of the set.
// NOTE(review): if this statement sits outside the size() > 0 guard above,
// at(size() - 1) is evaluated with index -1 for an empty set - confirm
// where the closing braces fall.
94 CStem
* pStem
= stemSet
->at(stemSet
->size() - 1);
// NOTE(review): '/' binds tighter than '+', so the argument evaluates as
// (GetCompoundCount() / m_ComponentCount) + m_LinkerCount. If the intended
// denominator is (m_ComponentCount + m_LinkerCount), parentheses are
// missing here and in the two similar statements below - confirm.
95 parse_log_prob
-= base2log( pStem
->GetCompoundCount() / m_ComponentCount
+ m_LinkerCount
);
// (The original sentence below is cut off in this listing after "mini-".)
100 // Since this is not in the list of stems, it could be an
101 // unanalyzed word. We should only look at the last mini-
// Scan the mini-lexicons for the piece among their words.
// NOTE(review): the loop starts at GetMiniSize() and runs while i >= 0,
// yet the step is i++; a scan from the last mini-lexicon downwards would
// need i--. As written, i only grows - confirm and fix.
// NOTE(review): no separate declaration of i is visible, so this may be
// overwriting the outer compound-loop index - confirm.
103 for( i
= m_pLexicon
->GetMiniSize(); i
>= 0; i
++ )
105 mini
= m_pLexicon
->GetMiniLexicon(i
);
// The collection's ^= operator is used as a lookup here: a non-null result
// is the word entry matching the piece.
// NOTE(review): mini is dereferenced; a null check is presumably on the
// missing listing lines 106-107 - confirm.
108 if (CStem
* pStem
= *mini
->GetWords() ^= oneParse
.GetPiece(j
)) {
109 parse_log_prob
-= base2log( pStem
->GetCompoundCount() / m_ComponentCount
+ m_LinkerCount
);
115 // Otherwise must be a linker
118 pLinker
= *m_pLexicon
->GetLinkers() ^= oneParse
.GetPiece(j
);
// NOTE(review): pLinker is dereferenced; a null check is presumably on the
// missing listing lines 119-121 - confirm.
122 parse_log_prob
-= base2log( pLinker
->GetCompoundCount() / m_ComponentCount
+ m_LinkerCount
);
// Keep this parse when it scores lower than the best so far; a
// smallest_plog of exactly 0.0 is treated as "no best parse yet".
128 if( parse_log_prob
< smallest_plog
|| smallest_plog
== 0.0 )
130 smallest_plog
= parse_log_prob
;
// Remember the parse by the value the parse list's find() returns for it.
131 most_probable_parse
= pCompound
->GetParses()->find( pEdge
);
// Record the winning parse on the compound and notify the lexicon.
134 pCompound
->SetBestParse( most_probable_parse
);
136 m_pLexicon
->UpdateCompound( pCompound
->Display() );
// Clear the status display again.
138 status
.progress
.clear();
139 status
.major_operation
.clear();
// Recomputes the per-stem compound counts from scratch and prunes entries
// that are no longer valid:
//   1. sets the compound count of every stem in GetAllStems() and
//      GetAllWords() to 0.0;
//   2. for every compound and every parse, looks each piece up in the
//      all-stems dictionary (falling back to the all-words dictionary) and
//      increments the compound count of each stem found;
//   3. removes the parses collected in invalidParses, then removes the
//      compounds that are left with no parses.
//
// NOTE(review): this listing is missing many lines (braces, the
// declarations of pStem / pParse / parse / pStemSet, and the test guarding
// invalidParses.append). The comments below describe only what the visible
// statements do.
142 void CCompoundCollection::CheckAndRecount()
144 CCompound
* pCompound
;
// Compounds that end up with no parses; removed at the end.
152 QList
<CCompound
*> invalidCmpds
;
154 // Reset all stem's compound counts to 0.0
155 Q3DictIterator
<StemSet
> it( *m_pLexicon
->GetAllStems() );
156 for( ; it
.current(); ++it
)
158 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
159 for (int z
= 0; z
< it
.current()->size(); z
++)
161 pStem
= it
.current()->at(z
);
162 pStem
->SetCompoundCount( 0.0 );
// Same reset, this time over the dictionary returned by GetAllWords().
166 it
= Q3DictIterator
<StemSet
>( *m_pLexicon
->GetAllWords() );
167 for( ; it
.current(); ++it
)
169 //for( pStem = it.current()->first(); pStem; pStem = it.current()->next() )
170 for (int z
= 0; z
< it
.current()->size(); z
++)
171 { pStem
= it
.current()->at(z
);
172 pStem
->SetCompoundCount( 0.0 );
176 // Count components and remove compounds with missing components
177 for (int i
= 0; i
< GetCount(); ++i
) {
178 pCompound
= GetAt(i
);
// Parses of this compound to remove; declared here, so it starts empty
// for every compound.
180 QList
<CEdge
*> invalidParses
;
// Iterate the parse list with its first()/next() cursor. The commented-out
// lines are an index-based alternative that was left in place.
182 for( pParse
= pCompound
->GetParses()->first(); pParse
; pParse
= pCompound
->GetParses()->next() )
183 //for (int z = 0; z < pCompound->GetParses()->size(); z++)
184 { // pParse = pCompound->GetParses()->at(z);
185 pParse
->GetParse( &parse
);
// Pieces are addressed from 1 through Size() inclusive.
187 for( int j
= 1; j
<= parse
.Size(); j
++ )
// Look the piece's display string up in the all-stems dictionary and, if
// nothing is found, in the all-words dictionary.
189 pStemSet
= (*m_pLexicon
->GetAllStems())[ parse
.GetPiece(j
).Display() ];
190 if( !pStemSet
) pStemSet
= (*m_pLexicon
->GetAllWords())[ parse
.GetPiece(j
).Display() ];
// NOTE(review): the condition guarding this append (listing lines 191-192)
// is not visible; presumably it runs when pStemSet is still null, i.e. the
// piece is neither a known stem nor a known word - confirm. Also confirm
// that the same parse cannot be appended once per missing piece.
193 invalidParses
.append( pParse
);
197 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
198 for (int y
= 0; y
< pStemSet
->size(); y
++)
199 { pStem
= pStemSet
->at(y
);
// Each stem in the set gets the compound's corpus count divided by the
// compound's current number of parses (the cast keeps this a
// floating-point division).
// NOTE(review): the divisor is read before the invalid parses are removed
// below, so it presumably still includes them - confirm this is intended.
200 pStem
->IncrementCompoundCount(
201 double(pCompound
->GetCorpusCount()) /
202 pCompound
->GetParses()->count());
207 // Delete invalid parses
208 //for( pParse = invalidParses.first(); pParse; pParse = invalidParses.next() )
209 for (int x
=0; x
< invalidParses
.size(); x
++)
210 { pParse
=invalidParses
.at(x
);
211 pCompound
->RemoveParse( pParse
);
212 m_pLexicon
->UpdateCompound( pCompound
->Display() );
// A compound with no parses left is queued for removal.
215 if( pCompound
->GetParses()->count() == 0 ) invalidCmpds
.append( pCompound
);
218 // Delete invalid compounds
219 //for( pCompound = invalidCmpds.first(); pCompound; pCompound = invalidCmpds.next() )
220 for (int w
= 0; w
< invalidCmpds
.size(); w
++)
221 { pCompound
= invalidCmpds
.at(w
);
// The lexicon is notified before the compound is removed from this
// collection, while pCompound->Display() is still being read.
222 m_pLexicon
->UpdateCompound( pCompound
->Display() );
223 RemoveMember( pCompound
);