HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Stem_Phonology.cpp
blob922a0a5a3658c30f9a4a6a303c05be10bf43bfa0
// Implementation of CStem phonology methods
// Copyright © 2009 The University of Chicago
3 #include "Stem.h"
4 #include "Biphone.h"
5 #include "Phone.h"
6 #include "BiphoneCollection.h"
7 #include "PhoneCollection.h"
8 #include "WordCollection.h"
9 #include "log2.h"
11 void CStem::ComputeProbabilities(CWordCollection* Words)
13 m_BigramLogProb = 0;
14 m_UnigramLogProb = 0;
16 m_Tier2_LocalMI_Score = 0;
17 m_Tier2_DistantMI_Score = 0;
19 m_LocalMI_TotalBoltzmannScore = 0;
20 m_DistantMI_TotalBoltzmannScore = 0;
22 CPhone* prevPhone = 0;
23 for (int i = 1; i <= m_Phonology_Tier1.Size(); ++i) {
24 CPhone* pPhone = *Words->GetPhones() ^=
25 m_Phonology_Tier1.GetPiece(i);
26 Q_ASSERT(pPhone != 0);
28 if (i == 1) {
29 prevPhone = pPhone;
30 continue;
32 m_UnigramLogProb += pPhone->m_LogFreq;
33 CBiphone* pBiphone = Words->GetPhones()->GetMyBiphones()
34 ->GetBiphone(prevPhone, pPhone);
35 if (pBiphone == 0)
36 return;
38 m_BigramLogProb += pPhone->m_LogFreq - pBiphone->m_MI;
39 prevPhone = pPhone;
42 m_UnigramComplexity = m_UnigramLogProb / (m_Phonology_Tier1.Size()-1);
43 m_BigramComplexity = m_BigramLogProb / (m_Phonology_Tier1.Size()-1);
45 if (m_Phonology_Tier2.Size() == 0)
46 return;
48 for (int i = 1; i <= m_Phonology_Tier2.Size(); ++i) {
49 CPhone* pPhone = *Words->GetPhones() ^=
50 m_Phonology_Tier2.GetPiece(i);
51 if (i == 1) {
52 prevPhone = pPhone;
53 continue;
55 CBiphone* pBiphone = Words->GetPhones_Tier2()->GetMyBiphones()
56 ->GetBiphone(prevPhone, pPhone);
57 if (pBiphone == 0)
58 return;
59 else
60 m_Tier2_LocalMI_Score += pBiphone->m_MI;
61 prevPhone = pPhone;
64 m_LocalMI_TotalBoltzmannScore = m_BigramLogProb - m_Tier2_LocalMI_Score;
65 m_Tier2_DistantMI_Score = 0;
67 for (int i = 1; i <= m_Phonology_Tier2.Size(); ++i) {
68 CPhone* pPhone = *Words->GetPhones() ^=
69 m_Phonology_Tier2.GetPiece (i);
70 for (int j = i+1; j <= m_Phonology_Tier2.Size(); ++j) {
71 CPhone* qPhone = *Words->GetPhones() ^=
72 m_Phonology_Tier2.GetPiece (j);
73 CBiphone* pBiphone = Words->m_Phones_Tier2->GetMyBiphones()
74 ->GetBiphone(pPhone, qPhone);
75 if (pBiphone != 0)
76 m_Tier2_DistantMI_Score += pBiphone->m_MI / (j-i);
79 m_DistantMI_TotalBoltzmannScore = m_BigramLogProb -
80 m_Tier2_DistantMI_Score;
83 void CStem::ComputeBoltzmannProbabilities(double Z, double ZStar)
85 // XXX. what if Z or Zstar == 0.0?
87 if (Z != 0.0)
88 m_LocalMI_Plog = m_LocalMI_TotalBoltzmannScore + log2(Z);
90 if (ZStar != 0.0)
91 m_DistantMI_Plog = m_BigramLogProb - m_Tier2_DistantMI_Score +
92 log2(ZStar);
95 void CStem::GetPhonogyTier1InfoForGraph(CWordCollection* Words)
97 int i;
98 CPhone* pPhone, *prevPhone;
99 prevPhone = NULL;
100 QString biphone;
101 CBiphone* pBiphone;
102 double ugram;
103 double mi;
105 m_phonologies.clear();
106 m_unigrams.clear();
107 m_mis.clear();
109 m_countofunigrams = 0;
110 m_countofmis = 0;
111 m_maxpositive =0;
112 m_maxnegative =0;
114 for (i= 1; i <= m_Phonology_Tier1.Size(); i++)
116 QString temp2 = m_Phonology_Tier1.GetPiece (i).Display();
117 pPhone = *Words->GetPhones() ^= m_Phonology_Tier1.GetPiece (i);
118 Q_ASSERT (pPhone);
119 if (i == 1)
121 prevPhone = pPhone;
122 continue;
124 ugram = pPhone->m_LogFreq;
126 pBiphone = Words->GetPhones()->GetMyBiphones()->GetBiphone (prevPhone, pPhone);
128 Q_ASSERT (pBiphone);
130 mi = pBiphone->m_MI;
133 m_phonologies.insert(m_countofunigrams, temp2);
134 m_unigrams.insert(m_countofunigrams, ugram);
135 m_mis.insert(m_countofmis, mi);
137 m_countofunigrams++;
138 m_countofmis++;
140 prevPhone = pPhone;
142 if ( ugram > 0)
144 if ( ugram > m_maxpositive)
146 m_maxpositive = ugram;
150 if ( mi > 0)
152 if ( mi > m_maxpositive)
154 m_maxpositive = mi;
157 else
159 if ( mi < m_maxnegative)
161 m_maxnegative = mi;
168 m_donephonology = true;
171 QString CStem::GetProbabilityInformation()
173 return QString("\nUnigram log probability %1"
174 "\nUnigram complexity %2"
175 "\nBigram log probability %3"
176 "\nBigram complexity %4"
177 "\nTier 2 MI score %5"
178 "\nLocal tier 2 model score: %6"
179 "\nLocal tier 2 model log probability: %7"
180 "\nDistant tier 2 MI: %8"
181 "\nDistant tier 2 model score %9")
182 .arg(m_UnigramLogProb)
183 .arg(m_UnigramComplexity)
184 .arg(m_BigramLogProb)
185 .arg(m_BigramComplexity)
186 .arg(m_Tier2_LocalMI_Score)
187 .arg(m_LocalMI_TotalBoltzmannScore)
188 .arg(m_LocalMI_Plog)
189 .arg(m_Tier2_DistantMI_Score)
190 .arg(m_DistantMI_TotalBoltzmannScore);
193 void CStem::SplitPhonologyToTiers(enum ePhonologySplitType Type,
194 CParse& PhonesToMove)
196 const QString DummySymbol = "*";
198 m_Phonology_Tier2.ClearParse();
199 for (int i = 1; i <= m_Phonology_Tier1.Size(); ++i)
200 if (PhonesToMove.Contains(m_Phonology_Tier1.GetPiece(i))) {
201 m_Phonology_Tier2.Append(m_Phonology_Tier1.GetPiece(i));
203 if (Type == Split_LeaveSlot) {
204 CStringSurrogate dummy = DummySymbol;
205 m_Phonology_Tier1.Replace(i, dummy);
210 // this is here specifically to do probabilistic tests on projections to C and V.
211 void CStem::CreateCVTemplate(CParse* Vowels)
213 QString V ("V"), C("C"); QString boundary ("#");
215 m_Phonology_Tier1_Skeleton.ClearParse();
216 m_Phonology_Tier1_Skeleton.Append ( boundary );
218 CParse VowelsAndAsterisk ( *Vowels);
219 VowelsAndAsterisk.Append ('*');
222 for (int i = 2; i < m_Phonology_Tier1.Size(); i++)
224 QString b; b= m_Phonology_Tier1.GetPiece(i).Display();
225 if (VowelsAndAsterisk.Contains ( m_Phonology_Tier1.GetPiece(i) ) )
227 m_Phonology_Tier1_Skeleton.Append ( V );
229 else
231 m_Phonology_Tier1_Skeleton.Append ( C );
234 m_Phonology_Tier1_Skeleton.Append ( boundary );
237 void CStem::CreatePhonologyFromOrthography(eAddBoundarySymbols AddBoundaries)
239 if (m_Phonology_Tier1.GetKeyLength() > 0 ) return;
241 if (AddBoundaries == BOUNDARIES)
242 m_Phonology_Tier1.Append(QChar('#'));
244 for (int i = 0; i < GetKeyLength(); ++i)
245 m_Phonology_Tier1.Append(CStringSurrogate(m_Key, i, 1));
247 if (AddBoundaries == BOUNDARIES)
248 m_Phonology_Tier1.Append(QChar('#'));
251 void CStem::SetPhonology_Tier1(CParse* PhonoRep)
253 m_Phonology_Tier1.Append(QChar('#'));
254 m_Phonology_Tier1.Append(*PhonoRep);
255 m_Phonology_Tier1.Append(QChar('#'));