HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / cMT.cpp
blobd1c5ce99b7489b20191db1e23dc0f451b1270344
1 // Implementation of the cMT class
2 // Copyright © 2009 The University of Chicago
3 #include "cMT.h"
5 #include <QMessageBox>
6 #include <Q3TextStream>
7 #include <QFile>
8 #include <Q3SortedList>
9 #include <QMap>
10 #include "cMTModel2Norm.h"
11 #include "cMTModel1.h"
12 #include "mTVolca.h"
13 #include "Typedefs.h"
16 //////////////////////////////////////////////////////////////////////
17 // Construction/Destruction
18 //////////////////////////////////////////////////////////////////////
20 cMT::cMT(LinguisticaMainWindow* parent, QString projectDirectory)
21 : m_parent(parent),
22 m_projectDirectory(projectDirectory),
23 m_Volca(),
24 m_Model1(),
25 m_Model2Norm(),
26 m_MTLog("LinguisticaMTLog.txt") { }
28 cMT::~cMT()
30 delete m_Model2Norm;
31 delete m_Model1;
32 delete m_Volca;
35 void cMT::readTrainingCorpus()
37 delete m_Volca;
38 m_Volca = new mTVolca(this, m_projectDirectory);
40 m_Volca ->initVolList();
42 m_Volca ->readSentences();
44 m_Volca ->setFastSearchPairsForT();
46 QMessageBox::information(NULL, "Status", QString("Words # in language 1 is %1!").arg(m_Volca ->m_language1TotalWords), "OK");
47 QMessageBox::information(NULL, "Status", QString("Words # in language 2 is %1!").arg(m_Volca ->m_language2TotalWords), "OK");
51 void cMT::trainModel1(int model1Iterations)
53 delete m_Model1;
54 m_Model1 = new cMTModel1(this, model1Iterations);
56 m_Model1 ->initT();
58 m_Model1 ->EMLoops(m_Model1 ->m_Iterations);
60 QMessageBox::information(NULL, "Status", "Logging T After Model1...!", "OK");
61 logTAfterModel1();
62 QMessageBox::information(NULL, "Status", "Done logging T After Model1...!", "OK");
67 void cMT::logTAfterModel1()
69 int language1WordId;
70 int language2WordId;
71 QString language1Word;
72 QString language2Word;
73 double TValue;
74 IntToIntToDouble::iterator IntToIntToDoubleIt;
75 IntToDouble* oneList;
76 IntToDouble::iterator IntToDoubleIt;
77 mTSortedList sortPlatForm;
78 mTForSortingItem* oneSortItem;
79 int outputTopLimit = 20;
80 int outputIndex;
81 QFile file( m_MTLog );
83 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
85 QMessageBox::information(NULL, "Error", "Can't Open the MT Log file!", "OK");
86 return;
89 Q3TextStream outf( &file );
91 outf << "******T Tables After Model1******" << endl <<endl;
94 sortPlatForm.setAutoDelete(TRUE);
96 for ( IntToIntToDoubleIt = (m_Model1 ->m_T).begin(); IntToIntToDoubleIt != (m_Model1 ->m_T).end(); IntToIntToDoubleIt++)
98 language1WordId = IntToIntToDoubleIt.key();
99 language1Word = (m_Volca ->m_language1WordIndex)[language1WordId] ;
100 outf << language1Word << " :"<<endl ;
102 oneList = IntToIntToDoubleIt.data();
104 for (IntToDoubleIt = oneList ->begin(); IntToDoubleIt != oneList ->end(); IntToDoubleIt++)
106 language2WordId = IntToDoubleIt.key();
107 language2Word = (m_Volca ->m_language2WordIndex)[language2WordId] ;
109 TValue = IntToDoubleIt.data();
111 oneSortItem = new mTForSortingItem(language2Word, TValue, 1);
113 sortPlatForm.append(oneSortItem);
116 sortPlatForm.sort();
118 outputIndex = 0;
119 for ( oneSortItem=sortPlatForm.first(); oneSortItem != 0; oneSortItem=sortPlatForm.next())
121 outputIndex++;
123 language2Word = oneSortItem ->m_name;
124 TValue = oneSortItem ->m_doubleValue;
126 outf <<" "<< outputIndex <<" : " <<language2Word << " " << TValue <<endl ;
128 if ( outputIndex >= outputTopLimit) break;
132 sortPlatForm.clear();
136 file.close();
141 void cMT::trainModel2Norm(int model2Iterations, bool getTFromModel1)
143 delete m_Model2Norm;
144 m_Model2Norm = new cMTModel2Norm(this, model2Iterations, getTFromModel1);
146 m_Model2Norm ->initTandA();
148 m_Model2Norm ->EMLoops(m_Model2Norm ->m_Iterations);
150 QMessageBox::information(NULL, "Status", "Doing Viterbi for Model2Norm...", "OK");
152 m_Model2Norm ->viterbiAll();
154 QMessageBox::information(NULL, "Status", "Logging TandA After Model2Norm...!", "OK");
156 logTandAAfterModel2Norm();
158 QMessageBox::information(NULL, "Status", "Done logging TandA After Model2Norm...!", "OK");
163 void cMT::logTandAAfterModel2Norm()
165 int language1WordId;
166 int language2WordId;
167 QString language1Word;
168 QString language2Word;
169 int language1ChunkId;
170 int language2ChunkId;
171 QString language1ChunkStr;
172 QString language2ChunkStr;
173 double TValue;
174 double AValue;
175 IntToIntToDouble::iterator IntToIntToDoubleIt;
176 IntToDouble* oneList;
177 IntToDouble::iterator IntToDoubleIt;
178 mTSortedList sortPlatForm;
179 mTForSortingItem* oneSortItem;
180 int outputTopLimit = 20;
181 int outputIndex;
182 QFile file( m_MTLog );
184 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
186 QMessageBox::information(NULL, "Error", "Can't Open the MT Log file!", "OK");
187 return;
190 Q3TextStream outf( &file );
192 // Log T
193 outf << "******T Tables After Model2Norm******" << endl <<endl;
196 sortPlatForm.setAutoDelete(TRUE);
198 for ( IntToIntToDoubleIt = (m_Model2Norm ->m_T).begin(); IntToIntToDoubleIt != (m_Model2Norm ->m_T).end(); IntToIntToDoubleIt++)
200 language1WordId = IntToIntToDoubleIt.key();
201 language1Word = (m_Volca ->m_language1WordIndex)[language1WordId] ;
202 outf << language1Word << " :"<<endl ;
204 oneList = IntToIntToDoubleIt.data();
206 for (IntToDoubleIt = oneList ->begin(); IntToDoubleIt != oneList ->end(); IntToDoubleIt++)
208 language2WordId = IntToDoubleIt.key();
209 language2Word = (m_Volca ->m_language2WordIndex)[language2WordId] ;
211 TValue = IntToDoubleIt.data();
213 oneSortItem = new mTForSortingItem(language2Word, TValue, 1);
215 sortPlatForm.append(oneSortItem);
218 sortPlatForm.sort();
220 outputIndex = 0;
221 for ( oneSortItem=sortPlatForm.first(); oneSortItem != 0; oneSortItem=sortPlatForm.next())
223 outputIndex++;
225 language2Word = oneSortItem ->m_name;
226 TValue = oneSortItem ->m_doubleValue;
228 outf <<" "<< outputIndex <<" : " <<language2Word << " " << TValue <<endl ;
230 if ( outputIndex >= outputTopLimit) break;
234 sortPlatForm.clear();
237 // Log A
238 outf << "******A Tables After Model2Norm******" << endl <<endl;
241 sortPlatForm.setAutoDelete(TRUE);
243 for ( IntToIntToDoubleIt = (m_Model2Norm ->m_A).begin(); IntToIntToDoubleIt != (m_Model2Norm ->m_A).end(); IntToIntToDoubleIt++)
245 language2ChunkId = IntToIntToDoubleIt.key();
246 language2ChunkStr = QString("%1").arg(language2ChunkId);
247 outf << language2ChunkStr << " :"<<endl ;
249 oneList = IntToIntToDoubleIt.data();
251 for (IntToDoubleIt = oneList ->begin(); IntToDoubleIt != oneList ->end(); IntToDoubleIt++)
253 language1ChunkId = IntToDoubleIt.key();
254 language1ChunkStr = QString("%1").arg(language1ChunkId);
256 AValue = IntToDoubleIt.data();
258 oneSortItem = new mTForSortingItem(language1ChunkStr, AValue, 1);
260 sortPlatForm.append(oneSortItem);
263 sortPlatForm.sort();
265 outputIndex = 0;
266 outputTopLimit = 100;
267 for ( oneSortItem=sortPlatForm.first(); oneSortItem != 0; oneSortItem=sortPlatForm.next())
269 outputIndex++;
271 language1ChunkStr = oneSortItem ->m_name;
272 AValue = oneSortItem ->m_doubleValue;
274 outf <<" " <<language1ChunkStr << " " << AValue <<endl ;
276 if ( outputIndex >= outputTopLimit) break;
280 sortPlatForm.clear();
284 // Log Viterbi Alignments after model 2
285 outf << "******Viterbi Alignments After Model2Norm******" << endl <<endl;
287 mTVolca* myVolca;
288 int i;
289 int l,m;
290 IntToInt* oneLan1Sentence;
291 IntToInt* oneLan2Sentence;
292 IntToInt* oneAlignment;
293 int language1SentenceLen;
294 int language2SentenceLen;
295 QString oneWordStr;
296 QString twoWordStr;
300 myVolca = m_Volca;
302 for ( i=0; i < myVolca ->m_countOfSentences; i++)
304 outf <<"Sentence " << i << " : " << endl;
306 oneLan1Sentence = myVolca ->m_language1Sentences[i];
307 oneLan2Sentence = myVolca ->m_language2Sentences[i];
309 language1SentenceLen = oneLan1Sentence ->size();
310 language2SentenceLen = oneLan2Sentence ->size();
312 // Output sentence in lan1
313 outf << "Sentence 1 -> ";
314 for ( l =0; l < language1SentenceLen; l++)
316 language1WordId = (*oneLan1Sentence)[l];
317 oneWordStr = (m_Volca ->m_language1WordIndex)[language1WordId] ;
318 outf << l << " : " << oneWordStr << " ";
321 outf << endl;
323 // Output sentence in lan2
324 outf << "Sentence 2 -> ";
325 for ( m =0; m < language2SentenceLen; m++)
327 language2WordId = (*oneLan2Sentence)[m];
328 oneWordStr = (m_Volca ->m_language2WordIndex)[language2WordId] ;
329 outf << m << " : " << oneWordStr << " ";
332 outf << endl;
334 // Output the alignment
335 outf << "Alignments are language2Word -> language1Word" << endl;
336 oneAlignment = (myVolca ->m_sentenceAlignments)[i];
338 for ( m=0; m< language2SentenceLen; m++)
340 language2WordId = (*oneLan2Sentence)[m];
341 twoWordStr = (m_Volca ->m_language2WordIndex)[language2WordId] ;
343 language1WordId = (*oneLan1Sentence)[(*oneAlignment)[m]];
344 oneWordStr = (m_Volca ->m_language1WordIndex)[language1WordId] ;
346 outf << " " << twoWordStr << " --> " << oneWordStr << endl;
352 file.close();