CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / mTVolca.cpp
blob6dc24dfac40b05e76e7cc21c6b7a1e8ffd8f7cb1
1 // Implementation of mTVolca methods
2 // Copyright © 2009 The University of Chicago
3 #include "mTVolca.h"
5 #include <Q3FileDialog>
6 #include <QMessageBox>
7 #include <Q3TextStream>
8 #include <QFile>
9 #include <QString>
10 #include "Typedefs.h"
12 //////////////////////////////////////////////////////////////////////
13 // Construction/Destruction
14 //////////////////////////////////////////////////////////////////////
16 mTVolca::mTVolca(cMT* myMT, QString projectDirectory)
18 m_myMT = myMT;
19 m_projectDirectory = projectDirectory;
21 m_language1TotalWords =0;
22 m_language2TotalWords =0;
24 m_language1FileName = "";
25 m_language2FileName = "";
27 m_countOfSentences = 0;
31 mTVolca::~mTVolca()
33 // To do clean
37 void mTVolca::initVolList()
40 m_language1Words.clear();
41 m_language2Words.clear();
42 m_language1WordIndex.clear();
43 m_language2WordIndex.clear();
46 QString language1FileName;
47 QString language2FileName;
48 QFile* language1File;
49 QFile* language2File;
50 QString oneWord;
51 int wordIndex;
52 StringToInt::iterator StringToIntIt;
55 language1FileName = Q3FileDialog::getOpenFileName( m_projectDirectory,
56 "Text File (*.txt);;All files (*.*);;Corpus File (*.corpus)",
57 NULL,
58 "open file dialog",
59 "Choose Language1 corpus" );
61 language2FileName = Q3FileDialog::getOpenFileName( m_projectDirectory,
62 "Text File (*.txt);;All files (*.*);;Corpus File (*.corpus)",
63 NULL,
64 "open file dialog",
65 "Choose Language2 corpus" );
68 m_language1FileName = language1FileName;
69 m_language2FileName = language2FileName;
71 language1File = new QFile( language1FileName );
72 if ( language1File->open( QIODevice::ReadOnly ) )
74 Q3TextStream stream( language1File );
75 stream.setEncoding( Q3TextStream::Locale );
77 while( !stream.atEnd() )
79 stream >> oneWord;
81 // a little filtering
82 if (oneWord.length() >0 && oneWord != QString(".") && oneWord != QString(","))
84 m_language1Words.insert(oneWord, 0);
89 m_language1TotalWords = m_language1Words.size();
91 wordIndex =0;
92 for ( StringToIntIt = m_language1Words.begin(); StringToIntIt != m_language1Words.end(); StringToIntIt++ )
94 oneWord = StringToIntIt.key();
95 m_language1Words[oneWord] = wordIndex;
96 m_language1WordIndex.insert(wordIndex, oneWord);
97 wordIndex++;
100 StringToIntIt--;
101 wordIndex = StringToIntIt.data();
103 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("The total is %1 and last index is %2.").arg(m_language1TotalWords).arg(wordIndex), "OK" );
105 language1File ->close();
107 else
109 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Can't Open Language 1 Corpus !", "OK" );
110 return;
113 language2File = new QFile( language2FileName );
114 if ( language2File->open( QIODevice::ReadOnly ) )
116 Q3TextStream stream( language2File );
117 stream.setEncoding( Q3TextStream::Locale );
119 while( !stream.atEnd() )
121 stream >> oneWord;
123 // a little filtering
124 if (oneWord.length() >0 && oneWord != QString(".") && oneWord != QString(","))
126 m_language2Words.insert(oneWord, 0);
130 m_language2TotalWords = m_language2Words.size();
132 wordIndex =0;
133 for ( StringToIntIt = m_language2Words.begin(); StringToIntIt != m_language2Words.end(); StringToIntIt++ )
135 oneWord = StringToIntIt.key();
136 m_language2Words[oneWord] = wordIndex;
137 m_language2WordIndex.insert(wordIndex, oneWord);
138 wordIndex++;
141 StringToIntIt--;
142 wordIndex = StringToIntIt.data();
144 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("The total is %1 and last index is %2.").arg(m_language2TotalWords).arg(wordIndex), "OK" );
146 language2File ->close();
149 else
151 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Can't Open Language 2 Corpus !", "OK" );
152 return;
159 void mTVolca::readSentences()
162 int countOfLanguage1Sentence,countOfLanguage2Sentence;
163 QFile* language1File;
164 QFile* language2File;
165 QString oneLine;
166 QString oneWord;
167 int indexInsideSentence;
168 int idOfTheWord;
169 IntToInt* oneSentence;
173 countOfLanguage1Sentence =0;
174 language1File = new QFile( m_language1FileName );
175 if ( language1File->open( QIODevice::ReadOnly ) )
177 Q3TextStream stream( language1File );
178 stream.setEncoding( Q3TextStream::Locale );
180 while( !stream.atEnd() )
182 oneLine = stream.readLine();
185 Q3TextStream lineStream( &oneLine, QIODevice::ReadOnly );
187 indexInsideSentence = 0;
188 oneSentence = new IntToInt();
189 while ( ! lineStream.atEnd())
191 lineStream >> oneWord;
193 if (oneWord.length() >0 && oneWord != QString(".") && oneWord != QString(","))
195 idOfTheWord = m_language1Words[oneWord];
196 oneSentence ->insert(indexInsideSentence, idOfTheWord);
197 indexInsideSentence++;
201 if ( indexInsideSentence >= 1)
203 m_language1Sentences.insert(countOfLanguage1Sentence,oneSentence);
204 countOfLanguage1Sentence++;
206 else
208 delete oneSentence;
213 else
215 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Can't Open Language 1 Corpus !", "OK" );
216 return;
220 countOfLanguage2Sentence =0;
221 language2File = new QFile( m_language2FileName );
222 if ( language2File->open( QIODevice::ReadOnly ) )
224 Q3TextStream stream( language2File );
225 stream.setEncoding( Q3TextStream::Locale );
227 while( !stream.atEnd() )
229 oneLine = stream.readLine();
232 Q3TextStream lineStream( &oneLine, QIODevice::ReadOnly );
234 indexInsideSentence = 0;
235 oneSentence = new IntToInt();
236 while ( ! lineStream.atEnd())
238 lineStream >> oneWord;
240 if (oneWord.length() >0 && oneWord != QString(".") && oneWord != QString(","))
242 idOfTheWord = m_language2Words[oneWord];
243 oneSentence ->insert(indexInsideSentence, idOfTheWord);
244 indexInsideSentence++;
248 if ( indexInsideSentence >= 1)
250 m_language2Sentences.insert(countOfLanguage2Sentence,oneSentence);
251 countOfLanguage2Sentence++;
253 else
255 QMessageBox::information ( NULL, "Linguistica : MT Model1 Error", QString("At Sentence %1").arg(countOfLanguage2Sentence), "OK" );
256 delete oneSentence;
261 else
263 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Can't Open Language 2 Corpus !", "OK" );
264 return;
268 if ( countOfLanguage1Sentence != countOfLanguage2Sentence )
270 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Language1 Sentence# != Language2 Sentence#!", "OK" );
271 // To do Clean sentences
272 return;
275 m_countOfSentences = countOfLanguage1Sentence;
278 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("Read In %1 sentences !").arg(m_countOfSentences), "OK" );
281 // Debug
283 QString oneSentenceStr;
284 QString oneVolcaWord;
285 QString twoVolcaWord;
286 int oneWordId;
287 int count;
288 int i;
291 oneSentence = m_language1Sentences[1];
292 count = oneSentence ->size();
295 oneSentenceStr = "";
296 for ( i=0; i < count; i++)
298 oneWordId = (*oneSentence)[i];
299 oneSentenceStr += QString("%1-").arg(oneWordId);
303 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Lan1 Sentence 1 is: " + oneSentenceStr, "OK" );
305 oneSentence = m_language2Sentences[1];
306 count = oneSentence ->size();
309 oneSentenceStr = "";
310 for ( i=0; i < count; i++)
312 oneWordId = (*oneSentence)[i];
313 oneSentenceStr += QString("%1-").arg(oneWordId);
316 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Lan2 Sentence 1 is: " + oneSentenceStr, "OK" );
321 void mTVolca::setFastSearchPairsForT()
323 QString fastSearchPairs;
324 int i, l, m;
325 int idOfOneLanguage1Word;
326 int idOfOneLanguage2Word;
327 IntToInt* oneLan1Sentence;
328 IntToInt* oneLan2Sentence;
329 IntToDouble* lan2WordsForOneLan1Word;
330 IntToDouble* softCountLan2WordsForOneLan1Word;
334 for ( i=0; i < m_countOfSentences; i++)
336 oneLan1Sentence = m_language1Sentences[i];
337 oneLan2Sentence = m_language2Sentences[i];
339 for ( l=0; l< static_cast <int> (oneLan1Sentence ->size() ); l++)
341 idOfOneLanguage1Word = (*oneLan1Sentence)[l];
343 if ( m_fastWordsPairs.contains(idOfOneLanguage1Word))
345 lan2WordsForOneLan1Word = m_fastWordsPairs[idOfOneLanguage1Word];
347 else
349 lan2WordsForOneLan1Word = new IntToDouble();
350 m_fastWordsPairs.insert(idOfOneLanguage1Word, lan2WordsForOneLan1Word);
353 if ( m_fastWordsSoftCounts.contains(idOfOneLanguage1Word))
355 softCountLan2WordsForOneLan1Word = m_fastWordsSoftCounts[idOfOneLanguage1Word];
357 else
359 softCountLan2WordsForOneLan1Word = new IntToDouble();
360 m_fastWordsSoftCounts.insert(idOfOneLanguage1Word, softCountLan2WordsForOneLan1Word);
363 for ( m=0; m < static_cast <int> ( oneLan2Sentence ->size() ); m++)
365 idOfOneLanguage2Word = (*oneLan2Sentence)[m];
367 lan2WordsForOneLan1Word ->insert(idOfOneLanguage2Word, 0.0);
368 softCountLan2WordsForOneLan1Word ->insert(idOfOneLanguage2Word, 0.0);
374 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Finished Indexing the Word Pairs", "OK" );
379 void mTVolca::clearSentenceViterbiAlignment()
381 IntToIntToInt::iterator IntToIntToIntIt;
382 IntToInt* oneAlignment;
384 for ( IntToIntToIntIt = m_sentenceAlignments.begin(); IntToIntToIntIt != m_sentenceAlignments.end(); IntToIntToIntIt++)
386 oneAlignment = IntToIntToIntIt.data();
387 delete oneAlignment;
390 m_sentenceAlignments.clear();