1 // Implementation of mTVolca methods
2 // Copyright © 2009 The University of Chicago
5 #include <Q3FileDialog>
7 #include <Q3TextStream>
12 //////////////////////////////////////////////////////////////////////
13 // Construction/Destruction
14 //////////////////////////////////////////////////////////////////////
// Constructor.
// Stores projectDirectory in m_projectDirectory and resets the remaining
// visible state: both per-language word totals and the sentence count are
// set to 0, and both corpus file names are set to the empty string.
// NOTE(review): no statement that uses `myMT` is visible in this listing —
// confirm against the full source whether the pointer is kept anywhere.
16 mTVolca::mTVolca(cMT
* myMT
, QString projectDirectory
)
19 m_projectDirectory
= projectDirectory
;
// Nothing has been loaded yet: zero the vocabulary sizes.
21 m_language1TotalWords
=0;
22 m_language2TotalWords
=0;
// Corpus file names are filled in later by initVolList().
24 m_language1FileName
= "";
25 m_language2FileName
= "";
27 m_countOfSentences
= 0;
// initVolList: (re)builds the vocabulary tables for both languages.
//
// Steps visible in this listing:
//   1. Clear both word maps and both word-index maps.
//   2. Ask for the two corpus files with Q3FileDialog::getOpenFileName,
//      starting in m_projectDirectory, and remember the chosen names in
//      m_language1FileName / m_language2FileName.
//   3. For each language, open the file read-only and wrap it in a
//      Q3TextStream using the locale encoding. A word that is non-empty and
//      is neither "." nor "," is inserted into the language's word map with
//      the value 0.
//   4. The language's total word count is set to the size of its word map.
//   5. A pass over the word map stores wordIndex as each word's value and
//      records the reverse entry (wordIndex -> word) in the word-index map.
//   6. A message box is shown with "Can't Open Language N Corpus !".
//
// NOTE(review): this listing is incomplete (braces, `else` and several
// statements/declarations are not shown). How oneWord is read from the
// stream, how wordIndex is initialised and advanced, and which branch each
// message box belongs to cannot be seen here — verify against the full file.
37 void mTVolca::initVolList()
// Start from empty tables so repeated calls do not accumulate old entries.
40 m_language1Words
.clear();
41 m_language2Words
.clear();
42 m_language1WordIndex
.clear();
43 m_language2WordIndex
.clear();
46 QString language1FileName
;
47 QString language2FileName
;
52 StringToInt::iterator StringToIntIt
;
// Let the user pick the two corpus files.
// NOTE(review): the returned names are used without a visible check for an
// empty string (what a cancelled dialog presumably returns — confirm).
55 language1FileName
= Q3FileDialog::getOpenFileName( m_projectDirectory
,
56 "Text File (*.txt);;All files (*.*);;Corpus File (*.corpus)",
59 "Choose Language1 corpus" );
61 language2FileName
= Q3FileDialog::getOpenFileName( m_projectDirectory
,
62 "Text File (*.txt);;All files (*.*);;Corpus File (*.corpus)",
65 "Choose Language2 corpus" );
// Keep the chosen names as members for later use.
68 m_language1FileName
= language1FileName
;
69 m_language2FileName
= language2FileName
;
// ---- Language 1: collect the vocabulary ----
// NOTE(review): the QFile is allocated with new and only close() is visible;
// no matching delete appears in this listing.
71 language1File
= new QFile( language1FileName
);
72 if ( language1File
->open( QIODevice::ReadOnly
) )
74 Q3TextStream
stream( language1File
);
75 stream
.setEncoding( Q3TextStream::Locale
);
77 while( !stream
.atEnd() )
// Filter: ignore empty tokens and bare "." / "," tokens.
82 if (oneWord
.length() >0 && oneWord
!= QString(".") && oneWord
!= QString(","))
// The value 0 is a placeholder; the indexing pass below overwrites it.
84 m_language1Words
.insert(oneWord
, 0);
// Total = number of entries in the language-1 word map.
89 m_language1TotalWords
= m_language1Words
.size();
// Indexing pass: word -> wordIndex and wordIndex -> word.
92 for ( StringToIntIt
= m_language1Words
.begin(); StringToIntIt
!= m_language1Words
.end(); StringToIntIt
++ )
94 oneWord
= StringToIntIt
.key();
95 m_language1Words
[oneWord
] = wordIndex
;
96 m_language1WordIndex
.insert(wordIndex
, oneWord
);
// NOTE(review): reads the value at the iterator's current position; the
// surrounding statements are not shown, so its purpose is unclear here
// (presumably feeds the commented-out diagnostic below — confirm).
101 wordIndex
= StringToIntIt
.data();
103 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("The total is %1 and last index is %2.").arg(m_language1TotalWords).arg(wordIndex), "OK" );
105 language1File
->close();
// Presumably the failure branch of the open() check above.
109 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Can't Open Language 1 Corpus !", "OK" );
// ---- Language 2: same procedure as for language 1 ----
// NOTE(review): as above, no delete for this QFile is visible.
113 language2File
= new QFile( language2FileName
);
114 if ( language2File
->open( QIODevice::ReadOnly
) )
116 Q3TextStream
stream( language2File
);
117 stream
.setEncoding( Q3TextStream::Locale
);
119 while( !stream
.atEnd() )
123 // a little filtering
124 if (oneWord
.length() >0 && oneWord
!= QString(".") && oneWord
!= QString(","))
126 m_language2Words
.insert(oneWord
, 0);
// Total = number of entries in the language-2 word map.
130 m_language2TotalWords
= m_language2Words
.size();
// Indexing pass: word -> wordIndex and wordIndex -> word.
133 for ( StringToIntIt
= m_language2Words
.begin(); StringToIntIt
!= m_language2Words
.end(); StringToIntIt
++ )
135 oneWord
= StringToIntIt
.key();
136 m_language2Words
[oneWord
] = wordIndex
;
137 m_language2WordIndex
.insert(wordIndex
, oneWord
);
142 wordIndex
= StringToIntIt
.data();
144 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("The total is %1 and last index is %2.").arg(m_language2TotalWords).arg(wordIndex), "OK" );
146 language2File
->close();
// Presumably the failure branch of the open() check above.
151 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Can't Open Language 2 Corpus !", "OK" );
// readSentences: converts both corpora into per-sentence lists of word ids.
//
// For each language the file named by m_languageNFileName is opened
// read-only and read line by line through a Q3TextStream (locale encoding).
// Each line is tokenised by a second Q3TextStream over the line text. A token
// that is non-empty and is neither "." nor "," is looked up in the language's
// word map, and the resulting id is stored in a newly allocated IntToInt
// under its position within the sentence. A sentence containing at least one
// such word is inserted into m_languageNSentences under the running sentence
// counter, which starts at 0.
//
// Afterwards the two sentence counts are compared; a mismatch is reported
// with a message box. m_countOfSentences is set from the language-1 count.
//
// NOTE(review): this listing is incomplete (braces, `else` and several
// declarations are not shown), so the exact nesting of the statements below
// cannot be confirmed from here.
159 void mTVolca::readSentences()
162 int countOfLanguage1Sentence
,countOfLanguage2Sentence
;
163 QFile
* language1File
;
164 QFile
* language2File
;
167 int indexInsideSentence
;
169 IntToInt
* oneSentence
;
// ---- Language 1 ----
173 countOfLanguage1Sentence
=0;
// NOTE(review): allocated with new; neither close() nor delete for this
// QFile is visible in this listing.
174 language1File
= new QFile( m_language1FileName
);
175 if ( language1File
->open( QIODevice::ReadOnly
) )
177 Q3TextStream
stream( language1File
);
178 stream
.setEncoding( Q3TextStream::Locale
);
180 while( !stream
.atEnd() )
// One line of the corpus file is treated as one sentence.
182 oneLine
= stream
.readLine();
185 Q3TextStream
lineStream( &oneLine
, QIODevice::ReadOnly
);
187 indexInsideSentence
= 0;
// NOTE(review): a fresh IntToInt is allocated for every line; when the line
// yields no word it is not stored below, and no delete is visible here.
188 oneSentence
= new IntToInt();
189 while ( ! lineStream
.atEnd())
191 lineStream
>> oneWord
;
// Filter: ignore empty tokens and bare "." / "," tokens.
193 if (oneWord
.length() >0 && oneWord
!= QString(".") && oneWord
!= QString(","))
// NOTE(review): if StringToInt is a QMap, operator[] inserts a default
// entry for a word that is not already in the map — confirm the typedef.
195 idOfTheWord
= m_language1Words
[oneWord
];
196 oneSentence
->insert(indexInsideSentence
, idOfTheWord
);
197 indexInsideSentence
++;
// Keep only sentences that contain at least one word.
201 if ( indexInsideSentence
>= 1)
203 m_language1Sentences
.insert(countOfLanguage1Sentence
,oneSentence
);
204 countOfLanguage1Sentence
++;
// Presumably the failure branch of the open() check above.
215 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Can't Open Language 1 Corpus !", "OK" );
// ---- Language 2: same procedure as for language 1 ----
220 countOfLanguage2Sentence
=0;
221 language2File
= new QFile( m_language2FileName
);
222 if ( language2File
->open( QIODevice::ReadOnly
) )
224 Q3TextStream
stream( language2File
);
225 stream
.setEncoding( Q3TextStream::Locale
);
227 while( !stream
.atEnd() )
229 oneLine
= stream
.readLine();
232 Q3TextStream
lineStream( &oneLine
, QIODevice::ReadOnly
);
234 indexInsideSentence
= 0;
235 oneSentence
= new IntToInt();
236 while ( ! lineStream
.atEnd())
238 lineStream
>> oneWord
;
// Filter: ignore empty tokens and bare "." / "," tokens.
240 if (oneWord
.length() >0 && oneWord
!= QString(".") && oneWord
!= QString(","))
242 idOfTheWord
= m_language2Words
[oneWord
];
243 oneSentence
->insert(indexInsideSentence
, idOfTheWord
);
244 indexInsideSentence
++;
// Keep only sentences that contain at least one word.
248 if ( indexInsideSentence
>= 1)
250 m_language2Sentences
.insert(countOfLanguage2Sentence
,oneSentence
);
251 countOfLanguage2Sentence
++;
// NOTE(review): the condition guarding this error box is not visible here.
255 QMessageBox::information ( NULL
, "Linguistica : MT Model1 Error", QString("At Sentence %1").arg(countOfLanguage2Sentence
), "OK" );
// Presumably the failure branch of the open() check above.
263 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Can't Open Language 2 Corpus !", "OK" );
// The corpora are expected to have the same number of sentences; a mismatch
// is only reported (see the original "To do" note below).
268 if ( countOfLanguage1Sentence
!= countOfLanguage2Sentence
)
270 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Language1 Sentence# != Language2 Sentence#!", "OK" );
271 // To do Clean sentences
275 m_countOfSentences
= countOfLanguage1Sentence
;
278 //QMessageBox::information ( NULL, "Linguistica : MT Model1", QString("Read In %1 sentences !").arg(m_countOfSentences), "OK" );
// Diagnostic dump: shows the word ids of the sentence stored under key 1 for
// each language as an "id-id-..." string in a message box.
// NOTE(review): the declarations of i, count and oneWordId are not visible,
// so it is unclear whether this section is live code — confirm. Sentence keys
// start at 0 above, so key 1 is the second stored sentence although the
// message says "Sentence 1". No reset of oneSentenceStr between the two dumps
// is visible, and no check that key 1 exists is visible either.
283 QString oneSentenceStr;
284 QString oneVolcaWord;
285 QString twoVolcaWord;
291 oneSentence = m_language1Sentences[1];
292 count = oneSentence ->size();
296 for ( i=0; i < count; i++)
298 oneWordId = (*oneSentence)[i];
299 oneSentenceStr += QString("%1-").arg(oneWordId);
303 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Lan1 Sentence 1 is: " + oneSentenceStr, "OK" );
305 oneSentence = m_language2Sentences[1];
306 count = oneSentence ->size();
310 for ( i=0; i < count; i++)
312 oneWordId = (*oneSentence)[i];
313 oneSentenceStr += QString("%1-").arg(oneWordId);
316 QMessageBox::information ( NULL, "Linguistica : MT Model1", "Lan2 Sentence 1 is: " + oneSentenceStr, "OK" );
// setFastSearchPairsForT: indexes which language-2 words co-occur with each
// language-1 word.
//
// For every sentence index i in [0, m_countOfSentences) the paired sentences
// m_language1Sentences[i] and m_language2Sentences[i] are fetched. For every
// language-1 word id in the sentence, the word's IntToDouble table is looked
// up in m_fastWordsPairs and in m_fastWordsSoftCounts; a new table is
// allocated and registered (presumably only when contains() is false — the
// `else` is not visible in this listing). Every language-2 word id of the
// paired sentence is then inserted into both tables with the value 0.0.
// A message box announces completion.
//
// NOTE(review): this listing is incomplete (braces, `else` and the
// declarations of i, l and m are not shown).
// NOTE(review): both sentence maps are indexed with every i below
// m_countOfSentences without a visible check that the entry exists.
321 void mTVolca::setFastSearchPairsForT()
// NOTE(review): fastSearchPairs is not used in any visible line.
323 QString fastSearchPairs
;
325 int idOfOneLanguage1Word
;
326 int idOfOneLanguage2Word
;
327 IntToInt
* oneLan1Sentence
;
328 IntToInt
* oneLan2Sentence
;
329 IntToDouble
* lan2WordsForOneLan1Word
;
330 IntToDouble
* softCountLan2WordsForOneLan1Word
;
// Walk the sentence pairs.
334 for ( i
=0; i
< m_countOfSentences
; i
++)
336 oneLan1Sentence
= m_language1Sentences
[i
];
337 oneLan2Sentence
= m_language2Sentences
[i
];
// Walk the language-1 words of this sentence.
339 for ( l
=0; l
< static_cast <int> (oneLan1Sentence
->size() ); l
++)
341 idOfOneLanguage1Word
= (*oneLan1Sentence
)[l
];
// Fetch this word's table from m_fastWordsPairs, or create and register it.
343 if ( m_fastWordsPairs
.contains(idOfOneLanguage1Word
))
345 lan2WordsForOneLan1Word
= m_fastWordsPairs
[idOfOneLanguage1Word
];
349 lan2WordsForOneLan1Word
= new IntToDouble();
350 m_fastWordsPairs
.insert(idOfOneLanguage1Word
, lan2WordsForOneLan1Word
);
// Same fetch-or-create step for the soft-count table.
353 if ( m_fastWordsSoftCounts
.contains(idOfOneLanguage1Word
))
355 softCountLan2WordsForOneLan1Word
= m_fastWordsSoftCounts
[idOfOneLanguage1Word
];
359 softCountLan2WordsForOneLan1Word
= new IntToDouble();
360 m_fastWordsSoftCounts
.insert(idOfOneLanguage1Word
, softCountLan2WordsForOneLan1Word
);
// Register every language-2 word of the paired sentence with value 0.0.
// NOTE(review): if IntToDouble is a QMap, insert() replaces an existing
// value, so an id seen in an earlier sentence is reset to 0.0 — confirm.
363 for ( m
=0; m
< static_cast <int> ( oneLan2Sentence
->size() ); m
++)
365 idOfOneLanguage2Word
= (*oneLan2Sentence
)[m
];
367 lan2WordsForOneLan1Word
->insert(idOfOneLanguage2Word
, 0.0);
368 softCountLan2WordsForOneLan1Word
->insert(idOfOneLanguage2Word
, 0.0);
374 QMessageBox::information ( NULL
, "Linguistica : MT Model1", "Finished Indexing the Word Pairs", "OK" );
// clearSentenceViterbiAlignment: empties m_sentenceAlignments.
// Iterates over every entry, fetching the stored IntToInt* through the
// iterator's data(), and then clears the container.
// NOTE(review): what is done with oneAlignment inside the loop is not visible
// in this listing (presumably it is deleted before the clear) — confirm
// against the full source.
379 void mTVolca::clearSentenceViterbiAlignment()
381 IntToIntToInt::iterator IntToIntToIntIt
;
382 IntToInt
* oneAlignment
;
384 for ( IntToIntToIntIt
= m_sentenceAlignments
.begin(); IntToIntToIntIt
!= m_sentenceAlignments
.end(); IntToIntToIntIt
++)
386 oneAlignment
= IntToIntToIntIt
.data();
// Drop all entries once the loop has visited them.
390 m_sentenceAlignments
.clear();