HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Sequencer.cpp
blobaf7bfcdde50c31dc8abb16167f39738d98598cfb
1 // Implementation of CSequencer methods
2 // Copyright © 2009 The University of Chicago
3 #include "Sequencer.h"
5 #include <Q3FileDialog>
6 #include <QInputDialog>
7 #include <QMessageBox>
8 #include <QLineEdit>
9 #include <Q3TextStream>
10 #include <QIODevice>
11 #include <QFile>
12 #include <QString>
13 #include <QMap>
14 #include "ui/Status.h"
15 #include "log2.h"
17 CSequencer::CSequencer()
18 : m_K(10),
19 m_resultK(10),
20 m_maxlineintrain(2000),
21 m_maxlineintest(10),
22 separator("?_?"),
23 m_bigrams(),
24 m_trigrams(),
25 m_bigramsbase(),
26 m_trigramsbase(),
27 m_bigramprob(),
28 m_trigramprob(),
29 m_totalbigrams(0),
30 m_totaltrigrams(0),
31 m_totalbigramsbase(0),
32 m_totaltrigramsbase(0) { }
34 void CSequencer::readCorpus(linguistica::ui::status_user_agent& status)
36 QString sequenceTrainFileName;
37 QString oneLine;
38 QString firstWord, secondWord, thirdWord;
39 QString leftWord, rightWord;
40 QString oneBigram, oneTrigram;
41 QString oneTrigrambase;
42 int numberOfLines;
43 int loc;
44 int indexOfWord;
45 int oneNumber;
46 QMap<QString, int>::Iterator StringToIntIt;
47 QMap<QString, double>::Iterator StringToDoubleIt;
49 sequenceTrainFileName = Q3FileDialog::getOpenFileName(
50 sequenceTrainFileName,
51 "TXT Files (*.txt)",
52 NULL,
53 "open file dialog",
54 "Choose a train file to open");
56 if (sequenceTrainFileName.isEmpty())
57 return;
59 // XXX. These should be bundled into a struct and made a local variable.
60 m_bigrams.clear();
61 m_bigramsbase.clear();
62 m_bigramprob.clear();
63 m_trigrams.clear();
64 m_trigramsbase.clear();
65 m_trigramprob.clear();
66 m_totalbigrams = 0;
67 m_totaltrigrams = 0;
68 m_totalbigramsbase = 0;
69 m_totaltrigramsbase = 0;
71 Q_ASSERT(!sequenceTrainFileName.isEmpty());
72 QFile trainFile(sequenceTrainFileName);
74 if (trainFile.open( QIODevice::ReadOnly)) {
75 Q3TextStream trainStream(&trainFile);
76 // trainStream.setEncoding(QTextStream::Unicode);
78 numberOfLines = 0;
79 while (!trainStream.atEnd()) {
80 oneLine = trainStream.readLine(); // This is one sentence.
82 if (oneLine.length() == 0)
83 continue;
85 oneLine = oneLine.lower();
86 oneLine = oneLine.stripWhiteSpace();
87 oneLine = oneLine.simplifyWhiteSpace();
89 firstWord = "#";
90 secondWord = "";
91 thirdWord = "";
92 indexOfWord = -1;
94 loc = oneLine.find(" ");
95 while (loc != -1) {
96 leftWord = oneLine.left(loc);
97 rightWord = oneLine.right(oneLine.length() - loc - 1);
98 indexOfWord++;
99 if (indexOfWord == 0) {
100 secondWord = leftWord;
101 oneBigram = firstWord + separator + secondWord;
102 if (m_bigrams.contains(oneBigram))
103 m_bigrams[oneBigram]++;
104 else
105 m_bigrams.insert(oneBigram, 1);
107 if (m_bigramsbase.contains(firstWord))
108 m_bigramsbase[firstWord]++;
109 else
110 m_bigramsbase.insert(firstWord, 1);
112 m_totalbigrams++;
113 m_totalbigramsbase++;
114 } else {
115 thirdWord = leftWord;
117 oneBigram = secondWord + separator + thirdWord;
118 oneTrigram = firstWord + separator + secondWord + separator + thirdWord;
119 oneTrigrambase = firstWord + separator + secondWord;
121 // add into bigram
122 if (m_bigrams.contains(oneBigram))
123 m_bigrams[oneBigram]++;
124 else
125 m_bigrams.insert(oneBigram, 1);
127 if (m_bigramsbase.contains(secondWord))
128 m_bigramsbase[secondWord]++;
129 else
130 m_bigramsbase.insert(secondWord, 1);
132 m_totalbigrams++;
133 m_totalbigramsbase++;
135 // add into trigram
136 if (m_trigrams.contains(oneTrigram))
137 m_trigrams[oneTrigram]++;
138 else
139 m_trigrams.insert(oneTrigram, 1);
141 if (m_trigramsbase.contains(oneTrigrambase))
142 m_trigramsbase[oneTrigrambase]++;
143 else
144 m_trigramsbase.insert(oneTrigrambase, 1);
145 m_totaltrigrams++;
146 m_totaltrigramsbase++;
148 // Move the first,second words
149 firstWord = secondWord;
150 secondWord = thirdWord;
153 oneLine = rightWord;
154 loc = oneLine.find(" ");
157 leftWord = oneLine;
158 indexOfWord++;
160 if (leftWord == QString(".")) {
161 // XXX. report errors to caller instead of aborting
162 // make sure we don't get a line like "."
163 Q_ASSERT(indexOfWord > 0);
165 thirdWord = leftWord;
167 oneBigram = secondWord + separator + thirdWord;
168 oneTrigram = firstWord + separator + secondWord + separator + thirdWord;
169 oneTrigrambase = firstWord + separator + secondWord;
171 // add into bigram
172 if (m_bigrams.contains(oneBigram))
173 m_bigrams[oneBigram]++;
174 else
175 m_bigrams.insert(oneBigram, 1);
177 if (m_bigramsbase.contains(secondWord))
178 m_bigramsbase[secondWord]++;
179 else
180 m_bigramsbase.insert(secondWord, 1);
182 m_totalbigrams++;
183 m_totalbigramsbase++;
185 // add into trigram
186 if (m_trigrams.contains(oneTrigram))
187 m_trigrams[oneTrigram]++;
188 else
189 m_trigrams.insert(oneTrigram, 1);
191 if (m_trigramsbase.contains(oneTrigrambase))
192 m_trigramsbase[oneTrigrambase]++;
193 else
194 m_trigramsbase.insert(oneTrigrambase, 1);
196 m_totaltrigrams++;
197 m_totaltrigramsbase++;
198 } else {
199 if (leftWord.right(1) == QString(".")) {
200 if (indexOfWord == 0) {
201 secondWord = leftWord.left(leftWord.length() -1);
203 oneBigram = firstWord + separator + secondWord;
204 if (m_bigrams.contains(oneBigram))
205 m_bigrams[oneBigram]++;
206 else
207 m_bigrams.insert(oneBigram, 1);
209 if (m_bigramsbase.contains(firstWord))
210 m_bigramsbase[firstWord]++;
211 else
212 m_bigramsbase.insert(firstWord, 1);
214 m_totalbigrams++;
215 m_totalbigramsbase++;
216 } else {
217 thirdWord = leftWord.left(leftWord.length() -1);
219 oneBigram = secondWord + separator + thirdWord;
220 oneTrigram = firstWord + separator + secondWord + separator + thirdWord;
221 oneTrigrambase = firstWord + separator + secondWord;
223 // add into bigram
224 if (m_bigrams.contains(oneBigram))
225 m_bigrams[oneBigram]++;
226 else
227 m_bigrams.insert(oneBigram, 1);
229 if (m_bigramsbase.contains(secondWord))
230 m_bigramsbase[secondWord]++;
231 else
232 m_bigramsbase.insert(secondWord, 1);
234 m_totalbigrams++;
235 m_totalbigramsbase++;
237 // add into trigram
238 if (m_trigrams.contains(oneTrigram))
239 m_trigrams[oneTrigram]++;
240 else
241 m_trigrams.insert(oneTrigram, 1);
243 if (m_trigramsbase.contains(oneTrigrambase))
244 m_trigramsbase[oneTrigrambase]++;
245 else
246 m_trigramsbase.insert(oneTrigrambase, 1);
248 m_totaltrigrams++;
249 m_totaltrigramsbase++;
251 // Move the first,second words
252 firstWord = secondWord;
253 secondWord = thirdWord;
255 } else {
256 if (indexOfWord == 0) {
257 secondWord = leftWord;
259 oneBigram = firstWord + separator + secondWord;
260 if (m_bigrams.contains(oneBigram))
261 m_bigrams[oneBigram]++;
262 else
263 m_bigrams.insert(oneBigram, 1);
265 if (m_bigramsbase.contains(firstWord))
266 m_bigramsbase[firstWord]++;
267 else
268 m_bigramsbase.insert(firstWord, 1);
270 m_totalbigrams++;
271 m_totalbigramsbase++;
272 } else {
273 thirdWord = leftWord;
275 oneBigram = secondWord + separator + thirdWord;
276 oneTrigram = firstWord + separator + secondWord + separator + thirdWord;
277 oneTrigrambase = firstWord + separator + secondWord;
279 // add into bigram
280 if (m_bigrams.contains(oneBigram))
281 m_bigrams[oneBigram]++;
282 else
283 m_bigrams.insert(oneBigram, 1);
285 if (m_bigramsbase.contains(secondWord))
286 m_bigramsbase[secondWord]++;
287 else
288 m_bigramsbase.insert(secondWord, 1);
290 m_totalbigrams++;
291 m_totalbigramsbase++;
293 // add into trigram
294 if (m_trigrams.contains(oneTrigram))
295 m_trigrams[oneTrigram]++;
296 else
297 m_trigrams.insert(oneTrigram, 1);
299 if (m_trigramsbase.contains(oneTrigrambase))
300 m_trigramsbase[oneTrigrambase]++;
301 else
302 m_trigramsbase.insert(oneTrigrambase, 1);
304 m_totaltrigrams++;
305 m_totaltrigramsbase++;
307 // Move the first,second words
308 firstWord = secondWord;
309 secondWord = thirdWord;
313 // add the last "."
314 Q_ASSERT(indexOfWord > 0);
316 thirdWord = QString(".");
318 oneBigram = secondWord + separator + thirdWord;
319 oneTrigram = firstWord + separator + secondWord + separator + thirdWord;
320 oneTrigrambase = firstWord + separator + secondWord;
322 // add into bigram
323 if (m_bigrams.contains(oneBigram))
324 m_bigrams[oneBigram]++;
325 else
326 m_bigrams.insert(oneBigram, 1);
328 if (m_bigramsbase.contains(secondWord))
329 m_bigramsbase[secondWord]++;
330 else
331 m_bigramsbase.insert(secondWord, 1);
333 m_totalbigrams++;
334 m_totalbigramsbase++;
336 // add into trigram
337 if (m_trigrams.contains(oneTrigram))
338 m_trigrams[oneTrigram]++;
339 else
340 m_trigrams.insert(oneTrigram, 1);
342 if (m_trigramsbase.contains(oneTrigrambase))
343 m_trigramsbase[oneTrigrambase]++;
344 else
345 m_trigramsbase.insert(oneTrigrambase, 1);
347 m_totaltrigrams++;
348 m_totaltrigramsbase++;
351 //oneLine = trainStream.readLine(); // This is the "return"
353 numberOfLines++;
354 status.major_operation =
355 QString("read line %1...").arg(numberOfLines);
357 if (numberOfLines > m_maxlineintrain)
358 break;
361 trainFile.close();
363 // Compute the bigram prob and trigram prob
364 double oneBigramProb;
365 double oneTrigramProb;
366 double oneBaseProb;
367 QString oneBase;
368 int numberOfProcessed;
370 numberOfProcessed = 0;
371 for (StringToIntIt = m_bigrams.begin(); StringToIntIt != m_bigrams.end(); StringToIntIt++) {
372 oneBigram = StringToIntIt.key();
373 oneNumber = StringToIntIt.data();
374 oneBigramProb = (double) oneNumber / m_totalbigrams;
375 loc = oneBigram.find(separator);
376 oneBase = oneBigram.left(loc);
377 oneBaseProb = (double) m_bigramsbase[oneBase] / m_totalbigramsbase;
378 oneBigramProb = -base2log(oneBigramProb / oneBaseProb);
379 m_bigramprob.insert(oneBigram, oneBigramProb);
380 numberOfProcessed++;
381 status.major_operation =
382 QString("processing bigram %1...")
383 .arg(numberOfProcessed);
386 numberOfProcessed = 0;
387 for (StringToIntIt = m_trigrams.begin(); StringToIntIt != m_trigrams.end(); StringToIntIt++) {
388 oneTrigram = StringToIntIt.key();
389 oneNumber = StringToIntIt.data();
390 oneTrigramProb = (double) oneNumber / m_totaltrigrams;
391 loc = oneTrigram.findRev(separator);
392 oneBase = oneTrigram.left(loc);
393 oneBaseProb = (double) m_trigramsbase[oneBase] / m_totaltrigramsbase;
394 oneTrigramProb = -base2log(oneTrigramProb / oneBaseProb);
395 m_trigramprob.insert(oneTrigram, oneTrigramProb);
396 numberOfProcessed++;
397 status.major_operation =
398 QString("processing trigram %1...")
399 .arg(numberOfProcessed);
401 status.major_operation.clear();
403 // Debug output bigrams and trigrams
404 QString bigramFileName = "bigrams.txt";
405 QString trigramFileName = "trigrams.txt";
406 QString oneKey;
407 double oneValue;
408 QFile bigramFile(bigramFileName);
409 QFile trigramFile(trigramFileName);
411 if (bigramFile.open(QIODevice::WriteOnly)) {
412 Q3TextStream bigramStream(&bigramFile);
413 for (StringToDoubleIt = m_bigramprob.begin(); StringToDoubleIt != m_bigramprob.end(); StringToDoubleIt++) {
414 oneKey = StringToDoubleIt.key();
415 oneValue =StringToDoubleIt.data();
416 oneKey = oneKey.replace(separator, " ");
417 bigramStream << oneKey << " " << oneValue << endl;
419 bigramFile.close();
420 } else {
421 // XXX. handle error.
424 if (trigramFile.open(QIODevice::WriteOnly)) {
425 Q3TextStream trigramStream(&trigramFile);
426 for (StringToDoubleIt = m_trigramprob.begin(); StringToDoubleIt != m_trigramprob.end(); StringToDoubleIt++) {
427 oneKey = StringToDoubleIt.key();
428 oneValue =StringToDoubleIt.data();
429 oneKey = oneKey.replace(separator, " ");
430 trigramStream << oneKey << " " << oneValue << endl;
432 trigramFile.close();
433 } else {
434 // XXX. handle error.
439 void CSequencer::sequencerTestAFile(linguistica::ui::status_user_agent& status)
441 if (m_totalbigrams == 0) {
442 QMessageBox::information(NULL, "Warning", "Please Read Training Corpus Firstly!", "OK");
443 return;
446 QString sequenceTestFileName = Q3FileDialog::getOpenFileName(
447 sequenceTestFileName,
448 "TXT Files (*.txt)",
449 NULL,
450 "open file dialog",
451 "Choose a train file to open");
453 if (sequenceTestFileName.isEmpty())
454 return;
456 QFile testFile(sequenceTestFileName);
457 int numberOfLines;
458 QString oneLine;
459 int oneBiScore, oneTriScore;
460 int totalHitInBigramList = 0;
461 int totalHitInTrigramList = 0;
462 int totalSentences = 0;
463 int totalHitInBigramListSumRanks = 0;
464 int totalHitInTrigramListSumRanks = 0;
465 double averageBiRanking, averageTriRanking;
467 if (testFile.open(QIODevice::ReadOnly)) {
468 Q3TextStream testStream(&testFile);
469 // testStream.setEncoding( QTextStream::Unicode );
471 numberOfLines = 0;
472 while (!testStream.atEnd()) {
473 oneLine = testStream.readLine(); // This is one sentence.
474 if (oneLine.length() == 0)
475 continue;
476 oneLine = oneLine.lower();
477 oneLine = oneLine.stripWhiteSpace();
478 oneLine = oneLine.simplifyWhiteSpace();
479 if (oneLine.length() == 0)
480 continue;
481 sequenceASentence(oneBiScore, oneTriScore, oneLine);
482 totalSentences++;
483 if (oneBiScore != 0) {
484 totalHitInBigramListSumRanks += oneBiScore;
485 totalHitInBigramList++;
487 if (oneTriScore != 0) {
488 totalHitInTrigramListSumRanks += oneTriScore;
489 totalHitInTrigramList++;
491 numberOfLines++;
492 status.major_operation =
493 QString("testing sentence %1...")
494 .arg(numberOfLines);
495 if (numberOfLines > m_maxlineintest)
496 break;
498 status.major_operation.clear();
500 // Result Statics
501 averageBiRanking = (double)totalHitInBigramListSumRanks / totalHitInBigramList;
502 averageTriRanking = (double)totalHitInTrigramListSumRanks / totalHitInTrigramList;
504 QMessageBox::information(NULL,
505 "Debug",
506 QString("Total %1, Bi-Hit %2, Tri-Hit %3, RankingInBiHit %4, RankingInTriHit %5")
507 .arg(totalSentences)
508 .arg(totalHitInBigramList)
509 .arg(totalHitInTrigramList)
510 .arg(averageBiRanking)
511 .arg(averageTriRanking), "OK");
515 void CSequencer::sequenceASentence(int& biRank, int& triRank, QString inputSentence)
517 QString aSentence;
518 bool ok;
519 int loc;
520 int numberOfWords;
521 QMap<QString, int> bagOfWords;
522 QMap<QString, int> currentBagOfWords;
523 QMap<QString, int>::Iterator StringToIntIt1, StringToIntIt2;
524 QString rightSentence;
525 QString oneWord, twoWord;
526 QString oneBigram, bestBigram;
527 bool reasonablePair;
528 int count;
529 double bigramcount, bestbigramcount;
530 Q3SortedList<sentenceItem> finalRankedSentences;
531 Q3SortedList<sentenceItem> biResultRankedSentences;
532 Q3SortedList<sentenceItem> triResultRankedSentences;
533 sentenceItem* oneSentenceItem;
534 QString resultSentence;
535 int resultRanki;
536 bool getInputFromUI;
540 finalRankedSentences.setAutoDelete(FALSE);
541 biResultRankedSentences.setAutoDelete(TRUE);
542 triResultRankedSentences.setAutoDelete(TRUE);
544 if ( m_totalbigrams == 0)
546 QMessageBox::information ( NULL, "Warning","Please Read Training Corpus Firstly!", "OK" );
547 return;
550 bagOfWords.clear();
553 if ( inputSentence.length() ==0)
555 aSentence = QInputDialog::getText(
556 "Sequencer_ASentence", "Enter a Sentence", QLineEdit::Normal,
557 QString::null, &ok, NULL );
559 getInputFromUI = true;
561 else
563 aSentence = inputSentence;
564 ok = true;
565 getInputFromUI = false;
568 if ( ok && !aSentence.isEmpty() )
570 // Parse this sentence into words
571 QString leftWord, rightWord;
575 aSentence = aSentence.lower();
576 aSentence = aSentence.stripWhiteSpace();
577 aSentence = aSentence.simplifyWhiteSpace();
579 rightSentence = QString("#");
581 numberOfWords = 0;
582 loc = aSentence.find(" ");
583 while(loc != -1)
585 leftWord = aSentence.left(loc);
586 rightWord = aSentence.right(aSentence.length() - loc - 1);
588 if (( leftWord != QString("#")) && (leftWord != QString(".")))
591 if ( bagOfWords.contains(leftWord))
593 bagOfWords[leftWord]++;
595 else
597 bagOfWords.insert(leftWord, 1);
600 rightSentence = rightSentence + " " + leftWord;
601 numberOfWords++;
604 aSentence = rightWord;
605 loc = aSentence.find(" ");
608 leftWord = aSentence;
610 if ( leftWord != QString("."))
612 if (leftWord.right(1) == QString("."))
614 leftWord = leftWord.left(leftWord.length() -1);
617 if (( leftWord != QString("#")) && (leftWord != QString(".")))
620 if ( bagOfWords.contains(leftWord))
622 bagOfWords[leftWord]++;
624 else
626 bagOfWords.insert(leftWord, 1);
629 rightSentence = rightSentence + " " + leftWord;
630 numberOfWords++;
636 rightSentence = rightSentence + " ." ;
638 if ( numberOfWords == 1)
640 resultSentence = QString("# ") + (bagOfWords.begin()).key() + QString(" .") ;
641 //QMessageBox::information ( NULL, "All-Debug",resultSentence, "OK" );
642 biRank = 1;
643 triRank = 1;
644 return;
648 //QMessageBox::information ( NULL, "Debug",QString("bag of word has %1 words.").arg(numberOfWords), "OK" );
650 bestbigramcount =0.0;
651 for ( StringToIntIt1 = bagOfWords.begin(); StringToIntIt1 != bagOfWords.end(); StringToIntIt1++)
653 oneWord = StringToIntIt1.key();
654 count = StringToIntIt1.data();
656 if ( count > 1)
658 reasonablePair = true;
660 else
662 reasonablePair = false;
665 for ( StringToIntIt2 = bagOfWords.begin(); StringToIntIt2 != bagOfWords.end(); StringToIntIt2++)
667 twoWord = StringToIntIt2.key();
669 if (( !reasonablePair) && (oneWord != twoWord))
671 reasonablePair = true;
674 if ( !reasonablePair)
676 continue;
679 oneBigram = oneWord + separator + twoWord;
681 if ( m_bigramprob.contains(oneBigram))
684 bigramcount = m_bigramprob[oneBigram];
686 currentBagOfWords = bagOfWords;
688 if (currentBagOfWords[oneWord] <= 1)
690 currentBagOfWords.remove(oneWord);
692 else
694 currentBagOfWords[oneWord]--;
697 if (currentBagOfWords[twoWord] <= 1)
699 currentBagOfWords.remove(twoWord);
701 else
703 currentBagOfWords[twoWord]--;
706 currentBagOfWords.insert(QString("#"), 1);
707 currentBagOfWords.insert(QString("."), 1);
710 // Test bigram sequencer
712 sequenizeFromABigram(oneBigram, bigramcount, currentBagOfWords, numberOfWords, m_K, finalRankedSentences, 1);
714 for ( oneSentenceItem=finalRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=finalRankedSentences.next())
716 biResultRankedSentences.append(oneSentenceItem);
720 finalRankedSentences.setAutoDelete(FALSE);
721 finalRankedSentences.clear();
724 // Test trigram sequencer
726 sequenizeFromABigram(oneBigram, bigramcount, currentBagOfWords, numberOfWords, m_K, finalRankedSentences, 2);
728 for ( oneSentenceItem=finalRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=finalRankedSentences.next())
730 triResultRankedSentences.append(oneSentenceItem);
733 finalRankedSentences.setAutoDelete(FALSE);
734 finalRankedSentences.clear();
746 if ( biResultRankedSentences.count() != 0)
749 QFile file( "SequencerLog.txt" );
751 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
753 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
754 return;
757 Q3TextStream outf( &file );
759 outf << "******Bigram Results******" << endl <<endl;
763 QString biResultKey;
766 biResultRankedSentences.sort();
768 resultRanki = 0;
769 biRank = resultRanki;
770 for ( oneSentenceItem=biResultRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=biResultRankedSentences.next())
772 resultRanki++;
774 biResultKey = oneSentenceItem ->m_key;
775 biResultKey.replace(separator, " ");
777 if ( biResultKey == rightSentence)
779 biRank = resultRanki;
783 if ( resultRanki > m_resultK)
785 break;
789 // log history of this result sentence
790 outf << "Result Sentence Rank " << resultRanki << " : " << biResultKey << endl;
791 for ( int stepi = 1; stepi <= oneSentenceItem ->m_stepnumber; stepi++)
793 QString oneHistoryString;
794 double oneHistoryScore;
796 oneHistoryString = oneSentenceItem ->m_historystrings[stepi];
797 oneHistoryString = oneHistoryString.replace(separator, " ");
798 oneHistoryScore = oneSentenceItem ->m_historyscores[stepi];
800 outf << " " << stepi << " : " << oneHistoryString << " : " << oneHistoryScore << endl;
804 outf << endl;
809 if ( getInputFromUI )
813 oneSentenceItem=biResultRankedSentences.first();
815 biResultKey = oneSentenceItem ->m_key;
816 biResultKey.replace(separator, " ");
818 QMessageBox::information ( NULL, "Top Bigram Sequencer Output",biResultKey, "OK" );
824 biResultRankedSentences.clear();
826 file.close();
830 if ( triResultRankedSentences.count() != 0)
833 QFile file( "SequencerLog.txt" );
835 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
837 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
838 return;
841 Q3TextStream outf( &file );
843 outf << "******Trigram Results******" << endl <<endl;
846 QString triResultKey;
848 triResultRankedSentences.sort();
850 resultRanki = 0;
851 triRank = resultRanki;
852 for ( oneSentenceItem=triResultRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=triResultRankedSentences.next())
854 resultRanki++;
856 triResultKey = oneSentenceItem ->m_key;
857 triResultKey.replace(separator, " ");
859 if ( triResultKey == rightSentence)
861 triRank = resultRanki;
865 if ( resultRanki > m_resultK)
867 break;
870 // log history of this result sentence
871 outf << "Result Sentence Rank " << resultRanki << " : " << triResultKey << endl;
872 for ( int stepi = 1; stepi <= oneSentenceItem ->m_stepnumber; stepi++)
874 QString oneHistoryString;
875 double oneHistoryScore;
877 oneHistoryString = oneSentenceItem ->m_historystrings[stepi];
878 oneHistoryString = oneHistoryString.replace(separator, " ");
879 oneHistoryScore = oneSentenceItem ->m_historyscores[stepi];
881 outf << " " << stepi << " : " << oneHistoryString << " : " << oneHistoryScore << endl;
885 outf << endl;
891 if ( getInputFromUI )
893 QString triResultKey;
895 oneSentenceItem=triResultRankedSentences.first();
897 triResultKey = oneSentenceItem ->m_key;
898 triResultKey.replace(separator, " ");
900 QMessageBox::information ( NULL, "Top Trigram Sequencer Output",triResultKey, "OK" );
905 triResultRankedSentences.clear();
907 file.close();
913 else
915 return;
921 void CSequencer::sequenceASentence2(int& biRank, int& triRank, QString inputSentence)
923 QString aSentence;
924 bool ok;
925 int loc;
926 int numberOfWords;
927 QMap<QString, double> allBigrams;
928 QMap<QString, int> bagOfWords;
929 QMap<QString, int> currentBagOfWords;
930 QMap<QString, int>::Iterator StringToIntIt1, StringToIntIt2 ;
931 QString rightSentence;
932 QString oneWord, twoWord;
933 QString oneBigram, bestBigram;
934 bool reasonablePair;
935 int count;
936 double bigramcount, bestbigramcount;
937 Q3SortedList<sentenceItem> finalRankedSentences;
938 Q3SortedList<sentenceItem> biResultRankedSentences;
939 Q3SortedList<sentenceItem> triResultRankedSentences;
940 sentenceItem* oneSentenceItem, *twoSentenceItem;
941 QString resultSentence;
942 int resultRanki;
943 bool getInputFromUI;
944 QMap<int, QString> currentHistoryString;
945 QMap<int, double> currentHistoryScore;
950 finalRankedSentences.setAutoDelete(FALSE);
951 biResultRankedSentences.setAutoDelete(TRUE);
952 triResultRankedSentences.setAutoDelete(TRUE);
954 if ( m_totalbigrams == 0)
956 QMessageBox::information ( NULL, "Warning","Please Read Training Corpus Firstly!", "OK" );
957 return;
960 bagOfWords.clear();
963 if ( inputSentence.length() ==0)
965 aSentence = QInputDialog::getText(
966 "Sequencer_ASentence", "Enter a Sentence", QLineEdit::Normal,
967 QString::null, &ok, NULL );
969 getInputFromUI = true;
971 else
973 aSentence = inputSentence;
974 ok = true;
975 getInputFromUI = false;
978 if ( ok && !aSentence.isEmpty() )
980 // Parse this sentence into words
981 QString leftWord, rightWord;
985 aSentence = aSentence.lower();
986 aSentence = aSentence.stripWhiteSpace();
987 aSentence = aSentence.simplifyWhiteSpace();
989 rightSentence = QString("#");
991 numberOfWords = 0;
992 loc = aSentence.find(" ");
993 while(loc != -1)
995 leftWord = aSentence.left(loc);
996 rightWord = aSentence.right(aSentence.length() - loc - 1);
998 if (( leftWord != QString("#")) && (leftWord != QString(".")))
1001 if ( bagOfWords.contains(leftWord))
1003 bagOfWords[leftWord]++;
1005 else
1007 bagOfWords.insert(leftWord, 1);
1010 rightSentence = rightSentence + " " + leftWord;
1011 numberOfWords++;
1014 aSentence = rightWord;
1015 loc = aSentence.find(" ");
1018 leftWord = aSentence;
1020 if ( leftWord != QString("."))
1022 if (leftWord.right(1) == QString("."))
1024 leftWord = leftWord.left(leftWord.length() -1);
1027 if (( leftWord != QString("#")) && (leftWord != QString(".")))
1030 if ( bagOfWords.contains(leftWord))
1032 bagOfWords[leftWord]++;
1034 else
1036 bagOfWords.insert(leftWord, 1);
1039 rightSentence = rightSentence + " " + leftWord;
1040 numberOfWords++;
1046 rightSentence = rightSentence + " ." ;
1048 if ( numberOfWords == 1)
1050 resultSentence = QString("# ") + (bagOfWords.begin()).key() + QString(" .") ;
1051 //QMessageBox::information ( NULL, "All-Debug",resultSentence, "OK" );
1052 biRank = 1;
1053 triRank = 1;
1054 return;
1058 //QMessageBox::information ( NULL, "Debug",QString("bag of word has %1 words.").arg(numberOfWords), "OK" );
1060 bestbigramcount =0.0;
1061 for ( StringToIntIt1 = bagOfWords.begin(); StringToIntIt1 != bagOfWords.end(); StringToIntIt1++)
1063 oneWord = StringToIntIt1.key();
1064 count = StringToIntIt1.data();
1066 if ( count > 1)
1068 reasonablePair = true;
1070 else
1072 reasonablePair = false;
1075 for ( StringToIntIt2 = bagOfWords.begin(); StringToIntIt2 != bagOfWords.end(); StringToIntIt2++)
1077 twoWord = StringToIntIt2.key();
1079 if (( !reasonablePair) && (oneWord != twoWord))
1081 reasonablePair = true;
1084 if ( !reasonablePair)
1086 continue;
1089 oneBigram = oneWord + separator + twoWord;
1091 if ( m_bigramprob.contains(oneBigram))
1094 // bigram score
1095 bigramcount = m_bigramprob[oneBigram];
1097 // bigram bag of words
1098 currentBagOfWords.clear();
1099 currentBagOfWords = bagOfWords;
1101 if (currentBagOfWords[oneWord] <= 1)
1103 currentBagOfWords.remove(oneWord);
1105 else
1107 currentBagOfWords[oneWord]--;
1110 if (currentBagOfWords[twoWord] <= 1)
1112 currentBagOfWords.remove(twoWord);
1114 else
1116 currentBagOfWords[twoWord]--;
1119 currentBagOfWords.insert(QString("#"), 1);
1120 currentBagOfWords.insert(QString("."), 1);
1122 currentHistoryString.clear();
1123 currentHistoryScore.clear();
1125 oneSentenceItem = new sentenceItem(bigramcount, oneBigram, currentBagOfWords, 1, currentHistoryString, currentHistoryScore );
1126 oneSentenceItem ->m_numberofwordsinsentence = 2;
1127 oneSentenceItem ->m_value = oneSentenceItem ->m_value / (double)oneSentenceItem ->m_numberofwordsinsentence;
1129 twoSentenceItem = new sentenceItem(0, oneBigram, currentBagOfWords, 1, currentHistoryString, currentHistoryScore );
1130 twoSentenceItem ->m_numberofwordsinsentence = 2;
1131 twoSentenceItem ->m_value = twoSentenceItem ->m_value / (double)twoSentenceItem ->m_numberofwordsinsentence;
1133 biResultRankedSentences.append(oneSentenceItem);
1134 triResultRankedSentences.append(twoSentenceItem);
1146 // Test Bigram Sequencerizer2
1147 sequenize2(bagOfWords, numberOfWords, m_K, biResultRankedSentences, 1);
1149 // Test Trigram Sequencerizer2
1150 sequenize2(bagOfWords, numberOfWords, m_K, triResultRankedSentences, 2);
1153 if ( biResultRankedSentences.count() != 0)
1156 QFile file( "SequencerLog2.txt" );
1158 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
1160 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1161 return;
1164 Q3TextStream outf( &file );
1166 outf << "******Bigram Results******" << endl <<endl;
1170 QString biResultKey;
1173 biResultRankedSentences.sort();
1175 resultRanki = 0;
1176 biRank = resultRanki;
1177 for ( oneSentenceItem=biResultRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=biResultRankedSentences.next())
1179 resultRanki++;
1181 biResultKey = oneSentenceItem ->m_key;
1182 biResultKey.replace(separator, " ");
1184 if ( biResultKey == rightSentence)
1186 biRank = resultRanki;
1190 if ( resultRanki > m_resultK)
1192 break;
1196 // log history of this result sentence
1197 outf << "Result Sentence Rank " << resultRanki << " : " << biResultKey << endl;
1198 for ( int stepi = 1; stepi <= oneSentenceItem ->m_stepnumber; stepi++)
1200 QString oneHistoryString;
1201 double oneHistoryScore;
1203 oneHistoryString = oneSentenceItem ->m_historystrings[stepi];
1204 oneHistoryString = oneHistoryString.replace(separator, " ");
1205 oneHistoryScore = oneSentenceItem ->m_historyscores[stepi];
1207 outf << " " << stepi << " : " << oneHistoryString << " : " << oneHistoryScore << endl;
1211 outf << endl;
1216 if ( getInputFromUI )
1220 oneSentenceItem=biResultRankedSentences.first();
1222 biResultKey = oneSentenceItem ->m_key;
1223 biResultKey.replace(separator, " ");
1225 QMessageBox::information ( NULL, "Top Bigram Sequencer Output",biResultKey, "OK" );
1229 biResultRankedSentences.setAutoDelete(TRUE);
1230 biResultRankedSentences.clear();
1232 file.close();
1236 if ( triResultRankedSentences.count() != 0)
1239 QFile file( "SequencerLog2.txt" );
1241 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
1243 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1244 return;
1247 Q3TextStream outf( &file );
1249 outf << "******Trigram Results******" << endl <<endl;
1252 QString triResultKey;
1254 triResultRankedSentences.sort();
1256 resultRanki = 0;
1257 triRank = resultRanki;
1258 for ( oneSentenceItem=triResultRankedSentences.first(); oneSentenceItem != 0; oneSentenceItem=triResultRankedSentences.next())
1260 resultRanki++;
1262 triResultKey = oneSentenceItem ->m_key;
1263 triResultKey.replace(separator, " ");
1265 if ( triResultKey == rightSentence)
1267 triRank = resultRanki;
1271 if ( resultRanki > m_resultK)
1273 break;
1276 // log history of this result sentence
1277 outf << "Result Sentence Rank " << resultRanki << " : " << triResultKey << endl;
1278 for ( int stepi = 1; stepi <= oneSentenceItem ->m_stepnumber; stepi++)
1280 QString oneHistoryString;
1281 double oneHistoryScore;
1283 oneHistoryString = oneSentenceItem ->m_historystrings[stepi];
1284 oneHistoryString = oneHistoryString.replace(separator, " ");
1285 oneHistoryScore = oneSentenceItem ->m_historyscores[stepi];
1287 outf << " " << stepi << " : " << oneHistoryString << " : " << oneHistoryScore << endl;
1291 outf << endl;
1297 if ( getInputFromUI )
1299 QString triResultKey;
1301 oneSentenceItem=triResultRankedSentences.first();
1303 triResultKey = oneSentenceItem ->m_key;
1304 triResultKey.replace(separator, " ");
1306 QMessageBox::information ( NULL, "Top Trigram Sequencer Output",triResultKey, "OK" );
1310 triResultRankedSentences.setAutoDelete(TRUE);
1311 triResultRankedSentences.clear();
1313 file.close();
1319 else
1321 return;
1327 void CSequencer::sequenizeFromABigram(
1328 QString oneBigram,
1329 double bigramValue,
1330 QMap<QString, int>& bagOfWords,
1331 int lenOfSentence,
1332 int K,
1333 Q3SortedList<sentenceItem>& resultKSentences,
1334 int computeType)
1336 // computeType: 1 --> bigram; 2 --> trigram
1338 QMap<QString, int>::Iterator StringToIntIt;
1339 double currentValue;
1340 QString currentString;
1341 QString currentleftBigrambase, currentrightBigrambase;
1342 QString currentleftTrigrambase, currentrightTrigrambase;
1343 QMap<QString, int>* currentBagOfWords;
1344 QString tempString;
1345 int leftFirstLoc, rightFirstLoc;
1346 int leftSecondLoc, rightSecondLoc;
1347 int lenOfSeparator = separator.length();
1348 QMap<QString, int> oneTryBagOfWords;
1349 Q3SortedList<sentenceItem> tempResultKSentence;
1350 Q3SortedList<sentenceItem> swapResultKSentence;
1351 sentenceItem* oneSentenceItem;
1352 sentenceItem* oneCurrentItem;
1353 int i,j;
1354 bool canExpandLeft, canExpandRight;
1355 bool lastWord;
1356 QMap<int, QString> currentHistoryString;
1357 QMap<int, double> currentHistoryScore;
1358 int currentStepNumber;
1361 resultKSentences.setAutoDelete( TRUE );
1362 tempResultKSentence.setAutoDelete( FALSE );
1363 resultKSentences.clear();
1366 // First, put this bigram in resultKSentences;
1367 currentHistoryString.clear();
1368 currentHistoryScore.clear();
1369 oneSentenceItem = new sentenceItem(bigramValue, oneBigram, bagOfWords, 1, currentHistoryString, currentHistoryScore );
1370 resultKSentences.append(oneSentenceItem);
1373 // Loop for lenOfSentence( abc doesn't count the beginning "#" and ending ".")
1374 // Each loop refers to expand one word either from left or right
1375 lastWord = false;
1376 for ( i=0; i <lenOfSentence; i++)
1378 // for each expansion. Total : N loops
1380 if ( i == lenOfSentence -1)
1382 lastWord = true;
1384 else
1386 lastWord = false;
1390 tempResultKSentence.clear();
1391 for ( oneCurrentItem=resultKSentences.first(); oneCurrentItem != 0; oneCurrentItem=resultKSentences.next())
1393 // for each current string. Total: K loops
1395 currentString = oneCurrentItem ->m_key;
1396 currentValue = oneCurrentItem ->m_value;
1397 currentBagOfWords = &(oneCurrentItem ->m_bagofwords);
1398 currentHistoryString.clear();
1399 currentHistoryString = oneCurrentItem ->m_historystrings;
1400 currentHistoryScore.clear();
1401 currentHistoryScore = oneCurrentItem ->m_historyscores;
1402 currentStepNumber = oneCurrentItem ->m_stepnumber;
1404 leftFirstLoc = currentString.find(separator);
1405 currentleftBigrambase = currentString.left(leftFirstLoc);
1406 tempString = currentString.right(currentString.length() - leftFirstLoc - lenOfSeparator);
1407 leftSecondLoc = tempString.find(separator);
1409 if ( leftSecondLoc == -1)
1411 currentleftTrigrambase = currentString;
1413 else
1415 currentleftTrigrambase = currentString.left(leftFirstLoc + lenOfSeparator + leftSecondLoc);
1419 rightFirstLoc = currentString.findRev(separator);
1420 currentrightBigrambase = currentString.right(currentString.length() - rightFirstLoc - lenOfSeparator);
1421 tempString = currentString.left(rightFirstLoc );
1422 rightSecondLoc = tempString.findRev(separator);
1424 if ( rightSecondLoc == -1)
1426 currentrightTrigrambase = currentString;
1428 else
1430 currentrightTrigrambase = currentString.right(currentString.length() - rightSecondLoc - lenOfSeparator);
1434 if (currentleftBigrambase == QString("#"))
1436 canExpandLeft = false;
1438 else
1440 canExpandLeft = true;
1444 if (currentrightBigrambase == QString("."))
1446 canExpandRight = false;
1448 else
1450 canExpandRight = true;
1454 if ( (!canExpandLeft) && (!canExpandRight))
1456 continue;
1459 for ( StringToIntIt = currentBagOfWords ->begin(); StringToIntIt != currentBagOfWords ->end(); StringToIntIt++)
1461 // For each possible word. Total : (N - M) loops
1463 QString onePossibleExpansion;
1464 QString leftExpansion;
1465 QString rightExpansion;
1466 double leftValue;
1467 double rightValue;
1469 onePossibleExpansion = StringToIntIt.key();
1472 // Try Left Expansion
1473 if (( onePossibleExpansion != QString(".")) && (canExpandLeft))
1475 if ((onePossibleExpansion == QString("#")) && (!canExpandRight) && (!lastWord))
1477 continue;
1481 leftExpansion = onePossibleExpansion + separator + currentString;
1483 if ( computeType == 1)
1485 QString oneTryBigram;
1486 double oneValue;
1488 oneTryBigram = onePossibleExpansion + separator + currentleftBigrambase;
1490 if (! m_bigramprob.contains(oneTryBigram))
1492 if (m_bigramsbase.contains(onePossibleExpansion))
1494 oneValue = 10.0; // big punishment
1496 else
1498 oneValue = 5.0; // mild punishment
1502 else
1504 oneValue = m_bigramprob[oneTryBigram];
1507 leftValue = currentValue + oneValue;
1510 else if ( computeType == 2)
1513 QString oneTryTrigram;
1514 QString oneTryTrigrambase;
1515 double oneValue;
1517 oneTryTrigrambase = onePossibleExpansion + separator + currentleftBigrambase;
1518 oneTryTrigram = onePossibleExpansion + separator + currentleftTrigrambase;
1522 if (! m_trigramprob.contains(oneTryTrigram))
1524 if (m_trigramsbase.contains(oneTryTrigrambase))
1526 oneValue = 10.0; // big punishment
1528 else
1530 oneValue = 5.0; // mild punishment
1534 else
1536 oneValue = m_trigramprob[oneTryTrigram];
1539 leftValue = currentValue + oneValue;
1542 else
1544 return;
1548 // create a sentenceItem
1549 oneTryBagOfWords = (*currentBagOfWords);
1550 if ( oneTryBagOfWords[onePossibleExpansion] > 1)
1552 oneTryBagOfWords[onePossibleExpansion]--;
1554 else
1556 oneTryBagOfWords.remove(onePossibleExpansion);
1559 oneSentenceItem = new sentenceItem(leftValue, leftExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
1561 tempResultKSentence.append(oneSentenceItem);
1566 // Try Right Expansion
1567 if ( (onePossibleExpansion != QString("#")) && (canExpandRight))
1569 if ((onePossibleExpansion == QString(".")) && (!canExpandLeft) && (!lastWord))
1571 continue;
1574 rightExpansion = currentString + separator + onePossibleExpansion;
1576 if ( computeType == 1)
1578 QString oneTryBigram;
1579 double oneValue;
1581 oneTryBigram = currentrightBigrambase + separator + onePossibleExpansion;
1583 if (! m_bigramprob.contains(oneTryBigram))
1585 if (m_bigramsbase.contains(currentrightBigrambase))
1587 oneValue = 10.0; // big punishment
1589 else
1591 oneValue = 5.0; // mild punishment
1595 else
1597 oneValue = m_bigramprob[oneTryBigram];
1600 rightValue = currentValue + oneValue;
1603 else if ( computeType == 2)
1606 QString oneTryTrigram;
1607 QString oneTryTrigrambase;
1608 double oneValue;
1610 oneTryTrigram = currentrightTrigrambase + separator + onePossibleExpansion;
1611 oneTryTrigrambase = currentrightTrigrambase;
1614 if (! m_trigramprob.contains(oneTryTrigram))
1616 if (m_trigramsbase.contains(oneTryTrigrambase))
1618 oneValue = 10.0; // big punishment
1620 else
1622 oneValue = 5.0; // mild punishment
1626 else
1628 oneValue = m_trigramprob[oneTryTrigram];
1631 rightValue = currentValue + oneValue;
1634 else
1636 return;
1640 // create a sentenceItem
1641 oneTryBagOfWords = (*currentBagOfWords);;
1642 if ( oneTryBagOfWords[onePossibleExpansion] > 1)
1644 oneTryBagOfWords[onePossibleExpansion]--;
1646 else
1648 oneTryBagOfWords.remove(onePossibleExpansion);
1651 oneSentenceItem = new sentenceItem(rightValue, rightExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
1653 tempResultKSentence.append(oneSentenceItem);
1661 tempResultKSentence.sort();
1664 // Delete the duplicate Items
1665 QString preString;
1666 QString postString;
1668 preString = QString("");
1669 swapResultKSentence.clear();
1670 for ( oneCurrentItem=tempResultKSentence.first(); oneCurrentItem != 0; oneCurrentItem=tempResultKSentence.next())
1672 postString = oneCurrentItem ->m_key;
1674 if (! (postString == preString) )
1676 swapResultKSentence.append(oneCurrentItem);
1677 preString = postString ;
1679 else
1681 delete oneCurrentItem;
1687 tempResultKSentence = swapResultKSentence;
1690 // Keep the top K item in this sentence list
1691 if ( static_cast <int> ( tempResultKSentence.count() ) > K)
1693 int diff;
1695 diff = tempResultKSentence.count() - K ;
1697 tempResultKSentence.setAutoDelete( TRUE );
1698 for ( j = 0; j< diff; j++)
1700 tempResultKSentence.removeLast();
1702 tempResultKSentence.setAutoDelete( FALSE );
1705 // copy to ResultKSentence
1706 resultKSentences.clear();
1707 resultKSentences = tempResultKSentence;
1717 void CSequencer::sequenize2(
1718 QMap<QString, int> allBagOfWords,
1719 int lenOfSentence,
1720 int K,
1721 Q3SortedList<sentenceItem>& resultKSentences,
1722 int computeType)
1724 // computeType: 1 --> bigram; 2 --> trigram
1726 QMap<QString, int> testBagOfWords;
1727 QMap<QString, int> testBagOfWords2;
1728 QMap<QString, int>::Iterator StringToIntIt;
1729 double currentValue, tryValue;
1730 QString currentString, tryString;
1731 QString currentleftBigrambase, currentrightBigrambase;
1732 QString currentleftTrigrambase, currentrightTrigrambase;
1733 QString tryleftBigrambase, tryrightBigrambase;
1734 QString tryleftTrigrambase, tryrightTrigrambase;
1735 QMap<QString, int>* currentBagOfWords;
1736 QMap<QString, int>* tryBagOfWords;
1737 QString tempString;
1738 int leftFirstLoc, rightFirstLoc;
1739 int leftSecondLoc, rightSecondLoc;
1740 int lenOfSeparator = separator.length();
1741 QMap<QString, int> oneTryBagOfWords;
1742 Q3SortedList<sentenceItem> tempResultKSentence;
1743 Q3SortedList<sentenceItem> swapResultKSentence;
1744 Q3SortedList<sentenceItem> copyResultKSentence;
1745 sentenceItem* oneSentenceItem;
1746 sentenceItem* oneCurrentItem;
1747 sentenceItem* twoCurrentItem;
1748 int i,j;
1749 bool canExpandLeft, canExpandRight;
1750 bool canExpandLeft2, canExpandRight2;
1751 bool lastWord;
1752 bool myLastWord;
1753 QMap<int, QString> currentHistoryString, tryHistoryString;
1754 QMap<int, double> currentHistoryScore, tryHistoryScore;
1755 int currentStepNumber;
1756 int currentNumberOfWordsInSentence, tryNumberOfWordsInSentence;
1757 bool shouldFurther;
1758 bool debug = false;
1759 bool deleteduplicatesentence = false;
1762 resultKSentences.setAutoDelete( TRUE );
1763 tempResultKSentence.setAutoDelete( FALSE );
1764 copyResultKSentence.setAutoDelete( FALSE );
1765 swapResultKSentence.setAutoDelete( FALSE );
1768 if ( debug)
1770 QFile file( "SequencerLog2Debug.txt" );
1772 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
1774 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1775 return;
1778 Q3TextStream outf( &file );
1779 QString displayTempString;
1782 outf << "******One Sentence******" << endl <<endl;
1784 file.close();
1788 // At most Loop for lenOfSentence( abc doesn't count the beginning "#" and ending ".")
1789 // Each loop refers to expand one step
1790 lastWord = false;
1791 for ( i=0; i <lenOfSentence; i++)
1794 shouldFurther = false;
1796 // for each expansion. Total : N loops
1798 if ( i == lenOfSentence -1)
1800 lastWord = true;
1802 else
1804 lastWord = false;
1808 if ( debug)
1810 QFile file( "SequencerLog2Debug.txt" );
1812 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
1814 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1815 return;
1818 Q3TextStream outf( &file );
1819 QString displayTempString;
1823 outf << endl << "******" << i <<"******" << endl;
1825 file.close();
1829 tempResultKSentence.clear();
1831 copyResultKSentence = resultKSentences;
1833 for ( oneCurrentItem=resultKSentences.first(); oneCurrentItem != 0; oneCurrentItem=resultKSentences.next())
1835 // for each current string. Total: K loops
1837 currentString = oneCurrentItem ->m_key;
1838 currentValue = oneCurrentItem ->m_value; // Now, this is an average value;
1839 currentBagOfWords = &(oneCurrentItem ->m_bagofwords);
1840 currentHistoryString.clear();
1841 currentHistoryString = oneCurrentItem ->m_historystrings;
1842 currentHistoryScore.clear();
1843 currentHistoryScore = oneCurrentItem ->m_historyscores;
1844 currentStepNumber = oneCurrentItem ->m_stepnumber;
1845 currentNumberOfWordsInSentence = oneCurrentItem ->m_numberofwordsinsentence;
1849 if ( debug)
1852 QFile file( "SequencerLog2Debug.txt" );
1854 if ( !file.open( QIODevice::WriteOnly | QIODevice::Append ) )
1856 QMessageBox::information(NULL, "Error", "Can't Open the file!", "OK");
1857 return;
1860 Q3TextStream outf( &file );
1861 QString displayTempString;
1864 displayTempString = currentString;
1865 displayTempString = displayTempString.replace(separator, " ");
1866 outf << displayTempString << " " << currentValue <<endl;
1868 file.close();
1872 // Get the words of currentString
1873 testBagOfWords.clear();
1874 testBagOfWords = allBagOfWords;
1875 for ( StringToIntIt = currentBagOfWords ->begin(); StringToIntIt != currentBagOfWords ->end(); StringToIntIt++)
1877 QString oneInCurrentWords;
1879 oneInCurrentWords = StringToIntIt.key();
1881 if ( testBagOfWords[oneInCurrentWords] == 1)
1883 testBagOfWords.remove(oneInCurrentWords);
1885 else
1887 testBagOfWords[oneInCurrentWords]--;
1893 // this sentence already done
1894 if ( currentNumberOfWordsInSentence == (lenOfSentence +2))
1896 oneSentenceItem = new sentenceItem(oneCurrentItem);
1898 tempResultKSentence.append(oneSentenceItem);
1900 continue;
1903 myLastWord = lastWord;
1904 if ( currentNumberOfWordsInSentence == (lenOfSentence +1))
1906 myLastWord = true;
1911 // Figure out the bigrambase and trigrambase of the current string;
1912 leftFirstLoc = currentString.find(separator);
1913 currentleftBigrambase = currentString.left(leftFirstLoc);
1914 tempString = currentString.right(currentString.length() - leftFirstLoc - lenOfSeparator);
1915 leftSecondLoc = tempString.find(separator);
1917 if ( leftSecondLoc == -1)
1919 currentleftTrigrambase = currentString;
1921 else
1923 currentleftTrigrambase = currentString.left(leftFirstLoc + lenOfSeparator + leftSecondLoc);
1927 rightFirstLoc = currentString.findRev(separator);
1928 currentrightBigrambase = currentString.right(currentString.length() - rightFirstLoc - lenOfSeparator);
1929 tempString = currentString.left(rightFirstLoc );
1930 rightSecondLoc = tempString.findRev(separator);
1932 if ( rightSecondLoc == -1)
1934 currentrightTrigrambase = currentString;
1936 else
1938 currentrightTrigrambase = currentString.right(currentString.length() - rightSecondLoc - lenOfSeparator);
1942 if (currentleftBigrambase == QString("#"))
1944 canExpandLeft = false;
1946 else
1948 canExpandLeft = true;
1952 if (currentrightBigrambase == QString("."))
1954 canExpandRight = false;
1956 else
1958 canExpandRight = true;
1962 if ( (!canExpandLeft) && (!canExpandRight))
1964 continue;
1969 // Consider the single word in currentbagsofWords
1970 for ( StringToIntIt = currentBagOfWords ->begin(); StringToIntIt != currentBagOfWords ->end(); StringToIntIt++)
1972 // For each possible word. Total : (N - M) loops
1974 QString onePossibleExpansion;
1975 QString leftExpansion;
1976 QString rightExpansion;
1977 double leftValue;
1978 double rightValue;
1980 onePossibleExpansion = StringToIntIt.key();
1983 // Try Left Expansion
1984 if (( onePossibleExpansion != QString(".")) && (canExpandLeft))
1986 if ((onePossibleExpansion == QString("#")) && (!canExpandRight) && (!myLastWord))
1988 continue;
1992 leftExpansion = onePossibleExpansion + separator + currentString;
1994 if ( computeType == 1)
1996 QString oneTryBigram;
1997 double oneValue;
1999 oneTryBigram = onePossibleExpansion + separator + currentleftBigrambase;
2001 if (! m_bigramprob.contains(oneTryBigram))
2003 if (m_bigramsbase.contains(onePossibleExpansion))
2005 oneValue = 10.0; // big punishment
2007 else
2009 oneValue = 5.0; // mild punishment
2013 else
2015 oneValue = m_bigramprob[oneTryBigram];
2018 leftValue = currentValue*currentNumberOfWordsInSentence + oneValue;
2021 else if ( computeType == 2)
2024 QString oneTryTrigram;
2025 QString oneTryTrigrambase;
2026 double oneValue;
2028 oneTryTrigrambase = onePossibleExpansion + separator + currentleftBigrambase;
2029 oneTryTrigram = onePossibleExpansion + separator + currentleftTrigrambase;
2033 if (! m_trigramprob.contains(oneTryTrigram))
2035 if (m_trigramsbase.contains(oneTryTrigrambase))
2037 oneValue = 10.0; // big punishment
2039 else
2041 oneValue = 5.0; // mild punishment
2045 else
2047 oneValue = m_trigramprob[oneTryTrigram];
2050 leftValue = currentValue*currentNumberOfWordsInSentence + oneValue;
2053 else
2055 return;
2059 // create a sentenceItem
2060 oneTryBagOfWords = (*currentBagOfWords);
2061 if ( oneTryBagOfWords[onePossibleExpansion] > 1)
2063 oneTryBagOfWords[onePossibleExpansion]--;
2065 else
2067 oneTryBagOfWords.remove(onePossibleExpansion);
2070 oneSentenceItem = new sentenceItem(leftValue, leftExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
2071 oneSentenceItem ->m_numberofwordsinsentence = currentNumberOfWordsInSentence + 1;
2072 oneSentenceItem ->m_value = oneSentenceItem ->m_value / (double)oneSentenceItem ->m_numberofwordsinsentence;
2074 tempResultKSentence.append(oneSentenceItem);
2075 shouldFurther = true;
2080 // Try Right Expansion
2081 if ( (onePossibleExpansion != QString("#")) && (canExpandRight))
2083 if ((onePossibleExpansion == QString(".")) && (!canExpandLeft) && (!myLastWord))
2085 continue;
2088 rightExpansion = currentString + separator + onePossibleExpansion;
2090 if ( computeType == 1)
2092 QString oneTryBigram;
2093 double oneValue;
2095 oneTryBigram = currentrightBigrambase + separator + onePossibleExpansion;
2097 if (! m_bigramprob.contains(oneTryBigram))
2099 if (m_bigramsbase.contains(currentrightBigrambase))
2101 oneValue = 10.0; // big punishment
2103 else
2105 oneValue = 5.0; // mild punishment
2109 else
2111 oneValue = m_bigramprob[oneTryBigram];
2114 rightValue = currentValue*currentNumberOfWordsInSentence + oneValue;
2117 else if ( computeType == 2)
2120 QString oneTryTrigram;
2121 QString oneTryTrigrambase;
2122 double oneValue;
2124 oneTryTrigram = currentrightTrigrambase + separator + onePossibleExpansion;
2125 oneTryTrigrambase = currentrightTrigrambase;
2128 if (! m_trigramprob.contains(oneTryTrigram))
2130 if (m_trigramsbase.contains(oneTryTrigrambase))
2132 oneValue = 10.0; // big punishment
2134 else
2136 oneValue = 5.0; // mild punishment
2140 else
2142 oneValue = m_trigramprob[oneTryTrigram];
2145 rightValue = currentValue*currentNumberOfWordsInSentence + oneValue;
2148 else
2150 return;
2154 // create a sentenceItem
2155 oneTryBagOfWords = (*currentBagOfWords);;
2156 if ( oneTryBagOfWords[onePossibleExpansion] > 1)
2158 oneTryBagOfWords[onePossibleExpansion]--;
2160 else
2162 oneTryBagOfWords.remove(onePossibleExpansion);
2165 oneSentenceItem = new sentenceItem(rightValue, rightExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
2166 oneSentenceItem ->m_numberofwordsinsentence = currentNumberOfWordsInSentence + 1;
2167 oneSentenceItem ->m_value = oneSentenceItem ->m_value / (double)oneSentenceItem ->m_numberofwordsinsentence;
2169 tempResultKSentence.append(oneSentenceItem);
2170 shouldFurther = true;
2178 // If this sentence Item only need one more word, not necessary to consider other chunks;
2179 //myLastWord = true;
2181 if ( myLastWord)
2183 continue;
2186 // Consider the possible chunk concatenance;
2188 for ( twoCurrentItem=copyResultKSentence.first(); twoCurrentItem != 0; twoCurrentItem=copyResultKSentence.next())
2190 QString leftExpansion;
2191 QString rightExpansion;
2192 double leftValue;
2193 double rightValue;
2194 bool overlapped;
2197 tryString = twoCurrentItem ->m_key;
2198 tryValue = twoCurrentItem ->m_value; // Now, this is an average value;
2199 tryBagOfWords = &(twoCurrentItem ->m_bagofwords);
2200 tryHistoryString.clear();
2201 tryHistoryString = twoCurrentItem ->m_historystrings;
2202 tryHistoryScore.clear();
2203 tryHistoryScore = twoCurrentItem ->m_historyscores;
2204 tryNumberOfWordsInSentence = twoCurrentItem ->m_numberofwordsinsentence;
2207 // Too long
2208 if ( (currentNumberOfWordsInSentence + tryNumberOfWordsInSentence -2) > lenOfSentence)
2210 continue;
2214 // Figure out the bigrambase and trigrambase of the try string;
2215 leftFirstLoc = tryString.find(separator);
2216 tryleftBigrambase = tryString.left(leftFirstLoc);
2217 tempString = tryString.right(tryString.length() - leftFirstLoc - lenOfSeparator);
2218 leftSecondLoc = tempString.find(separator);
2220 if ( leftSecondLoc == -1)
2222 tryleftTrigrambase = tryString;
2224 else
2226 tryleftTrigrambase = tryString.left(leftFirstLoc + lenOfSeparator + leftSecondLoc);
2230 rightFirstLoc = tryString.findRev(separator);
2231 tryrightBigrambase = tryString.right(tryString.length() - rightFirstLoc - lenOfSeparator);
2232 tempString = tryString.left(rightFirstLoc );
2233 rightSecondLoc = tempString.findRev(separator);
2235 if ( rightSecondLoc == -1)
2237 tryrightTrigrambase = tryString;
2239 else
2241 tryrightTrigrambase = tryString.right(tryString.length() - rightSecondLoc - lenOfSeparator);
2245 // Do quick check in order to save time
2247 if (tryleftBigrambase == QString("#"))
2249 canExpandLeft2 = false;
2251 else
2253 canExpandLeft2 = true;
2257 if (tryrightBigrambase == QString("."))
2259 canExpandRight2 = false;
2261 else
2263 canExpandRight2 = true;
2268 if ( !canExpandLeft && !canExpandRight)
2270 continue;
2273 if ( !canExpandLeft2 && !canExpandRight2)
2275 continue;
2278 if ( (canExpandLeft && !canExpandRight) && (!canExpandRight2 && canExpandLeft2))
2280 continue;
2283 if ( (!canExpandLeft && canExpandRight) && (canExpandRight2 && !canExpandLeft2))
2285 continue;
2288 if ( (canExpandLeft && canExpandRight) && (canExpandRight2 && canExpandLeft2))
2290 if (( currentNumberOfWordsInSentence + tryNumberOfWordsInSentence) > lenOfSentence)
2292 continue;
2297 if ( (canExpandLeft && !canExpandRight) && (canExpandRight2 && !canExpandLeft2))
2299 if ( (currentNumberOfWordsInSentence + tryNumberOfWordsInSentence -2) < lenOfSentence)
2301 continue;
2305 if ( (canExpandLeft2 && !canExpandRight2) && (canExpandRight && !canExpandLeft))
2307 if ( (currentNumberOfWordsInSentence + tryNumberOfWordsInSentence -2) < lenOfSentence)
2309 continue;
2314 // Check wether the two strings overlap some common words.
2315 testBagOfWords2.clear();
2316 testBagOfWords2 = (*tryBagOfWords);
2318 overlapped = false;
2319 for ( StringToIntIt = testBagOfWords.begin(); StringToIntIt != testBagOfWords.end(); StringToIntIt++)
2321 QString oneInTestWords;
2322 int oneInTestWordCount;
2325 oneInTestWords = StringToIntIt.key();
2326 oneInTestWordCount = StringToIntIt.data();
2328 if (! testBagOfWords2.contains(oneInTestWords))
2330 overlapped = true;
2331 break;
2334 if ( testBagOfWords2[oneInTestWords] < oneInTestWordCount)
2336 overlapped = true;
2337 break;
2339 else if ( testBagOfWords2[oneInTestWords] == oneInTestWordCount)
2341 testBagOfWords2.remove(oneInTestWords);
2343 else
2345 testBagOfWords2[oneInTestWords] -= oneInTestWordCount;
2350 if ( overlapped) continue;
2353 // Now, these two chunks are ready to merge
2354 oneTryBagOfWords.clear();
2355 oneTryBagOfWords = testBagOfWords2;
2358 // Try Left Merge
2360 if ( canExpandLeft && canExpandRight2)
2363 leftExpansion = tryString + separator + currentString;
2365 if ( computeType == 1)
2367 QString oneTryBigram;
2368 double oneValue;
2370 oneTryBigram = tryrightBigrambase + separator + currentleftBigrambase;
2372 if (! m_bigramprob.contains(oneTryBigram))
2374 if (m_bigramsbase.contains(tryrightBigrambase))
2376 oneValue = 10.0; // big punishment
2378 else
2380 oneValue = 5.0; // mild punishment
2384 else
2386 oneValue = m_bigramprob[oneTryBigram];
2389 leftValue = currentValue*currentNumberOfWordsInSentence + oneValue + tryValue*tryNumberOfWordsInSentence;
2392 else if ( computeType == 2)
2395 QString oneTryTrigram;
2396 QString oneTryTrigrambase;
2397 double oneValue, twoValue;
2400 oneTryTrigrambase = tryrightBigrambase + separator + currentleftBigrambase;
2401 oneTryTrigram = tryrightBigrambase + separator + currentleftTrigrambase;
2405 if (! m_trigramprob.contains(oneTryTrigram))
2407 if (m_trigramsbase.contains(oneTryTrigrambase))
2409 oneValue = 10.0; // big punishment
2411 else
2413 oneValue = 5.0; // mild punishment
2417 else
2419 oneValue = m_trigramprob[oneTryTrigram];
2423 // Special here, one more trigram are taken in.
2424 oneTryTrigrambase = tryrightTrigrambase;
2425 oneTryTrigram = tryrightTrigrambase + separator + currentleftBigrambase;
2428 if (! m_trigramprob.contains(oneTryTrigram))
2430 if (m_trigramsbase.contains(oneTryTrigrambase))
2432 twoValue = 10.0; // big punishment
2434 else
2436 twoValue = 5.0; // mild punishment
2440 else
2442 twoValue = m_trigramprob[oneTryTrigram];
2446 leftValue = currentValue*currentNumberOfWordsInSentence + oneValue + twoValue + tryValue*tryNumberOfWordsInSentence;
2449 else
2451 return;
2454 // create a sentenceItem
2456 // Debug
2457 for ( StringToIntIt = oneTryBagOfWords.begin(); StringToIntIt != oneTryBagOfWords.end(); StringToIntIt++)
2459 QString oneInTestWords;
2460 int oneInTestWordCount;
2462 oneInTestWords = StringToIntIt.key();
2463 oneInTestWordCount = 1;
2470 oneSentenceItem = new sentenceItem(leftValue, leftExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
2471 oneSentenceItem ->m_numberofwordsinsentence = currentNumberOfWordsInSentence + tryNumberOfWordsInSentence;
2472 oneSentenceItem ->m_value = oneSentenceItem ->m_value / (double)oneSentenceItem ->m_numberofwordsinsentence;
2474 tempResultKSentence.append(oneSentenceItem);
2475 shouldFurther = true;
2480 // Try Right Merge
2482 if ( canExpandRight && canExpandLeft2)
2485 rightExpansion = currentString + separator + tryString;
2487 if ( computeType == 1)
2489 QString oneTryBigram;
2490 double oneValue;
2492 oneTryBigram = currentrightBigrambase + separator + tryleftBigrambase;
2494 if (! m_bigramprob.contains(oneTryBigram))
2496 if (m_bigramsbase.contains(tryrightBigrambase))
2498 oneValue = 10.0; // big punishment
2500 else
2502 oneValue = 5.0; // mild punishment
2506 else
2508 oneValue = m_bigramprob[oneTryBigram];
2511 rightValue = currentValue*currentNumberOfWordsInSentence + oneValue + tryValue*tryNumberOfWordsInSentence;
2514 else if ( computeType == 2)
2517 QString oneTryTrigram;
2518 QString oneTryTrigrambase;
2519 double oneValue, twoValue;
2522 oneTryTrigrambase = currentrightBigrambase + separator + tryleftBigrambase;
2523 oneTryTrigram = currentrightBigrambase + separator + tryleftTrigrambase;
2526 if (! m_trigramprob.contains(oneTryTrigram))
2528 if (m_trigramsbase.contains(oneTryTrigrambase))
2530 oneValue = 10.0; // big punishment
2532 else
2534 oneValue = 5.0; // mild punishment
2538 else
2540 oneValue = m_trigramprob[oneTryTrigram];
2544 // Special here, one more trigram are taken in.
2545 oneTryTrigrambase = currentrightTrigrambase;
2546 oneTryTrigram = currentrightTrigrambase + separator + tryleftBigrambase;
2549 if (! m_trigramprob.contains(oneTryTrigram))
2551 if (m_trigramsbase.contains(oneTryTrigrambase))
2553 twoValue = 10.0; // big punishment
2555 else
2557 twoValue = 5.0; // mild punishment
2561 else
2563 twoValue = m_trigramprob[oneTryTrigram];
2567 rightValue = currentValue*currentNumberOfWordsInSentence + oneValue + twoValue + tryValue*tryNumberOfWordsInSentence;
2570 else
2572 return;
2575 // create a sentenceItem
2576 oneSentenceItem = new sentenceItem(rightValue, rightExpansion, oneTryBagOfWords, currentStepNumber+1, currentHistoryString, currentHistoryScore);
2577 oneSentenceItem ->m_numberofwordsinsentence = currentNumberOfWordsInSentence + tryNumberOfWordsInSentence;
2578 oneSentenceItem ->m_value = oneSentenceItem ->m_value / (double)oneSentenceItem ->m_numberofwordsinsentence;
2580 tempResultKSentence.append(oneSentenceItem);
2581 shouldFurther = true;
2592 tempResultKSentence.sort();
2594 // Delete the duplicate Items
2596 if ( deleteduplicatesentence )
2598 QString preString;
2599 QString postString;
2601 preString = QString("");
2602 swapResultKSentence.clear();
2603 for ( oneCurrentItem=tempResultKSentence.first(); oneCurrentItem != 0; oneCurrentItem=tempResultKSentence.next())
2605 postString = oneCurrentItem ->m_key;
2607 if (! (postString == preString) )
2609 swapResultKSentence.append(oneCurrentItem);
2610 preString = postString ;
2612 else
2614 delete oneCurrentItem;
2620 tempResultKSentence = swapResultKSentence;
2624 // Keep the top K item in this sentence list
2625 if ( static_cast <int> ( tempResultKSentence.count() ) > K)
2627 int diff;
2629 diff = tempResultKSentence.count() - K ;
2631 tempResultKSentence.setAutoDelete( TRUE );
2632 for ( j = 0; j< diff; j++)
2634 tempResultKSentence.removeLast();
2636 tempResultKSentence.setAutoDelete( FALSE );
2639 // copy to ResultKSentence
2640 resultKSentences.clear();
2641 resultKSentences = tempResultKSentence;
2643 if ( !shouldFurther)
2645 break;