CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / linguisticamainwindow_goldstandard.cpp
blobe280bad9b575e22c18a21a749a1fea07d8e3d74f
1 // Handling requests to compare the analyzed corpus to a gold standard
2 // Copyright © 2009 The University of Chicago
3 #include "linguisticamainwindow.h"
5 #include <QMessageBox>
6 #include <QDomDocument>
7 #include <QDomElement>
8 #include <Q3ValueList>
9 #include "Lexicon.h"
10 #include "CorpusWord.h"
11 #include "Stem.h"
12 #include "CorpusWordCollection.h"
13 #include "TemplateCollection.h"
14 #include "Typedefs.h"
// Convenience aliases for the QMap instantiations used by the gold
// standard comparison code in this file.

// Associates an integer with each parse (keyed by CParse pointer).
typedef QMap<CParse*,int> ParseToInt;
// Looks up a parse by string; compareGoldStdSlot() fills these maps with
// a word's display form as key and its analysis as value.
typedef QMap<QString,CParse*> StringToParse;
// Looks up a ParseToInt table (held by pointer) by string.
typedef QMap<QString,ParseToInt*> StringToParseToInt;
// Looks up a StringToInt table (held by pointer) by string.
// NOTE(review): StringToInt is not declared in this file; presumably it
// comes from Typedefs.h -- confirm.
typedef QMap<QString,StringToInt*> StringToStringToInt;
21 void LinguisticaMainWindow::changeGoldStdFileSlot()
23 m_goldStdFileName = Q3FileDialog::getOpenFileName( m_goldStdFileName,
24 "XML files (*.xml)",
25 this,
26 "Linguistica :: New Gold Standard File",
27 "Choose a new gold standard file" );
28 if( !m_goldStdFileName.isEmpty() )
29 m_Settings.writeEntry( "/linguistica.uchicago.edu/Linguistica/MainWindow/DiagnosticsMenu/NewGoldStdFile", m_goldStdFileName );
34 void LinguisticaMainWindow::compareCompoundsSlot()
37 QString word;
38 CStem* pStem;
39 StringToPtrCStem goldStdCompounds, goldStd;
41 QString goldStdFileName;
43 goldStdFileName = QFileDialog::getOpenFileName( m_projectDirectory,
44 "XML Files (*.xml)",
45 this,
46 "open file dialog",
47 "Choose a gold standard:" );
49 if( !goldStdFileName.isEmpty() )
51 QFile goldStdFile( goldStdFileName );
52 if( goldStdFile.open( IO_ReadOnly ) )
54 QDomDocument doc( "Goldstandard" );
56 if( !doc.setContent( &goldStdFile ) )
58 goldStdFile.close();
59 return;
62 QDomElement root = doc.documentElement();
64 QString tagName = root.tagName();
66 if( root.tagName() != "GDS")
68 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
69 goldStdFile.close();
70 return;
73 // Read the header
74 QDomNode header = root.firstChild();
76 QDomElement direction = header.nextSibling().toElement();
78 // Read all content
79 QDomNode contentnode = direction.nextSibling();
80 QDomElement content = contentnode.toElement();
82 tagName = content.tagName();
83 if( tagName != "content" )
85 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
86 goldStdFile.close();
87 return;
90 QString value;
91 int supposedTotalNumberOfWords;
92 QString key;
93 QString wordComment;
94 int numberOfPieces;
95 int start;
96 int type;
97 int color;
98 int score;
99 int rootCount;
100 QString comment, allomorph;
102 value = content.attribute( "number", "100" );
103 supposedTotalNumberOfWords = value.toInt();
105 QDomNode onewordnode = content.firstChild();
107 while (!onewordnode.isNull())
109 QDomElement oneword = onewordnode.toElement();
110 if( !oneword.isNull() )
112 if( oneword.tagName() == "word" )
114 key = oneword.attribute( "key", "" );
115 if (key == "")
117 onewordnode = onewordnode.nextSibling();
118 continue;
121 word = key;
122 pStem = new CStem( key );
123 goldStd.insert( word, pStem );
125 value = oneword.attribute("morphemes", "0");
126 numberOfPieces = value.toInt();
128 // Looking for goldStdCompounds, must be length 2
129 if( numberOfPieces < 2 )
131 onewordnode = onewordnode.nextSibling();
132 continue;
135 wordComment = oneword.attribute( "comment", "" );
137 QDomNode onepiecenode = oneword.firstChild();
139 rootCount = 0; // Need at least two roots to be a compound
141 while( !onepiecenode.isNull() )
143 QDomElement onepiece = onepiecenode.toElement();
144 if( !onepiece.isNull() )
146 if( onepiece.tagName() == "morpheme" )
148 value = onepiece.attribute("start", "-1");
149 start = value.toInt();
150 value = onepiece.attribute("type", "-1");
151 type = value.toInt();
154 if( type == 0 || word[start] == '-' )
156 goldStd.remove( word );
157 goldStdCompounds.remove( word );
158 if( pStem ) delete pStem;
159 pStem = NULL;
160 break;
163 if( type == 1 ) rootCount++;
164 if( rootCount >= 2 ) goldStdCompounds.insert( word, pStem );
166 value = onepiece.attribute("color", "-2");
167 color = value.toInt();
168 value = onepiece.attribute("score", "0");
169 score = value.toInt();
170 allomorph = onepiece.attribute("allomorph", "");
171 comment = onepiece.attribute("comment", "");
173 if( score != 1 )
175 goldStd.remove( word );
176 goldStdCompounds.remove( word );
177 if( pStem ) delete pStem;
178 pStem = NULL;
179 break;
182 if( (start == -1) || (type == -1) || (color == -2) )
184 onepiecenode = onepiecenode.nextSibling();
185 continue;
188 pStem->CutRightBeforeHere( start );
192 onepiecenode = onepiecenode.nextSibling();
195 if( rootCount < 2 )
197 goldStdCompounds.remove( word );
202 onewordnode = onewordnode.nextSibling();
205 goldStdFile.close();
207 else
209 QMessageBox::information( NULL, "Attention", "Unable to open " + goldStdFileName + " .", "OK" );
210 return;
213 else return;
215 CMiniLexicon* mini = m_lexicon->GetMiniLexicon( m_lexicon->GetActiveMiniIndex() );
216 CWordCollection* pWords = mini->GetWords();
217 CCompoundCollection* pCompounds = mini->GetCompounds();
218 CStem* pWord, * qWord;
219 CCompound* pCompound;
220 CSuffixCollection* pSuffixes = NULL;
221 CPrefixCollection* pPrefixes = NULL;
222 CSuffix* pSuffix;
223 CPrefix* pPrefix;
224 int truePos = 0, trueNeg = 0, falsePos = 0, falseNeg = 0;
226 QString outputFileName;
228 outputFileName = QFileDialog::getOpenFileName( m_projectDirectory,
229 "Text Files (*.txt)",
230 this,
231 "open file dialog",
232 "Choose an output file:" );
234 if( !outputFileName.isEmpty() )
236 QFile outputFile( outputFileName );
237 if( outputFile.open( IO_WriteOnly ) )
239 QTextStream out( &outputFile );
240 out.setEncoding( QTextStream::Unicode );
242 pWords->Sort(KEY);
244 out.setf(2);
246 out.width(20);
247 out << "WORD";
249 out.width(2);
250 out << " ";
252 out.width(20);
253 out << "GOLD STD";
255 out.width(2);
256 out << " ";
258 out.width(20);
259 out << "COMPOUND";
261 out.width(2);
262 out << " ";
264 out.width(1);
265 out << "P";
267 out.width(2);
268 out << " ";
270 out.width(1);
271 out << "S" << endl;
273 for( int i = 0; i < pWords->GetCount(); i++ )
275 bool notInGS = FALSE;
277 pWord = pWords->GetAtSort(i);
279 pCompound = *pCompounds ^= pWord->GetKey();
281 if( !pCompound )
283 pStem = pWord->GetStemPtr();
284 if( pStem ) pCompound = *pCompounds ^= pStem->GetKey();
287 if( goldStdCompounds.find( pWord->Display() ) != goldStdCompounds.end() )
289 qWord = goldStdCompounds[ pWord->Display() ];
291 else qWord = NULL;
293 if( goldStd.find( pWord->Display() ) == goldStd.end() )
295 notInGS = TRUE;
297 else
299 if( qWord && pCompound ) truePos++;
300 if( qWord && !pCompound ) falseNeg++;
301 if( !qWord && pCompound ) falsePos++;
303 if( !qWord && !pCompound )
305 trueNeg++;
309 if( !qWord && !pCompound )
311 continue;
314 if( notInGS )
316 out.width(3);
317 out << "* ";
320 out.width(20);
321 out << pWord->Display();
323 out.width(2);
324 out << " ";
326 out.width(20);
327 if( qWord ) out << qWord->Display('+');
328 else out << " ";
330 out.width(2);
331 out << " ";
333 out.width(20);
334 if( pCompound ) out << pCompound->Display('+');
335 else out << " ";
337 out.width(2);
338 out << " ";
340 out.width(1);
341 CMiniLexicon* mini2;
342 int j;
343 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
345 mini2 = m_lexicon->GetMiniLexicon(j);
346 if( mini2 )
348 pPrefixes = mini2->GetPrefixes();
350 if( pCompound && pPrefixes )
352 pPrefix = (*pPrefixes) ^= pCompound->GetPiece( 1 );
354 if( pPrefix )
356 out << "*";
357 break;
363 out.width(2);
364 out << " ";
366 out.width(1);
367 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
369 mini2 = m_lexicon->GetMiniLexicon(j);
370 if( mini2 )
372 pSuffixes = mini2->GetSuffixes();
374 if( pCompound && pSuffixes )
376 pSuffix = (*pSuffixes) ^= pCompound->GetPiece( 1 );
378 if( pSuffix )
380 out << "*";
381 break;
387 out << endl;
390 double precision = (double) truePos / (double)( truePos + falsePos );
391 double recall = (double) truePos / (double)( truePos + falseNeg );
393 out << endl << QString( "True Positive Count = %1" ).arg( truePos ) << endl;
394 out << QString( "True Negative Count = %1" ).arg( trueNeg ) << endl;
395 out << QString( "False Positive Count = %1" ).arg( falsePos ) << endl;
396 out << QString( "False Negative Count = %1" ).arg( falseNeg ) << endl;
397 out << endl << QString( "Precision = %1" ).arg( precision ) << endl;
398 out << QString( "Recall = %1" ).arg( recall ) << endl;
399 out << QString( "F-Score = %1" ).arg( ( 2.0 * precision * recall ) / ( precision + recall ) ) << endl;
401 outputFile.close();
408 void LinguisticaMainWindow::compareGoldStdSlot()
410 StringToCStemList goldStdWords;
411 StringToCStemList::Iterator goldStdWordsIt;
412 CStemList stemList;
414 QString goldStdFileName = Q3FileDialog::getOpenFileName( m_projectDirectory,
415 "XML Files (*.xml)",
416 this,
417 "open file dialog",
418 "Choose a gold standard file to open" );
419 if( !goldStdFileName.isEmpty() )
422 QFile goldStdFile( goldStdFileName );
423 if( goldStdFile.open( QIODevice::ReadOnly ) )
426 QDomDocument doc( "Alchemist" ), author_data, document_data;
428 QString errorMsg;
429 int errorLine, errorColumn;
430 if( !doc.setContent( &goldStdFile, &errorMsg, &errorLine, &errorColumn ) )
432 //Maybe we should put this back in.
433 // QMessageBox::warning( this, "Gold Standard : XML Error",
434 // QString( errorMsg + "\nLine: %1" + "Col: %2" ).arg( errorLine ).arg( errorColumn ), QMessageBox::Ok, NULL, NULL );
436 return;
439 QString feature_name;
441 QDomElement alchemist_doc, element, word, string, gloss,
442 morph, piece, notes, morpheme, allomorph,
443 lmnt, feature, name, feature_id, instance_id;
444 QDomNodeList nodes;
445 QDomNode node1, node2, node3, node4;
446 QDomText text;
448 CStem* pStem;
450 QString strStem;
452 bool skipWord = FALSE;
453 int pieceCount, lastEnd, start, length;
455 alchemist_doc = doc.documentElement();
457 if( alchemist_doc.tagName() != "alchemist-doc" )
459 errorMsg = "The XML document \"" + alchemist_doc.tagName() + "\" is not an alchemist document.";
460 QMessageBox::information( NULL, "Gold Standard : XML Error", errorMsg, "OK" );
461 return;
465 // Author data (optional)
466 node1 = alchemist_doc.firstChild();
467 if( !node1.isNull() && node1.isElement() && node1.nodeName() == "author-data" )
469 // Skip...
470 node1 = node1.nextSibling();
474 // Document data (optional)
475 if( !node1.isNull() && node1.isElement() && node1.nodeName() == "document-data" )
477 // Skip...
478 node1 = node1.nextSibling();
482 // Feature list first of morphology description
483 if( node1.isNull() || !node1.isElement() || node1.nodeName() != "feature-list" )
485 // TODO: add to error string
487 else
489 // Skip...
490 node1 = node1.nextSibling();
494 // Morpheme list second
495 if( node1.isNull() || !node1.isElement() || node1.nodeName() != "morpheme-list" )
497 // TODO: add to error string
499 else
501 // Skip...
502 node1 = node1.nextSibling();
506 // Word list last.. this is what we need!
507 if( node1.isNull() || !node1.isElement() || node1.nodeName() != "word-list" )
509 // TODO: add to error string
511 else
513 node2 = node1.firstChild();
515 while( !node2.isNull() &&
516 node2.isElement() &&
517 node2.nodeName() == "word" )
519 word = node2.toElement();
520 node2 = node2.nextSibling();
522 // score attribute
523 if( !word.hasAttribute( "score" ) )
525 // TODO: add to error string
526 continue;
528 else
530 if( word.attribute( "score" ) == "Not Scored" ) continue;
531 else if( word.attribute( "score" ) == "Certain" ); // we want to look at these words
532 else if( word.attribute( "score" ) == "Somewhat Certain" ) continue;
533 else if( word.attribute( "score" ) == "Uncertain" ) continue;
536 // string element
537 node3 = word.firstChild();
538 if( !node3.isElement() || node3.nodeName() != "string" )
540 // TODO: add to error message
541 continue;
543 string = node3.toElement();
545 // Make new gold standard word
546 strStem = string.text();
547 pStem = new CStem( strStem );
549 // gloss element
550 node3 = string.nextSibling();
551 if( node3.isElement() && node3.nodeName() == "gloss" )
553 // Skip...
554 node3 = node3.nextSibling();
557 // affix, root, and piece elements
558 skipWord = FALSE;
559 while( !node3.isNull() &&
560 node3.isElement() &&
561 ( node3.nodeName() == "piece" ||
562 node3.nodeName() == "affix" ||
563 node3.nodeName() == "root" ) &&
564 !skipWord )
566 if( node3.nodeName() == "affix" || node3.nodeName() == "root" )
568 morph = node3.toElement();
570 // string element
571 node4 = morph.firstChild();
572 if( node4.isElement() && node4.nodeName() == "string" )
574 string = node4.toElement();
576 // no need to do anything with this string
577 node4 = string.nextSibling();
579 else
581 // TODO: add to error string
584 // piece elements
585 pieceCount = 0;
586 lastEnd = -1;
587 while( !node4.isNull() && node4.isElement() && node4.nodeName() == "piece" )
589 piece = node4.toElement();
590 pieceCount++;
592 // Not handling multi-piece morphemes yet
593 if( pieceCount > 1 )
595 skipWord = TRUE;
596 delete pStem; pStem = NULL;
597 break;
600 if( !piece.hasAttribute( "start" ) ||
601 !piece.hasAttribute( "length" ) )
603 // TODO: add to error string
604 skipWord = TRUE;
605 delete pStem; pStem = NULL;
606 break;
609 // Maybe we'll want to handle these differently in the future
610 // so I am leaving the distinction
611 if( morph.tagName() == "affix" || morph.tagName() == "root" )
613 start = piece.attribute( "start" ).toInt();
614 length = piece.attribute( "length" ).toInt();
616 // Not handling overlapping morphemes yet
617 if( start <= lastEnd )
619 skipWord = TRUE;
620 delete pStem; pStem = NULL;
621 break;
623 lastEnd = start + length - 1;
625 pStem->CutRightBeforeHere( start );
628 node4 = node4.nextSibling();
631 else
633 // Word has unassigned pieces, skip...
634 skipWord = TRUE;
635 delete pStem; pStem = NULL;
638 node3 = node3.nextSibling();
641 if( skipWord ) continue;
644 // Add word to gold standard words
645 goldStdWordsIt = goldStdWords.find( strStem );
646 if( goldStdWordsIt == goldStdWords.end() )
648 // New word...
649 goldStdWordsIt = goldStdWords.insert( strStem, stemList );
650 //goldStdWordsIt.data().setAutoDelete( TRUE ); @@@ fix this, make sure there are no memory leaks created here.
652 goldStdWordsIt.data().append( pStem );
655 // notes element
656 if( !node3.isNull() && node3.isElement() && node3.nodeName() == "notes" )
658 // Skip...
659 node3 = node3.nextSibling();
662 if( !node3.isNull() )
664 // TODO: add to error string
669 else
671 QMessageBox::information( NULL, "Attention", "Unable to open " + goldStdFileName + " .", "OK" );
672 return;
675 goldStdFile.close();
677 else
679 return;
683 // Yu Hu's code
684 //----------------------------------------------------------------------
685 QString theWord;
686 CStem* theCStem;
687 CParse* theParse;
688 StringToParse* TempSedCuts;
689 StringToParse::Iterator StringToParseIt;
690 StringToParse SedCuts;
691 CCorpusWordCollection* TempSFCut;
692 StringToParse SFCuts;
693 CCorpusWord* theCorpusWord;
695 // Get the Lingustica analyses result SF or PF
696 if ( !m_lexicon) return;
697 TempSFCut = m_lexicon->GetWords();
699 TempSFCut->Sort( KEY );
701 for( int i = 0; i < TempSFCut->GetCount(); i++ )
703 theCorpusWord = TempSFCut->GetAt(i);
704 theWord = theCorpusWord->Display();
706 SFCuts.insert(theWord, theCorpusWord);
710 // Get SED analyses
711 if ( m_Words_Templates != NULL)
713 TempSedCuts = m_Words_Templates ->GetParsedResult();
714 for ( StringToParseIt = TempSedCuts ->begin(); StringToParseIt != TempSedCuts ->end(); StringToParseIt++)
716 theWord = StringToParseIt.key();
717 theParse = StringToParseIt.data();
718 theCStem = new CStem(*theParse);
720 SedCuts.insert(theWord, theCStem);
726 // Goldstandard comparison output
727 double TotalPrecision;
728 double TotalRecall;
729 double AveragePrecision;
730 double AverageRecall;
732 double Ftot=0.0;
733 double Fav=0.0;
734 QString outs;
736 ////////////// compute precision recall SF
738 GetMorphPrecisionRecallByWord( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
739 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
740 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
741 // print out precision recall
742 outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
744 QMessageBox::information ( NULL, "SF Morpheme Precision/Recall By Word", outs );
746 GetMorphPrecisionRecall( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
747 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
748 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
749 // print out precision recall
750 outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
752 QMessageBox::information ( NULL, "SF Morpheme Precision/Recall", outs );
754 GetCutPrecisionRecall( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
755 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
756 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
757 // print out precision recall
758 outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
760 QMessageBox::information ( NULL, "SF Cut Precision/Recall", outs );
766 ///////////////////////////////// SED
767 if ( m_Words_Templates != NULL)
770 GetMorphPrecisionRecallByWord( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
772 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
773 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
775 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
777 QMessageBox::information ( NULL, "SED Morpheme Precision/Recall By Word", outs );
779 GetMorphPrecisionRecall( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
781 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
782 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
784 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
786 QMessageBox::information ( NULL, "SED Morpheme Precision/Recall By Word", outs );
789 GetCutPrecisionRecall( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
791 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
792 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
794 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
796 QMessageBox::information ( NULL, "SED Cut Precision/Recall", outs );
802 void LinguisticaMainWindow::GetMorphPrecisionRecallByWord( StringToCStemList& goldStdWords,
803 StringToParse& lxaWords,
804 double& totalPrecision,
805 double& totalRecall,
806 double& averagePrecision,
807 double& averageRecall )
809 CParse* pGoldStdStem;
810 int* goldStdStemCuts;
811 int goldStdStemCutsPos;
813 CParse* pLxaStem;
814 int* lxaStemCuts;
815 int lxaStemCutsPos;
817 int totalNumLxaWordsCompared = 0;
818 int totalNumGSWordsCompared = 0;
820 int totalNumLxaMorphemes = 0;
821 int totalNumGSMorphemes = 0;
822 int totalNumCorrectMorphemes = 0;
823 int totalNumFoundMorphemes = 0; // For precision, see generalization notes below in function
825 averagePrecision = 0.0;
826 averageRecall = 0.0;
828 QString strWord;
830 StringToCStemList::Iterator goldStdIt;
831 for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
833 strWord = goldStdIt.key();
835 // We only look through words that exist in both spaces
836 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
837 pLxaStem = lxaWords.find( strWord ).data();
839 lxaStemCuts = pLxaStem->GetPieces();
841 int numLxaMorphemes,
842 numGSMorphemes,
843 numCorrectMorphemes;
845 totalNumLxaWordsCompared++;
847 // There may be duplicates in gold standard, we should consider all
848 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
850 numLxaMorphemes = 0;
851 numGSMorphemes = 0;
852 numCorrectMorphemes = 0;
854 totalNumGSWordsCompared++;
856 goldStdStemCuts = pGoldStdStem->GetPieces();
858 // The word strings should match now...
859 Q_ASSERT( pLxaStem->Display() == pGoldStdStem->Display() );
861 // Therefore we can look at the cuts to compare the morphemes
862 lxaStemCutsPos = goldStdStemCutsPos = 0;
863 while( lxaStemCutsPos < pLxaStem->Size() && goldStdStemCutsPos < pGoldStdStem->Size() )
865 if( lxaStemCuts[ lxaStemCutsPos ] == goldStdStemCuts[ goldStdStemCutsPos ] &&
866 lxaStemCuts[ lxaStemCutsPos + 1 ] == goldStdStemCuts[ goldStdStemCutsPos + 1 ] )
868 // Morphemes match, increment everything
869 numLxaMorphemes++;
870 numGSMorphemes++;
871 numCorrectMorphemes++;
873 // Move both positions
874 lxaStemCutsPos++;
875 goldStdStemCutsPos++;
877 else if( lxaStemCuts[ lxaStemCutsPos ] == goldStdStemCuts[ goldStdStemCutsPos ] )
879 if( lxaStemCuts[ lxaStemCutsPos + 1 ] < goldStdStemCuts[ goldStdStemCutsPos + 1 ] )
881 numLxaMorphemes++;
882 lxaStemCutsPos++;
884 else
886 numGSMorphemes++;
887 goldStdStemCutsPos++;
890 else
892 if( lxaStemCuts[ lxaStemCutsPos ] < goldStdStemCuts[ goldStdStemCutsPos ] )
894 numLxaMorphemes++;
895 lxaStemCutsPos++;
897 else
899 numGSMorphemes++;
900 goldStdStemCutsPos++;
905 // Handle remaining morphemes in either group
906 while( lxaStemCutsPos < pLxaStem->Size() )
908 numLxaMorphemes++;
909 lxaStemCutsPos++;
911 while( goldStdStemCutsPos < pGoldStdStem->Size() )
913 numGSMorphemes++;
914 goldStdStemCutsPos++;
917 averageRecall += ( (double) numCorrectMorphemes / (double) numGSMorphemes );
919 totalNumGSMorphemes += numGSMorphemes;
920 totalNumCorrectMorphemes += numCorrectMorphemes;
924 // Precision generalization: if Lxa finds a morpheme M in a word W, it
925 // gets credit for it if M appears in any of the analyses spelled W.
926 // From John's e-mail to Colin, July 27, 2006
928 numLxaMorphemes = 0;
929 int numFoundMorphemes = 0;
930 int piece = 1;
931 while( piece <= pLxaStem->Size() )
933 numLxaMorphemes++;
935 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
937 if( pGoldStdStem->Contains( pLxaStem->GetPiece( piece ) ) )
939 numFoundMorphemes++;
940 break;
944 piece++;
947 totalNumLxaMorphemes += numLxaMorphemes;
948 totalNumFoundMorphemes += numFoundMorphemes;
950 averagePrecision += ( (double) numFoundMorphemes / (double) numLxaMorphemes );
953 averagePrecision /= (double) totalNumLxaWordsCompared;
954 averageRecall /= (double) totalNumGSWordsCompared;
956 totalPrecision = (double) totalNumFoundMorphemes / (double) totalNumLxaMorphemes;
957 totalRecall = (double) totalNumCorrectMorphemes / (double) totalNumGSMorphemes;
961 void LinguisticaMainWindow::GetCutPrecisionRecall( StringToCStemList& goldStdWords,
962 StringToParse& lxaWords,
963 double& totalPrecision,
964 double& totalRecall,
965 double& averagePrecision,
966 double& averageRecall )
968 CParse* pGoldStdStem;
969 int* goldStdStemCuts;
970 int goldStdStemCutsPos;
972 CParse* pLxaStem;
973 int* lxaStemCuts;
974 int lxaStemCutsPos;
976 int totalNumLxaWordsCompared = 0;
977 int totalNumGSWordsCompared = 0;
979 int totalNumLxaCuts = 0;
980 int totalNumGSCuts = 0;
981 int totalNumCorrectCuts = 0;
982 int totalNumFoundCuts = 0; // Need different number for precision (using totalNumCorrectCuts for recall)
983 int totalNumOnePieceWords = 0; // One piece Lxa words are undefined for precision, we need to subtract when
984 // totalling
986 averagePrecision = 0.0;
987 averageRecall = 0.0;
989 QString strWord;
991 StringToCStemList::Iterator goldStdIt;
992 for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
994 strWord = goldStdIt.key();
996 // We only look through words that exist in both spaces
997 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
998 pLxaStem = lxaWords.find( strWord ).data();
1000 lxaStemCuts = pLxaStem->GetPieces();
1002 int numLxaCuts = 0,
1003 numGSCuts = 0,
1004 numCorrectCuts = 0,
1005 numFoundCuts = 0;
1007 totalNumLxaWordsCompared++;
1009 Q3ValueList<int> unionOfGSCuts;
1011 // There may be duplicates in gold standard, we need the union of all their cuts
1012 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
1014 totalNumGSWordsCompared++;
1016 goldStdStemCuts = pGoldStdStem->GetPieces();
1018 // The word strings should match here.
1019 Q_ASSERT( pLxaStem->Display() == pGoldStdStem->Display() );
1021 goldStdStemCutsPos = 0;
1022 while( goldStdStemCutsPos < pGoldStdStem->Size() )
1024 if( unionOfGSCuts.find( goldStdStemCuts[ goldStdStemCutsPos ] ) == unionOfGSCuts.end() )
1026 unionOfGSCuts.append( goldStdStemCuts[ goldStdStemCutsPos ] );
1028 goldStdStemCutsPos++;
1032 lxaStemCutsPos = 0;
1033 while( lxaStemCutsPos < pLxaStem->Size() )
1035 numLxaCuts++;
1037 if( unionOfGSCuts.find( lxaStemCuts[ lxaStemCutsPos ] ) != unionOfGSCuts.end() )
1039 numCorrectCuts++;
1040 numFoundCuts++;
1042 lxaStemCutsPos++;
1045 numGSCuts = unionOfGSCuts.count();
1047 averageRecall += ( (double) numCorrectCuts / (double) numGSCuts );
1049 totalNumGSCuts += numGSCuts;
1050 totalNumCorrectCuts += numCorrectCuts;
1052 if( pLxaStem->Size() < 2 )
1054 totalNumOnePieceWords++;
1056 Q_ASSERT( numFoundCuts == 1 && numLxaCuts == 1 );
1057 numFoundCuts--;
1058 numLxaCuts--;
1060 if( numFoundCuts < 0 ) numFoundCuts = 0;
1061 if( numLxaCuts < 0 ) numLxaCuts = 0;
1064 totalNumLxaCuts += numLxaCuts;
1065 totalNumFoundCuts += numFoundCuts;
1067 if( numLxaCuts > 0 ) averagePrecision += ( (double) numCorrectCuts / (double) numLxaCuts );
1070 averagePrecision /= (double) ( totalNumLxaWordsCompared - totalNumOnePieceWords );
1071 averageRecall /= (double) totalNumGSWordsCompared;
1073 totalPrecision = (double) totalNumFoundCuts / (double) totalNumLxaCuts;
1074 totalRecall = (double) totalNumCorrectCuts / (double) totalNumGSCuts;
1078 void LinguisticaMainWindow::GetMorphPrecisionRecall( StringToCStemList& goldStdWords,
1079 StringToParse& lxaWords,
1080 double& totalPrecision,
1081 double& totalRecall,
1082 double& averagePrecision,
1083 double& averageRecall )
1085 QStringList unionOfGoldStdMorphs,
1086 unionOfLxaMorphs;
1088 QString strWord, strPiece;
1090 CParse* pLxaStem, * pGoldStdStem;
1092 int i,
1093 totalNumLxaWordsCompared = 0,
1094 totalNumGSWordsCompared = 0,
1095 totalNumLxaMorphemes = 0,
1096 totalNumGSMorphemes = 0,
1097 totalNumCorrectMorphemes = 0;
1099 StringToCStemList::Iterator goldStdIt;
1100 for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
1102 strWord = goldStdIt.key();
1104 // We only look through words that exist in both spaces
1105 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
1106 pLxaStem = lxaWords.find( strWord ).data();
1108 totalNumLxaWordsCompared++;
1110 for( i = 1; i <= pLxaStem->Size(); i++ )
1112 strPiece = pLxaStem->GetPiece(i).Display();
1113 if( unionOfLxaMorphs.findIndex( strPiece ) == -1 )
1115 unionOfLxaMorphs.append( strPiece );
1119 // There may be duplicates in gold standard, we need the union of all their morphemes
1120 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
1122 totalNumGSWordsCompared++;
1124 for( i = 1; i <= pGoldStdStem->Size(); i++ )
1126 strPiece = pGoldStdStem->GetPiece(i).Display();
1127 if( unionOfGoldStdMorphs.findIndex( strPiece ) == -1 )
1129 unionOfGoldStdMorphs.append( strPiece );
1135 unionOfLxaMorphs.sort();
1136 unionOfGoldStdMorphs.sort();
1138 QStringList::Iterator lxaMorphsIt = unionOfLxaMorphs.begin(),
1139 goldStdMorphsIt = unionOfGoldStdMorphs.begin();
1141 while( lxaMorphsIt != unionOfLxaMorphs.end() &&
1142 goldStdMorphsIt != unionOfGoldStdMorphs.end() )
1144 if( *goldStdMorphsIt == *lxaMorphsIt )
1146 totalNumCorrectMorphemes++;
1147 totalNumLxaMorphemes++;
1148 totalNumGSMorphemes++;
1150 ++goldStdMorphsIt;
1151 ++lxaMorphsIt;
1153 else if( *goldStdMorphsIt > *lxaMorphsIt )
1155 totalNumLxaMorphemes++;
1157 ++lxaMorphsIt;
1159 else // *goldStdMorphsIt < *lxaMorphsIt
1161 totalNumGSMorphemes++;
1163 ++goldStdMorphsIt;
1167 totalPrecision = (double) totalNumCorrectMorphemes / (double) totalNumLxaMorphemes;
1168 totalRecall = (double) totalNumCorrectMorphemes / (double) totalNumGSMorphemes;
1170 averagePrecision = totalPrecision;
1171 averageRecall = totalRecall;
1176 /* This is the old version that Yu Hu wrote before Alchemist 3.0
1177 I am keeping it here for reference. The last part of compareGoldStdSlot()
1178 was transferred to the new version.
1180 void LinguisticaMainWindow::compareGoldStdSlot()
1182 QString line;
1183 //int FoundLoc;
1184 QString FirstPiece, RemainingPiece;
1185 QString theWord;
1186 CStem* theCStem;
1187 CParse* theParse;
1188 StringToParse* TempSedCuts;
1189 StringToParse::Iterator StringToParseIt;
1190 StringToPtrCStem GoldCuts;
1191 StringToParse SedCuts;
1192 StringToPtrCStem::Iterator GoldStIt;
1193 StringToCStem::Iterator SFIt;
1194 CCorpusWordCollection* TempSFCut;
1195 StringToParse SFCuts;
1196 CCorpusWord* theCorpusWord;
1198 // 1. Read GoldStand File
1200 QString goldFileName;
1202 goldFileName = QFileDialog::getOpenFileName( m_projectDirectory,
1203 "XML Files (*.xml)",
1204 this,
1205 "open file dialog",
1206 "Choose a file to open" );
1208 if( !goldFileName.isEmpty() )
1211 QFile goldFile( goldFileName );
1212 if ( goldFile.open( IO_ReadOnly ) )
1214 QDomDocument doc( "Goldstandard" );
1216 if( !doc.setContent(&goldFile) )
1218 goldFile.close();
1219 return;
1222 QDomElement root = doc.documentElement();
1224 QString rootTagName = root.tagName();
1226 if( root.tagName() != "GDS")
1228 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + rootTagName, "OK" );
1229 goldFile.close();
1230 return;
1233 // Read the header
1234 QDomNode header = root.firstChild();
1236 QDomElement direction = header.nextSibling().toElement();
1238 // Filling into the docinfo
1239 // Read the content
1240 QDomNode contentnode = direction.nextSibling();
1241 QDomElement content = contentnode.toElement();
1243 if( content.tagName() != "content" )
1245 QMessageBox::information( NULL, "Error", "Sorry, the xml file format is not supported", "OK" );
1246 goldFile.close();
1247 return;
1252 QString value;
1253 int supposedtotalnumberofwords;
1254 QString key;
1255 QString wordcomment;
1256 int numberofpieces;
1257 int start;
1258 int type;
1259 int color;
1260 int score;
1261 QString allomorph, comment;
1262 // int Index;
1265 value = content.attribute("number", "100");
1266 supposedtotalnumberofwords = value.toInt();
1268 QDomNode onewordnode = content.firstChild();
1270 while (!onewordnode.isNull())
1272 QDomElement oneword = onewordnode.toElement();
1273 if( !oneword.isNull() )
1275 if( oneword.tagName() == "word" )
1277 key = oneword.attribute( "key", "" );
1278 if (key == "")
1280 onewordnode = onewordnode.nextSibling();
1281 continue;
1284 theWord = key;
1285 theCStem = new CStem(key);
1286 GoldCuts.insert(theWord, theCStem);
1288 value = oneword.attribute("morphemes", "0");
1289 numberofpieces = value.toInt();
1291 if (numberofpieces == 0)
1293 onewordnode = onewordnode.nextSibling();
1294 continue;
1297 wordcomment = oneword.attribute("comment", "");
1300 QDomNode onepiecenode = oneword.firstChild();
1302 while (!onepiecenode.isNull())
1304 QDomElement onepiece = onepiecenode.toElement();
1305 if( !onepiece.isNull() )
1307 if( onepiece.tagName() == "morpheme" )
1309 value = onepiece.attribute("start", "-1");
1310 start = value.toInt();
1311 value =onepiece.attribute("type", "-1");
1312 type = value.toInt();
1315 if ( type == 0)
1317 GoldCuts.remove(theWord);
1318 delete theCStem;
1319 break;
1323 value = onepiece.attribute("color", "-2");
1324 color = value.toInt();
1325 value = onepiece.attribute("score", "0");
1326 score = value.toInt();
1327 allomorph = onepiece.attribute("allomorph", "");
1328 comment = onepiece.attribute("comment", "");
1330 if ( ( start == -1) || (type == -1) || (color == -2))
1332 onepiecenode = onepiecenode.nextSibling();
1333 continue;
1336 theCStem ->CutRightBeforeHere(start);
1339 //m_WordCollection ->ParseOneWord(key, start, type, color, true, score, allomorph, comment, Index, wordcomment);
1345 onepiecenode = onepiecenode.nextSibling();
1352 onewordnode = onewordnode.nextSibling();
1356 else
1358 QMessageBox::information( NULL, "Attention", "Unable to open " + goldFileName + " .", "OK" );
1359 return;
1362 goldFile.close();
1365 else
1367 return;
1370 // Get the Linguistica analysis results (SF or PF)
1372 if ( !m_lexicon) return;
1373 TempSFCut = m_lexicon->GetWords();
1375 TempSFCut->Sort( KEY );
1377 for( int i = 0; i < TempSFCut->GetCount(); i++ )
1379 theCorpusWord = TempSFCut->GetAt(i);
1380 theWord = theCorpusWord->Display();
1382 SFCuts.insert(theWord, theCorpusWord);
1386 // Get SED analyses
1387 if ( m_Words_Templates != NULL)
1389 TempSedCuts = m_Words_Templates ->GetParsedResult();
1390 for ( StringToParseIt = TempSedCuts ->begin(); StringToParseIt != TempSedCuts ->end(); StringToParseIt++)
1392 theWord = StringToParseIt.key();
1393 theParse = StringToParseIt.data();
1394 theCStem = new CStem(*theParse);
1396 SedCuts.insert(theWord, theCStem);
1402 // Goldstandard comparison output
1403 double TotalPrecision;
1404 double TotalRecall;
1405 double AveragePrecision;
1406 double AverageRecall;
1408 double Ftot=0.0;
1409 double Fav=0.0;
1410 QString outs;
1412 ////////////// compute precision recall SF
1414 GetPrecisionRecall(GoldCuts,SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1415 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1416 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1417 // print out precision recall
1418 outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1420 QMessageBox::information ( NULL, "Irinaoutput", outs );
1424 ///////////////////////////////// SED
1425 if ( m_Words_Templates != NULL)
1428 GetPrecisionRecall(GoldCuts,SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1430 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1431 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1433 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1435 QMessageBox::information ( NULL, "Irinaoutput", outs );
1438 // TODO: delete all QMap data that are pointers
1442 void LinguisticaMainWindow::GetPrecisionRecall(StringToCStemList& GoldStM, StringToParse& ResultM, double &TotalPrecision, double &TotalRecall, double &AveragePrecision, double &AverageRecall )
1445 // iterate through the gold standard QMap GoldStM
1446 // look up each word in the results QMap ResultM
1447 // get the CParses for each word from GoldStM and ResultM and compare them
1450 // precision and recall are computed for all cuts
1451 // average precision and recall: precision and recall are computed for each word and then averaged
1453 StringToCStemList::Iterator GoldStIt;
1454 CParse* ReferenceStem;
1455 CParse* ResultStem;
1456 // CParse* ReferenceParse;
1457 // CParse* ResultParse;
1458 int* ReferenceCuts;
1459 int* ResultCuts;
1460 QString Word;
1462 // simultaneously collect the results for individual morphemes
1464 int TotalNumReferenceCuts=0;
1465 int TotalNumResultCuts=0;
1466 int TotalNumRightCuts=0;// use for precision
1467 int TotalNumMissedReferenceCuts=0; // use for recall
1469 AveragePrecision=0;
1470 AverageRecall=0;
1471 TotalPrecision=0;
1472 TotalRecall=0;
1474 int NumWords=0;
1476 for( GoldStIt = GoldStM.begin(); GoldStIt != GoldStM.end();GoldStIt++)
1480 // do for each word
1481 // get the reference and the result cuts
1483 Word = GoldStIt.key();
1484 for( ReferenceStem = GoldStIt.data().first(); ReferenceStem; ReferenceStem = GoldStIt.data().next() )
1486 // For now, we are only handling one version of each word
1487 // string. We need to escape after the first.
1488 if( ReferenceStem != GoldStIt.data().first() )
1490 break;
1493 //ReferenceParse=ReferenceStem->GetStemPtr();
1494 ReferenceCuts=ReferenceStem->GetPieces();
1498 if(ResultM.contains(Word))
1500 NumWords++;
1502 ResultStem=ResultM[Word];
1503 // ResultParse=ResultStem->GetStemPtr();
1504 ResultCuts=ResultStem->GetPieces();
1507 // compare these two analyses
1508 int RefSz=ReferenceStem->Size();
1509 int ResSz=ResultStem->Size();
1512 int RefCount=1;// the first index is always 0
1513 int ResCount=1;// the first index is always 0
1515 int NumRightCuts=0;
1516 int NumMissedCuts=0;
1518 // compare the cuts
1519 // if there is only one morpheme=word in both analyses:
1520 if(RefSz==1&& ResSz==1)
1523 AveragePrecision+=1;
1524 AverageRecall+=1;
1525 TotalNumRightCuts++;
1526 TotalNumReferenceCuts+=(RefSz);
1527 TotalNumResultCuts+=(ResSz);
1529 continue;
1531 else if(RefSz==1) // all cuts are wrong
1534 TotalNumResultCuts+=(ResSz-1);
1535 continue;
1537 else if(ResSz==1)
1539 TotalNumReferenceCuts+=(RefSz-1);
1540 continue;
1543 TotalNumReferenceCuts+=(RefSz-1);
1544 TotalNumResultCuts+=(ResSz-1);
1546 int begin=0;
1547 int end=0;
1549 while(RefCount<(RefSz) && ResCount<(ResSz))
1551 int NextRefIndex=ReferenceCuts[RefCount];
1552 int NextResIndex=ResultCuts[ResCount];
1554 if(NextRefIndex == NextResIndex)// the right cut
1558 NumRightCuts++;
1559 TotalNumRightCuts++;
1561 RefCount++;
1562 ResCount++;
1565 else if(NextRefIndex < NextResIndex)//missed cut
1568 //NumMissedCuts++;
1569 //TotalNumMissedReferenceCuts++;
1571 RefCount++;
1573 else if (NextRefIndex > NextResIndex)//wrong cut
1575 ResCount++;
1579 double WordPrecision=(double)NumRightCuts/(double)(ResSz-1);
1580 double WordRecall=(double)NumRightCuts/(double)(RefSz-1);
1581 AveragePrecision+=WordPrecision;
1582 AverageRecall+=WordRecall;
1589 AveragePrecision/=(double)NumWords;
1590 AverageRecall/=(double)NumWords;
1592 TotalPrecision=(double)TotalNumRightCuts/(double)TotalNumResultCuts;
1593 TotalRecall=(double)TotalNumRightCuts/(double)TotalNumReferenceCuts;