1 // Handling requests to compare the analyzed corpus to a gold standard
2 // Copyright © 2009 The University of Chicago
3 #include "linguisticamainwindow.h"
6 #include <QDomDocument>
10 #include "CorpusWord.h"
12 #include "CorpusWordCollection.h"
13 #include "TemplateCollection.h"
// Map from a CParse pointer to an integer value.
16 typedef QMap
<CParse
*,int> ParseToInt
;
// Map from a string key to a CParse pointer.
17 typedef QMap
<QString
,CParse
*> StringToParse
;
// Map from a string key to a pointer to a ParseToInt map.
18 typedef QMap
<QString
,ParseToInt
*> StringToParseToInt
;
// Map from a string key to a pointer to a StringToInt map.
// (StringToInt is not declared in the visible part of this file.)
19 typedef QMap
<QString
,StringToInt
*> StringToStringToInt
;
// Slot: lets the user pick a new gold standard file.
// Opens a Q3FileDialog (the current m_goldStdFileName is passed as its first
// argument), stores the returned name in m_goldStdFileName and, when that name
// is not empty, writes it to m_Settings under the
// ".../DiagnosticsMenu/NewGoldStdFile" key.
// NOTE(review): m_goldStdFileName is assigned before the isEmpty() check, so
// an empty result (presumably a cancelled dialog -- confirm) replaces the
// previous value.
// NOTE(review): this listing appears to omit some original lines (braces and
// some dialog arguments); the comments describe the visible statements only.
21 void LinguisticaMainWindow::changeGoldStdFileSlot()
23 m_goldStdFileName
= Q3FileDialog::getOpenFileName( m_goldStdFileName
,
26 "Linguistica :: New Gold Standard File",
27 "Choose a new gold standard file" );
// Persist the choice only when a file name was actually returned.
28 if( !m_goldStdFileName
.isEmpty() )
29 m_Settings
.writeEntry( "/linguistica.uchicago.edu/Linguistica/MainWindow/DiagnosticsMenu/NewGoldStdFile", m_goldStdFileName
);
// Slot: compares the compounds of the active mini-lexicon with a gold standard.
// Steps visible in this listing:
//   1. Ask the user for a gold standard XML file whose root element is "GDS"
//      and whose third top-level child is a "content" element holding "word"
//      elements with "morpheme" children.
//   2. Fill goldStd with one CStem per gold standard word, and
//      goldStdCompounds with the words that contain at least two morphemes
//      of type 1.
//   3. Ask the user for an output text file; for each word of the active
//      mini-lexicon, write the word and its gold standard / lexicon compound
//      analyses, and count true/false positives and negatives.
//   4. Write the counts plus precision, recall and F-score to the file.
// NOTE(review): this listing appears to omit many original lines (braces,
// declarations, else branches), so the exact nesting of the statements below
// cannot be confirmed from here.
34 void LinguisticaMainWindow::compareCompoundsSlot()
39 StringToPtrCStem goldStdCompounds, goldStd;
41 QString goldStdFileName;
43 goldStdFileName = QFileDialog::getOpenFileName( m_projectDirectory,
47 "Choose a gold standard:" );
49 if( !goldStdFileName.isEmpty() )
51 QFile goldStdFile( goldStdFileName );
52 if( goldStdFile.open( IO_ReadOnly ) )
54 QDomDocument doc( "Goldstandard" );
56 if( !doc.setContent( &goldStdFile ) )
62 QDomElement root = doc.documentElement();
64 QString tagName = root.tagName();
// The document is only accepted when its root element is named "GDS".
66 if( root.tagName() != "GDS")
68 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
// The first two children of the root are stepped over; the third one must be
// the "content" element.
74 QDomNode header = root.firstChild();
76 QDomElement direction = header.nextSibling().toElement();
79 QDomNode contentnode = direction.nextSibling();
80 QDomElement content = contentnode.toElement();
82 tagName = content.tagName();
83 if( tagName != "content" )
85 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
91 int supposedTotalNumberOfWords;
100 QString comment, allomorph;
// The "number" attribute defaults to "100" when absent.
102 value = content.attribute( "number", "100" );
103 supposedTotalNumberOfWords = value.toInt();
105 QDomNode onewordnode = content.firstChild();
// Walk over every child of "content"; only "word" elements are analysed.
107 while (!onewordnode.isNull())
109 QDomElement oneword = onewordnode.toElement();
110 if( !oneword.isNull() )
112 if( oneword.tagName() == "word" )
114 key = oneword.attribute( "key", "" );
117 onewordnode = onewordnode.nextSibling();
122 pStem = new CStem( key );
123 goldStd.insert( word, pStem );
125 value = oneword.attribute("morphemes", "0");
126 numberOfPieces = value.toInt();
128 // Looking for goldStdCompounds, must be length 2
129 if( numberOfPieces < 2 )
131 onewordnode = onewordnode.nextSibling();
135 wordComment = oneword.attribute( "comment", "" );
137 QDomNode onepiecenode = oneword.firstChild();
139 rootCount = 0; // Need at least two roots to be a compound
141 while( !onepiecenode.isNull() )
143 QDomElement onepiece = onepiecenode.toElement();
144 if( !onepiece.isNull() )
146 if( onepiece.tagName() == "morpheme" )
148 value = onepiece.attribute("start", "-1");
149 start = value.toInt();
150 value = onepiece.attribute("type", "-1");
151 type = value.toInt();
// Presumably words with a type-0 morpheme or a '-' at a morpheme start are
// dropped from both maps -- the enclosing braces are not visible; confirm.
154 if( type == 0 || word[start] == '-' )
156 goldStd.remove( word );
157 goldStdCompounds.remove( word );
158 if( pStem ) delete pStem;
// Type-1 morphemes are counted as roots; from the second root on, the word is
// registered as a gold standard compound.
163 if( type == 1 ) rootCount++;
164 if( rootCount >= 2 ) goldStdCompounds.insert( word, pStem );
166 value = onepiece.attribute("color", "-2");
167 color = value.toInt();
168 value = onepiece.attribute("score", "0");
169 score = value.toInt();
170 allomorph = onepiece.attribute("allomorph", "");
171 comment = onepiece.attribute("comment", "");
175 goldStd.remove( word );
176 goldStdCompounds.remove( word );
177 if( pStem ) delete pStem;
// -1 / -2 are the defaults used above for missing "start", "type" and "color"
// attributes.
182 if( (start == -1) || (type == -1) || (color == -2) )
184 onepiecenode = onepiecenode.nextSibling();
// Record the morpheme boundary in the gold standard stem.
188 pStem->CutRightBeforeHere( start );
192 onepiecenode = onepiecenode.nextSibling();
197 goldStdCompounds.remove( word );
202 onewordnode = onewordnode.nextSibling();
209 QMessageBox::information( NULL, "Attention", "Unable to open " + goldStdFileName + " .", "OK" );
// Lexicon side of the comparison: words and compounds of the active
// mini-lexicon.
215 CMiniLexicon* mini = m_lexicon->GetMiniLexicon( m_lexicon->GetActiveMiniIndex() );
216 CWordCollection* pWords = mini->GetWords();
217 CCompoundCollection* pCompounds = mini->GetCompounds();
218 CStem* pWord, * qWord;
219 CCompound* pCompound;
220 CSuffixCollection* pSuffixes = NULL;
221 CPrefixCollection* pPrefixes = NULL;
224 int truePos = 0, trueNeg = 0, falsePos = 0, falseNeg = 0;
226 QString outputFileName;
// NOTE(review): the output file is chosen with getOpenFileName although it is
// opened for writing below; presumably getSaveFileName was intended so that a
// new file can be named -- confirm.
228 outputFileName = QFileDialog::getOpenFileName( m_projectDirectory,
229 "Text Files (*.txt)",
232 "Choose an output file:" );
234 if( !outputFileName.isEmpty() )
236 QFile outputFile( outputFileName );
237 if( outputFile.open( IO_WriteOnly ) )
239 QTextStream out( &outputFile );
240 out.setEncoding( QTextStream::Unicode );
273 for( int i = 0; i < pWords->GetCount(); i++ )
275 bool notInGS = FALSE;
277 pWord = pWords->GetAtSort(i);
// Look the word up in the compound collection, first by its own key and then
// by the key of its stem when it has one.
279 pCompound = *pCompounds ^= pWord->GetKey();
283 pStem = pWord->GetStemPtr();
284 if( pStem ) pCompound = *pCompounds ^= pStem->GetKey();
287 if( goldStdCompounds.find( pWord->Display() ) != goldStdCompounds.end() )
289 qWord = goldStdCompounds[ pWord->Display() ];
293 if( goldStd.find( pWord->Display() ) == goldStd.end() )
// qWord: compound according to the gold standard; pCompound: compound
// according to the lexicon.
299 if( qWord && pCompound ) truePos++;
300 if( qWord && !pCompound ) falseNeg++;
301 if( !qWord && pCompound ) falsePos++;
// NOTE(review): the same condition is tested on the next two visible lines;
// their bodies are not part of this listing.
303 if( !qWord && !pCompound )
309 if( !qWord && !pCompound )
321 out << pWord->Display();
327 if( qWord ) out << qWord->Display('+');
334 if( pCompound ) out << pCompound->Display('+');
// Search every mini-lexicon for a prefix matching piece 1 of the compound.
343 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
345 mini2 = m_lexicon->GetMiniLexicon(j);
348 pPrefixes = mini2->GetPrefixes();
350 if( pCompound && pPrefixes )
352 pPrefix = (*pPrefixes) ^= pCompound->GetPiece( 1 );
// Same search over the suffix collections (piece 1 is used here as well).
367 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
369 mini2 = m_lexicon->GetMiniLexicon(j);
372 pSuffixes = mini2->GetSuffixes();
374 if( pCompound && pSuffixes )
376 pSuffix = (*pSuffixes) ^= pCompound->GetPiece( 1 );
// NOTE(review): no visible guard against truePos + falsePos == 0,
// truePos + falseNeg == 0 or precision + recall == 0; the floating-point
// divisions below then produce NaN/inf in the report.
390 double precision = (double) truePos / (double)( truePos + falsePos );
391 double recall = (double) truePos / (double)( truePos + falseNeg );
393 out << endl << QString( "True Positive Count = %1" ).arg( truePos ) << endl;
394 out << QString( "True Negative Count = %1" ).arg( trueNeg ) << endl;
395 out << QString( "False Positive Count = %1" ).arg( falsePos ) << endl;
396 out << QString( "False Negative Count = %1" ).arg( falseNeg ) << endl;
397 out << endl << QString( "Precision = %1" ).arg( precision ) << endl;
398 out << QString( "Recall = %1" ).arg( recall ) << endl;
399 out << QString( "F-Score = %1" ).arg( ( 2.0 * precision * recall ) / ( precision + recall ) ) << endl;
// Slot: compares the current analyses with an Alchemist gold standard file.
// Steps visible in this listing:
//   1. Ask the user for a gold standard file and parse it as XML; the root
//      element must be "alchemist-doc".
//   2. Step over the optional "author-data" and "document-data" elements, then
//      expect "feature-list", "morpheme-list" and "word-list" in that order.
//   3. For each "word" element of the word list, build a CStem from its
//      "string" child, cut it at the "start" of each piece found under its
//      "affix"/"root" children, and append it to the list stored in
//      goldStdWords under the word's spelling.
//   4. Build SFCuts from the words of m_lexicon and, when m_Words_Templates is
//      not NULL, SedCuts from its parsed results.
//   5. Run GetMorphPrecisionRecallByWord, GetMorphPrecisionRecall and
//      GetCutPrecisionRecall on each map and show the total precision, total
//      recall and F-score in message boxes.
// NOTE(review): this listing appears to omit many original lines (braces,
// declarations, error branches), so the exact nesting of the statements below
// cannot be confirmed from here.
408 void LinguisticaMainWindow::compareGoldStdSlot()
410 StringToCStemList goldStdWords
;
411 StringToCStemList::Iterator goldStdWordsIt
;
414 QString goldStdFileName
= Q3FileDialog::getOpenFileName( m_projectDirectory
,
418 "Choose a gold standard file to open" );
419 if( !goldStdFileName
.isEmpty() )
422 QFile
goldStdFile( goldStdFileName
);
423 if( goldStdFile
.open( QIODevice::ReadOnly
) )
426 QDomDocument
doc( "Alchemist" ), author_data
, document_data
;
429 int errorLine
, errorColumn
;
// Parse the XML; on failure errorMsg/errorLine/errorColumn are filled in.
430 if( !doc
.setContent( &goldStdFile
, &errorMsg
, &errorLine
, &errorColumn
) )
432 //Maybe we should put this back in.
433 // QMessageBox::warning( this, "Gold Standard : XML Error",
434 // QString( errorMsg + "\nLine: %1" + "Col: %2" ).arg( errorLine ).arg( errorColumn ), QMessageBox::Ok, NULL, NULL );
439 QString feature_name
;
441 QDomElement alchemist_doc
, element
, word
, string
, gloss
,
442 morph
, piece
, notes
, morpheme
, allomorph
,
443 lmnt
, feature
, name
, feature_id
, instance_id
;
445 QDomNode node1
, node2
, node3
, node4
;
452 bool skipWord
= FALSE
;
453 int pieceCount
, lastEnd
, start
, length
;
455 alchemist_doc
= doc
.documentElement();
// Reject documents whose root element is not "alchemist-doc".
457 if( alchemist_doc
.tagName() != "alchemist-doc" )
459 errorMsg
= "The XML document \"" + alchemist_doc
.tagName() + "\" is not an alchemist document.";
460 QMessageBox::information( NULL
, "Gold Standard : XML Error", errorMsg
, "OK" );
465 // Author data (optional)
466 node1
= alchemist_doc
.firstChild();
467 if( !node1
.isNull() && node1
.isElement() && node1
.nodeName() == "author-data" )
470 node1
= node1
.nextSibling();
474 // Document data (optional)
475 if( !node1
.isNull() && node1
.isElement() && node1
.nodeName() == "document-data" )
478 node1
= node1
.nextSibling();
482 // Feature list first of morphology description
483 if( node1
.isNull() || !node1
.isElement() || node1
.nodeName() != "feature-list" )
485 // TODO: add to error string
490 node1
= node1
.nextSibling();
494 // Morpheme list second
495 if( node1
.isNull() || !node1
.isElement() || node1
.nodeName() != "morpheme-list" )
497 // TODO: add to error string
502 node1
= node1
.nextSibling();
506 // Word list last.. this is what we need!
507 if( node1
.isNull() || !node1
.isElement() || node1
.nodeName() != "word-list" )
509 // TODO: add to error string
513 node2
= node1
.firstChild();
// Iterate over the consecutive "word" children of the word list. node2 is
// advanced right after it is read, so the `continue` statements below move on
// to the next word.
515 while( !node2
.isNull() &&
517 node2
.nodeName() == "word" )
519 word
= node2
.toElement();
520 node2
= node2
.nextSibling();
523 if( !word
.hasAttribute( "score" ) )
525 // TODO: add to error string
// Words scored "Not Scored", "Somewhat Certain" or "Uncertain" are skipped.
// NOTE(review): the visible chain has no final else, so any other score value
// appears to be processed like "Certain" -- confirm against the full source.
530 if( word
.attribute( "score" ) == "Not Scored" ) continue;
531 else if( word
.attribute( "score" ) == "Certain" ); // we want to look at these words
532 else if( word
.attribute( "score" ) == "Somewhat Certain" ) continue;
533 else if( word
.attribute( "score" ) == "Uncertain" ) continue;
// The first child of a word must be its "string" element.
537 node3
= word
.firstChild();
538 if( !node3
.isElement() || node3
.nodeName() != "string" )
540 // TODO: add to error message
543 string
= node3
.toElement();
545 // Make new gold standard word
546 strStem
= string
.text();
547 pStem
= new CStem( strStem
);
// An optional "gloss" element after the string is stepped over.
550 node3
= string
.nextSibling();
551 if( node3
.isElement() && node3
.nodeName() == "gloss" )
554 node3
= node3
.nextSibling();
557 // affix, root, and piece elements
559 while( !node3
.isNull() &&
561 ( node3
.nodeName() == "piece" ||
562 node3
.nodeName() == "affix" ||
563 node3
.nodeName() == "root" ) &&
566 if( node3
.nodeName() == "affix" || node3
.nodeName() == "root" )
568 morph
= node3
.toElement();
571 node4
= morph
.firstChild();
572 if( node4
.isElement() && node4
.nodeName() == "string" )
574 string
= node4
.toElement();
576 // no need to do anything with this string
577 node4
= string
.nextSibling();
581 // TODO: add to error string
// Walk over the "piece" children of this affix/root.
587 while( !node4
.isNull() && node4
.isElement() && node4
.nodeName() == "piece" )
589 piece
= node4
.toElement();
592 // Not handling multi-piece morphemes yet
596 delete pStem
; pStem
= NULL
;
// A piece needs both "start" and "length"; otherwise the stem is discarded.
600 if( !piece
.hasAttribute( "start" ) ||
601 !piece
.hasAttribute( "length" ) )
603 // TODO: add to error string
605 delete pStem
; pStem
= NULL
;
609 // Maybe we'll want to handle these differently in the future
610 // so I am leaving the distinction
611 if( morph
.tagName() == "affix" || morph
.tagName() == "root" )
613 start
= piece
.attribute( "start" ).toInt();
614 length
= piece
.attribute( "length" ).toInt();
616 // Not handling overlapping morphemes yet
617 if( start
<= lastEnd
)
620 delete pStem
; pStem
= NULL
;
// Remember the index of the last character covered by this piece and record
// the boundary in the gold standard stem.
623 lastEnd
= start
+ length
- 1;
625 pStem
->CutRightBeforeHere( start
);
628 node4
= node4
.nextSibling();
633 // Word has unassigned pieces, skip...
635 delete pStem
; pStem
= NULL
;
638 node3
= node3
.nextSibling();
641 if( skipWord
) continue;
644 // Add word to gold standard words
// Several analyses may share one spelling: they are appended to the same list.
645 goldStdWordsIt
= goldStdWords
.find( strStem
);
646 if( goldStdWordsIt
== goldStdWords
.end() )
649 goldStdWordsIt
= goldStdWords
.insert( strStem
, stemList
);
650 //goldStdWordsIt.data().setAutoDelete( TRUE ); @@@ fix this, make sure there are no memory leaks created here.
652 goldStdWordsIt
.data().append( pStem
);
// An optional "notes" element may follow; anything after it is unexpected.
656 if( !node3
.isNull() && node3
.isElement() && node3
.nodeName() == "notes" )
659 node3
= node3
.nextSibling();
662 if( !node3
.isNull() )
664 // TODO: add to error string
671 QMessageBox::information( NULL
, "Attention", "Unable to open " + goldStdFileName
+ " .", "OK" );
684 //----------------------------------------------------------------------
688 StringToParse
* TempSedCuts
;
689 StringToParse::Iterator StringToParseIt
;
690 StringToParse SedCuts
;
691 CCorpusWordCollection
* TempSFCut
;
692 StringToParse SFCuts
;
693 CCorpusWord
* theCorpusWord
;
695 // Get the Linguistica analysis results (SF or PF)
696 if ( !m_lexicon
) return;
697 TempSFCut
= m_lexicon
->GetWords();
699 TempSFCut
->Sort( KEY
);
// Index every corpus word of the lexicon by its displayed spelling.
701 for( int i
= 0; i
< TempSFCut
->GetCount(); i
++ )
703 theCorpusWord
= TempSFCut
->GetAt(i
);
704 theWord
= theCorpusWord
->Display();
706 SFCuts
.insert(theWord
, theCorpusWord
);
711 if ( m_Words_Templates
!= NULL
)
713 TempSedCuts
= m_Words_Templates
->GetParsedResult();
714 for ( StringToParseIt
= TempSedCuts
->begin(); StringToParseIt
!= TempSedCuts
->end(); StringToParseIt
++)
716 theWord
= StringToParseIt
.key();
717 theParse
= StringToParseIt
.data();
// NOTE(review): each parse is copied into a CStem allocated with new and
// stored in SedCuts; no matching delete is visible in this listing.
718 theCStem
= new CStem(*theParse
);
720 SedCuts
.insert(theWord
, theCStem
);
726 // Goldstandard comparison output
727 double TotalPrecision
;
729 double AveragePrecision
;
730 double AverageRecall
;
736 ////////////// compute precision recall SF
// NOTE(review): every Ftot/Fav computation below divides by
// (precision + recall) without a visible zero check, and Fav is not used in
// any of the visible messages.
738 GetMorphPrecisionRecallByWord( goldStdWords
, SFCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
739 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
740 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
741 // print out precision recall
742 outs
= QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
744 QMessageBox::information ( NULL
, "SF Morpheme Precision/Recall By Word", outs
);
746 GetMorphPrecisionRecall( goldStdWords
, SFCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
747 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
748 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
749 // print out precision recall
750 outs
= QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
752 QMessageBox::information ( NULL
, "SF Morpheme Precision/Recall", outs
);
754 GetCutPrecisionRecall( goldStdWords
, SFCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
755 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
756 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
757 // print out precision recall
758 outs
= QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
760 QMessageBox::information ( NULL
, "SF Cut Precision/Recall", outs
);
766 ///////////////////////////////// SED
// The same three measures, computed on the parsed results in SedCuts.
767 if ( m_Words_Templates
!= NULL
)
770 GetMorphPrecisionRecallByWord( goldStdWords
, SedCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
772 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
773 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
775 outs
= QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
777 QMessageBox::information ( NULL
, "SED Morpheme Precision/Recall By Word", outs
);
779 GetMorphPrecisionRecall( goldStdWords
, SedCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
781 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
782 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
784 outs
= QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
// NOTE(review): this box reports the GetMorphPrecisionRecall result but
// reuses the "By Word" title of the previous box; the SF counterpart above is
// titled "SF Morpheme Precision/Recall" -- looks like a copy/paste slip,
// confirm before changing the string.
786 QMessageBox::information ( NULL
, "SED Morpheme Precision/Recall By Word", outs
);
789 GetCutPrecisionRecall( goldStdWords
, SedCuts
, TotalPrecision
, TotalRecall
,AveragePrecision
,AverageRecall
);
791 Ftot
=2*TotalPrecision
*TotalRecall
/(TotalPrecision
+TotalRecall
);
792 Fav
=2*AveragePrecision
*AverageRecall
/(AveragePrecision
+AverageRecall
);
794 outs
= QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision
).arg(TotalRecall
).arg(Ftot
);
796 QMessageBox::information ( NULL
, "SED Cut Precision/Recall", outs
);
// Computes morpheme precision and recall word by word.
// Only words present in both goldStdWords and lxaWords are compared.
//   goldStdWords     - gold standard analyses, one list of stems per spelling
//   lxaWords         - analyses to evaluate, keyed by spelling
//   totalPrecision   - out: found morphemes / all Lxa morphemes
//   totalRecall      - out: correct morphemes / all gold standard morphemes
//                      (assigned at the end of the function; its parameter
//                      line is not part of this listing)
//   averagePrecision - out: per-word precision averaged over the compared
//                      Lxa words
//   averageRecall    - out: per-analysis recall averaged over the compared
//                      gold standard analyses
// NOTE(review): this listing appears to omit many original lines (braces,
// declarations, counter updates), so only the visible statements are
// described.
802 void LinguisticaMainWindow::GetMorphPrecisionRecallByWord( StringToCStemList
& goldStdWords
,
803 StringToParse
& lxaWords
,
804 double& totalPrecision
,
806 double& averagePrecision
,
807 double& averageRecall
)
809 CParse
* pGoldStdStem
;
810 int* goldStdStemCuts
;
811 int goldStdStemCutsPos
;
817 int totalNumLxaWordsCompared
= 0;
818 int totalNumGSWordsCompared
= 0;
820 int totalNumLxaMorphemes
= 0;
821 int totalNumGSMorphemes
= 0;
822 int totalNumCorrectMorphemes
= 0;
823 int totalNumFoundMorphemes
= 0; // For precision, see generalization notes below in function
825 averagePrecision
= 0.0;
830 StringToCStemList::Iterator goldStdIt
;
831 for( goldStdIt
= goldStdWords
.begin(); goldStdIt
!= goldStdWords
.end(); goldStdIt
++ )
833 strWord
= goldStdIt
.key();
835 // We only look through words that exist in both spaces
836 if( lxaWords
.find( strWord
) == lxaWords
.end() ) continue;
837 pLxaStem
= lxaWords
.find( strWord
).data();
839 lxaStemCuts
= pLxaStem
->GetPieces();
845 totalNumLxaWordsCompared
++;
847 // There may be duplicates in gold standard, we should consider all
848 for( pGoldStdStem
= goldStdIt
.data().first(); pGoldStdStem
; pGoldStdStem
= goldStdIt
.data().next() )
852 numCorrectMorphemes
= 0;
854 totalNumGSWordsCompared
++;
856 goldStdStemCuts
= pGoldStdStem
->GetPieces();
858 // The word strings should match now...
859 Q_ASSERT( pLxaStem
->Display() == pGoldStdStem
->Display() );
861 // Therefore we can look at the cuts to compare the morphemes
862 lxaStemCutsPos
= goldStdStemCutsPos
= 0;
// Walk both cut arrays in parallel. A morpheme matches when entry [pos] and
// entry [pos + 1] are equal in both arrays.
863 while( lxaStemCutsPos
< pLxaStem
->Size() && goldStdStemCutsPos
< pGoldStdStem
->Size() )
865 if( lxaStemCuts
[ lxaStemCutsPos
] == goldStdStemCuts
[ goldStdStemCutsPos
] &&
866 lxaStemCuts
[ lxaStemCutsPos
+ 1 ] == goldStdStemCuts
[ goldStdStemCutsPos
+ 1 ] )
868 // Morphemes match, increment everything
871 numCorrectMorphemes
++;
873 // Move both positions
875 goldStdStemCutsPos
++;
// Entries at [pos] agree but the entries at [pos + 1] differ.
877 else if( lxaStemCuts
[ lxaStemCutsPos
] == goldStdStemCuts
[ goldStdStemCutsPos
] )
879 if( lxaStemCuts
[ lxaStemCutsPos
+ 1 ] < goldStdStemCuts
[ goldStdStemCutsPos
+ 1 ] )
887 goldStdStemCutsPos
++;
892 if( lxaStemCuts
[ lxaStemCutsPos
] < goldStdStemCuts
[ goldStdStemCutsPos
] )
900 goldStdStemCutsPos
++;
905 // Handle remaining morphemes in either group
906 while( lxaStemCutsPos
< pLxaStem
->Size() )
911 while( goldStdStemCutsPos
< pGoldStdStem
->Size() )
914 goldStdStemCutsPos
++;
// Recall is accumulated once per gold standard analysis.
917 averageRecall
+= ( (double) numCorrectMorphemes
/ (double) numGSMorphemes
);
919 totalNumGSMorphemes
+= numGSMorphemes
;
920 totalNumCorrectMorphemes
+= numCorrectMorphemes
;
924 // Precision generalization: if Lxa finds a morpheme M in a word W, it
925 // gets credit for it if M appears in any of the analyses spelled W.
926 // From John's e-mail to Colin, July 27, 2006
929 int numFoundMorphemes
= 0;
931 while( piece
<= pLxaStem
->Size() )
935 for( pGoldStdStem
= goldStdIt
.data().first(); pGoldStdStem
; pGoldStdStem
= goldStdIt
.data().next() )
937 if( pGoldStdStem
->Contains( pLxaStem
->GetPiece( piece
) ) )
947 totalNumLxaMorphemes
+= numLxaMorphemes
;
948 totalNumFoundMorphemes
+= numFoundMorphemes
;
// Precision is accumulated once per compared Lxa word.
950 averagePrecision
+= ( (double) numFoundMorphemes
/ (double) numLxaMorphemes
);
// NOTE(review): no visible guard against zero denominators here; when no word
// is shared by both maps these divisions yield NaN.
953 averagePrecision
/= (double) totalNumLxaWordsCompared
;
954 averageRecall
/= (double) totalNumGSWordsCompared
;
956 totalPrecision
= (double) totalNumFoundMorphemes
/ (double) totalNumLxaMorphemes
;
957 totalRecall
= (double) totalNumCorrectMorphemes
/ (double) totalNumGSMorphemes
;
// Computes precision and recall over cut positions.
// Only words present in both goldStdWords and lxaWords are compared. For each
// such word the cuts of all its gold standard analyses are merged into one
// list, and each Lxa cut is looked up in that list.
//   goldStdWords     - gold standard analyses, one list of stems per spelling
//   lxaWords         - analyses to evaluate, keyed by spelling
//   totalPrecision   - out: found cuts / all Lxa cuts
//   totalRecall      - out: correct cuts / all gold standard cuts
//                      (assigned at the end of the function; its parameter
//                      line is not part of this listing)
//   averagePrecision - out: per-word precision averaged over the compared
//                      words, with one-piece Lxa words subtracted from the
//                      denominator
//   averageRecall    - out: per-word recall divided by the number of compared
//                      gold standard analyses
// NOTE(review): this listing appears to omit many original lines (braces,
// declarations, counter updates), so only the visible statements are
// described.
961 void LinguisticaMainWindow::GetCutPrecisionRecall( StringToCStemList
& goldStdWords
,
962 StringToParse
& lxaWords
,
963 double& totalPrecision
,
965 double& averagePrecision
,
966 double& averageRecall
)
968 CParse
* pGoldStdStem
;
969 int* goldStdStemCuts
;
970 int goldStdStemCutsPos
;
976 int totalNumLxaWordsCompared
= 0;
977 int totalNumGSWordsCompared
= 0;
979 int totalNumLxaCuts
= 0;
980 int totalNumGSCuts
= 0;
981 int totalNumCorrectCuts
= 0;
982 int totalNumFoundCuts
= 0; // Need different number for precision (using totalNumCorrectCuts for recall)
983 int totalNumOnePieceWords
= 0; // One piece Lxa words are undefined for precision, we need to subtract when
986 averagePrecision
= 0.0;
991 StringToCStemList::Iterator goldStdIt
;
992 for( goldStdIt
= goldStdWords
.begin(); goldStdIt
!= goldStdWords
.end(); goldStdIt
++ )
994 strWord
= goldStdIt
.key();
996 // We only look through words that exist in both spaces
997 if( lxaWords
.find( strWord
) == lxaWords
.end() ) continue;
998 pLxaStem
= lxaWords
.find( strWord
).data();
1000 lxaStemCuts
= pLxaStem
->GetPieces();
1007 totalNumLxaWordsCompared
++;
1009 Q3ValueList
<int> unionOfGSCuts
;
1011 // There may be duplicates in gold standard, we need the union of all their cuts
1012 for( pGoldStdStem
= goldStdIt
.data().first(); pGoldStdStem
; pGoldStdStem
= goldStdIt
.data().next() )
1014 totalNumGSWordsCompared
++;
1016 goldStdStemCuts
= pGoldStdStem
->GetPieces();
1018 // The word strings should match here.
1019 Q_ASSERT( pLxaStem
->Display() == pGoldStdStem
->Display() );
1021 goldStdStemCutsPos
= 0;
// Add each cut of this analysis to the union unless it is already there.
1022 while( goldStdStemCutsPos
< pGoldStdStem
->Size() )
1024 if( unionOfGSCuts
.find( goldStdStemCuts
[ goldStdStemCutsPos
] ) == unionOfGSCuts
.end() )
1026 unionOfGSCuts
.append( goldStdStemCuts
[ goldStdStemCutsPos
] );
1028 goldStdStemCutsPos
++;
// Check every Lxa cut against the union of gold standard cuts.
1033 while( lxaStemCutsPos
< pLxaStem
->Size() )
1037 if( unionOfGSCuts
.find( lxaStemCuts
[ lxaStemCutsPos
] ) != unionOfGSCuts
.end() )
1045 numGSCuts
= unionOfGSCuts
.count();
1047 averageRecall
+= ( (double) numCorrectCuts
/ (double) numGSCuts
);
1049 totalNumGSCuts
+= numGSCuts
;
1050 totalNumCorrectCuts
+= numCorrectCuts
;
// Lxa words made of a single piece are counted separately.
1052 if( pLxaStem
->Size() < 2 )
1054 totalNumOnePieceWords
++;
1056 Q_ASSERT( numFoundCuts
== 1 && numLxaCuts
== 1 );
// Negative counts are clamped to zero.
1060 if( numFoundCuts
< 0 ) numFoundCuts
= 0;
1061 if( numLxaCuts
< 0 ) numLxaCuts
= 0;
1064 totalNumLxaCuts
+= numLxaCuts
;
1065 totalNumFoundCuts
+= numFoundCuts
;
// NOTE(review): the per-word precision uses numCorrectCuts while the totals
// above use numFoundCuts -- confirm this is intended.
1067 if( numLxaCuts
> 0 ) averagePrecision
+= ( (double) numCorrectCuts
/ (double) numLxaCuts
);
// NOTE(review): no visible guard against zero denominators in the four
// divisions below.
1070 averagePrecision
/= (double) ( totalNumLxaWordsCompared
- totalNumOnePieceWords
);
1071 averageRecall
/= (double) totalNumGSWordsCompared
;
1073 totalPrecision
= (double) totalNumFoundCuts
/ (double) totalNumLxaCuts
;
1074 totalRecall
= (double) totalNumCorrectCuts
/ (double) totalNumGSCuts
;
// Computes precision and recall over morpheme types rather than per word.
// For every word present in both goldStdWords and lxaWords, the displayed
// pieces of the Lxa analysis and of all gold standard analyses are collected
// into two duplicate-free string lists. The lists are sorted and compared in
// one merge pass to count the morphemes they share.
//   goldStdWords     - gold standard analyses, one list of stems per spelling
//   lxaWords         - analyses to evaluate, keyed by spelling
//   totalPrecision   - out: shared morphemes / counted Lxa morphemes
//   totalRecall      - out: shared morphemes / counted gold standard morphemes
//   averagePrecision - out: set equal to totalPrecision
//   averageRecall    - out: set equal to totalRecall
// NOTE(review): this listing appears to omit many original lines (braces,
// declarations, iterator increments), so only the visible statements are
// described.
1078 void LinguisticaMainWindow::GetMorphPrecisionRecall( StringToCStemList
& goldStdWords
,
1079 StringToParse
& lxaWords
,
1080 double& totalPrecision
,
1081 double& totalRecall
,
1082 double& averagePrecision
,
1083 double& averageRecall
)
1085 QStringList unionOfGoldStdMorphs
,
1088 QString strWord
, strPiece
;
1090 CParse
* pLxaStem
, * pGoldStdStem
;
1093 totalNumLxaWordsCompared
= 0,
1094 totalNumGSWordsCompared
= 0,
1095 totalNumLxaMorphemes
= 0,
1096 totalNumGSMorphemes
= 0,
1097 totalNumCorrectMorphemes
= 0;
1099 StringToCStemList::Iterator goldStdIt
;
1100 for( goldStdIt
= goldStdWords
.begin(); goldStdIt
!= goldStdWords
.end(); goldStdIt
++ )
1102 strWord
= goldStdIt
.key();
1104 // We only look through words that exist in both spaces
1105 if( lxaWords
.find( strWord
) == lxaWords
.end() ) continue;
1106 pLxaStem
= lxaWords
.find( strWord
).data();
1108 totalNumLxaWordsCompared
++;
// Pieces are numbered from 1 to Size(); add each new Lxa piece to the union.
1110 for( i
= 1; i
<= pLxaStem
->Size(); i
++ )
1112 strPiece
= pLxaStem
->GetPiece(i
).Display();
1113 if( unionOfLxaMorphs
.findIndex( strPiece
) == -1 )
1115 unionOfLxaMorphs
.append( strPiece
);
1119 // There may be duplicates in gold standard, we need the union of all their morphemes
1120 for( pGoldStdStem
= goldStdIt
.data().first(); pGoldStdStem
; pGoldStdStem
= goldStdIt
.data().next() )
1122 totalNumGSWordsCompared
++;
1124 for( i
= 1; i
<= pGoldStdStem
->Size(); i
++ )
1126 strPiece
= pGoldStdStem
->GetPiece(i
).Display();
1127 if( unionOfGoldStdMorphs
.findIndex( strPiece
) == -1 )
1129 unionOfGoldStdMorphs
.append( strPiece
);
// Sort both lists so that they can be compared in a single merge pass.
1135 unionOfLxaMorphs
.sort();
1136 unionOfGoldStdMorphs
.sort();
1138 QStringList::Iterator lxaMorphsIt
= unionOfLxaMorphs
.begin(),
1139 goldStdMorphsIt
= unionOfGoldStdMorphs
.begin();
// NOTE(review): the loop stops as soon as either list is exhausted and no
// handling of the entries left in the other list is visible in this listing
// -- confirm against the full source, since they would not be counted.
1141 while( lxaMorphsIt
!= unionOfLxaMorphs
.end() &&
1142 goldStdMorphsIt
!= unionOfGoldStdMorphs
.end() )
// Same morpheme in both lists: counted as correct and once for each side.
1144 if( *goldStdMorphsIt
== *lxaMorphsIt
)
1146 totalNumCorrectMorphemes
++;
1147 totalNumLxaMorphemes
++;
1148 totalNumGSMorphemes
++;
// The Lxa morpheme sorts first: counted on the Lxa side only.
1153 else if( *goldStdMorphsIt
> *lxaMorphsIt
)
1155 totalNumLxaMorphemes
++;
1159 else // *goldStdMorphsIt < *lxaMorphsIt
1161 totalNumGSMorphemes
++;
// NOTE(review): no visible guard against zero denominators below.
1167 totalPrecision
= (double) totalNumCorrectMorphemes
/ (double) totalNumLxaMorphemes
;
1168 totalRecall
= (double) totalNumCorrectMorphemes
/ (double) totalNumGSMorphemes
;
1170 averagePrecision
= totalPrecision
;
1171 averageRecall
= totalRecall
;
1176 /* This is the old version that Yu Hu wrote before Alchemist 3.0
1177 I am keeping it here for reference. The last part of compareGoldStdSlot()
1178 was transferred to the new version.
1180 void LinguisticaMainWindow::compareGoldStdSlot()
1184 QString FirstPiece, RemainingPiece;
1188 StringToParse* TempSedCuts;
1189 StringToParse::Iterator StringToParseIt;
1190 StringToPtrCStem GoldCuts;
1191 StringToParse SedCuts;
1192 StringToPtrCStem::Iterator GoldStIt;
1193 StringToCStem::Iterator SFIt;
1194 CCorpusWordCollection* TempSFCut;
1195 StringToParse SFCuts;
1196 CCorpusWord* theCorpusWord;
1198 // 1. Read GoldStand File
1200 QString goldFileName;
1202 goldFileName = QFileDialog::getOpenFileName( m_projectDirectory,
1203 "XML Files (*.xml)",
1206 "Choose a file to open" );
1208 if( !goldFileName.isEmpty() )
1211 QFile goldFile( goldFileName );
1212 if ( goldFile.open( IO_ReadOnly ) )
1214 QDomDocument doc( "Goldstandard" );
1216 if( !doc.setContent(&goldFile) )
1222 QDomElement root = doc.documentElement();
1224 QString rootTagName = root.tagName();
1226 if( root.tagName() != "GDS")
1228 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + rootTagName, "OK" );
1234 QDomNode header = root.firstChild();
1236 QDomElement direction = header.nextSibling().toElement();
1238 // Filling into the docinfo
1240 QDomNode contentnode = direction.nextSibling();
1241 QDomElement content = contentnode.toElement();
1243 if( content.tagName() != "content" )
1245 QMessageBox::information( NULL, "Error", "Sorry, the xml file format is not supported", "OK" );
1253 int supposedtotalnumberofwords;
1255 QString wordcomment;
1261 QString allomorph, comment;
1265 value = content.attribute("number", "100");
1266 supposedtotalnumberofwords = value.toInt();
1268 QDomNode onewordnode = content.firstChild();
1270 while (!onewordnode.isNull())
1272 QDomElement oneword = onewordnode.toElement();
1273 if( !oneword.isNull() )
1275 if( oneword.tagName() == "word" )
1277 key = oneword.attribute( "key", "" );
1280 onewordnode = onewordnode.nextSibling();
1285 theCStem = new CStem(key);
1286 GoldCuts.insert(theWord, theCStem);
1288 value = oneword.attribute("morphemes", "0");
1289 numberofpieces = value.toInt();
1291 if (numberofpieces == 0)
1293 onewordnode = onewordnode.nextSibling();
1297 wordcomment = oneword.attribute("comment", "");
1300 QDomNode onepiecenode = oneword.firstChild();
1302 while (!onepiecenode.isNull())
1304 QDomElement onepiece = onepiecenode.toElement();
1305 if( !onepiece.isNull() )
1307 if( onepiece.tagName() == "morpheme" )
1309 value = onepiece.attribute("start", "-1");
1310 start = value.toInt();
1311 value =onepiece.attribute("type", "-1");
1312 type = value.toInt();
1317 GoldCuts.remove(theWord);
1323 value = onepiece.attribute("color", "-2");
1324 color = value.toInt();
1325 value = onepiece.attribute("score", "0");
1326 score = value.toInt();
1327 allomorph = onepiece.attribute("allomorph", "");
1328 comment = onepiece.attribute("comment", "");
1330 if ( ( start == -1) || (type == -1) || (color == -2))
1332 onepiecenode = onepiecenode.nextSibling();
1336 theCStem ->CutRightBeforeHere(start);
1339 //m_WordCollection ->ParseOneWord(key, start, type, color, true, score, allomorph, comment, Index, wordcomment);
1345 onepiecenode = onepiecenode.nextSibling();
1352 onewordnode = onewordnode.nextSibling();
1358 QMessageBox::information( NULL, "Attention", "Unable to open " + goldFileName + " .", "OK" );
1370 // Get the Linguistica analysis results (SF or PF)
1372 if ( !m_lexicon) return;
1373 TempSFCut = m_lexicon->GetWords();
1375 TempSFCut->Sort( KEY );
1377 for( int i = 0; i < TempSFCut->GetCount(); i++ )
1379 theCorpusWord = TempSFCut->GetAt(i);
1380 theWord = theCorpusWord->Display();
1382 SFCuts.insert(theWord, theCorpusWord);
1387 if ( m_Words_Templates != NULL)
1389 TempSedCuts = m_Words_Templates ->GetParsedResult();
1390 for ( StringToParseIt = TempSedCuts ->begin(); StringToParseIt != TempSedCuts ->end(); StringToParseIt++)
1392 theWord = StringToParseIt.key();
1393 theParse = StringToParseIt.data();
1394 theCStem = new CStem(*theParse);
1396 SedCuts.insert(theWord, theCStem);
1402 // Goldstandard comparison output
1403 double TotalPrecision;
1405 double AveragePrecision;
1406 double AverageRecall;
1412 ////////////// compute precision recall SF
1414 GetPrecisionRecall(GoldCuts,SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1415 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1416 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1417 // print out precision recall
1418 outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1420 QMessageBox::information ( NULL, "Irinaoutput", outs );
1424 ///////////////////////////////// SED
1425 if ( m_Words_Templates != NULL)
1428 GetPrecisionRecall(GoldCuts,SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1430 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1431 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1433 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1435 QMessageBox::information ( NULL, "Irinaoutput", outs );
1438 // TODO: delete all QMap data that are pointers
1442 void LinguisticaMainWindow::GetPrecisionRecall(StringToCStemList& GoldStM, StringToParse& ResultM, double &TotalPrecision, double &TotalRecall, double &AveragePrecision, double &AverageRecall )
1445 // iterate through the gold standard QMap GoldStM
1446 // look up each word in the results QMap ResultM
1447 // get the CParses for each word from GoldStM and ResultM and compare them
1450 // precision and recall are computed for all cuts
1451 // average precision and recall: precision and recall are computed for each word and then averaged
1453 StringToCStemList::Iterator GoldStIt;
1454 CParse* ReferenceStem;
1456 // CParse* ReferenceParse;
1457 // CParse* ResultParse;
1462 // simultaneously collect the results for individual morphemes
1464 int TotalNumReferenceCuts=0;
1465 int TotalNumResultCuts=0;
1466 int TotalNumRightCuts=0;// use for precision
1467 int TotalNumMissedReferenceCuts=0; // use for recall
1476 for( GoldStIt = GoldStM.begin(); GoldStIt != GoldStM.end();GoldStIt++)
1481 // get the reference and the result cuts
1483 Word = GoldStIt.key();
1484 for( ReferenceStem = GoldStIt.data().first(); ReferenceStem; ReferenceStem = GoldStIt.data().next() )
1486 // For now, we are only handling one version of each word
1487 // string. We need to escape after the first.
1488 if( ReferenceStem != GoldStIt.data().first() )
1493 //ReferenceParse=ReferenceStem->GetStemPtr();
1494 ReferenceCuts=ReferenceStem->GetPieces();
1498 if(ResultM.contains(Word))
1502 ResultStem=ResultM[Word];
1503 // ResultParse=ResultStem->GetStemPtr();
1504 ResultCuts=ResultStem->GetPieces();
1507 // compare these two analyses
1508 int RefSz=ReferenceStem->Size();
1509 int ResSz=ResultStem->Size();
1512 int RefCount=1;// the first index is always 0
1513 int ResCount=1;// the first index is always 0
1516 int NumMissedCuts=0;
1519 // if there is only one morpheme=word in both analyses:
1520 if(RefSz==1&& ResSz==1)
1523 AveragePrecision+=1;
1525 TotalNumRightCuts++;
1526 TotalNumReferenceCuts+=(RefSz);
1527 TotalNumResultCuts+=(ResSz);
1531 else if(RefSz==1) // all cuts are wrong
1534 TotalNumResultCuts+=(ResSz-1);
1539 TotalNumReferenceCuts+=(RefSz-1);
1543 TotalNumReferenceCuts+=(RefSz-1);
1544 TotalNumResultCuts+=(ResSz-1);
1549 while(RefCount<(RefSz) && ResCount<(ResSz))
1551 int NextRefIndex=ReferenceCuts[RefCount];
1552 int NextResIndex=ResultCuts[ResCount];
1554 if(NextRefIndex == NextResIndex)// the right cut
1559 TotalNumRightCuts++;
1565 else if(NextRefIndex < NextResIndex)//missed cut
1569 //TotalNumMissedReferenceCuts++;
1573 else if (NextRefIndex > NextResIndex)//wrong cut
1579 double WordPrecision=(double)NumRightCuts/(double)(ResSz-1);
1580 double WordRecall=(double)NumRightCuts/(double)(RefSz-1);
1581 AveragePrecision+=WordPrecision;
1582 AverageRecall+=WordRecall;
1589 AveragePrecision/=(double)NumWords;
1590 AverageRecall/=(double)NumWords;
1592 TotalPrecision=(double)TotalNumRightCuts/(double)TotalNumResultCuts;
1593 TotalRecall=(double)TotalNumRightCuts/(double)TotalNumReferenceCuts;