linguisticamainwindow_goldstandard.cpp

   1 // Handling requests to compare the analyzed corpus to a gold standard
   2 // Copyright © 2009 The University of Chicago
   3 #include "linguisticamainwindow.h"
   4
   5 #include <QMessageBox>
   6 #include <QDomDocument>
   7 #include <QDomElement>
   8 #include <Q3ValueList>
   9 #include "Lexicon.h"
  10 #include "CorpusWord.h"
  11 #include "Stem.h"
  12 #include "CorpusWordCollection.h"
  13 #include "TemplateCollection.h"
  14 #include "Typedefs.h"
  15
  16 typedef QMap<CParse*,int> ParseToInt;                    // map: CParse pointer -> int
  17 typedef QMap<QString,CParse*> StringToParse;             // map: QString -> CParse pointer
  18 typedef QMap<QString,ParseToInt*> StringToParseToInt;    // map: QString -> pointer to a ParseToInt map
  19 typedef QMap<QString,StringToInt*> StringToStringToInt;  // map: QString -> pointer to a StringToInt map (StringToInt is not declared in the visible part of this file; presumably from Typedefs.h -- confirm)
  20
  21 void LinguisticaMainWindow::changeGoldStdFileSlot()
  22 {
  23   m_goldStdFileName = Q3FileDialog::getOpenFileName( m_goldStdFileName,
  24                                                     "XML files (*.xml)",
  25                                                     this,
  26                                                     "Linguistica :: New Gold Standard File",
  27                                                     "Choose a new gold standard file" );
  28   if( !m_goldStdFileName.isEmpty() )
  29   m_Settings.writeEntry( "/linguistica.uchicago.edu/Linguistica/MainWindow/DiagnosticsMenu/NewGoldStdFile", m_goldStdFileName );
  30 }
  31
  32
  33
  34 void LinguisticaMainWindow::compareCompoundsSlot()
  35 {
  36 /*
  37         QString                         word;
  38         CStem*                          pStem;
  39         StringToPtrCStem        goldStdCompounds, goldStd;
  40
  41         QString                         goldStdFileName;
  42
  43         goldStdFileName = QFileDialog::getOpenFileName( m_projectDirectory,
  44                                                                                                         "XML Files (*.xml)",
  45                                                                                                         this,
  46                                                                                                         "open file dialog",
  47                                                                                                         "Choose a gold standard:" );
  48
  49         if( !goldStdFileName.isEmpty() )
  50         {
  51                 QFile goldStdFile( goldStdFileName );
  52                 if( goldStdFile.open( IO_ReadOnly ) )
  53                 {
  54                         QDomDocument doc( "Goldstandard" );
  55
  56                         if( !doc.setContent( &goldStdFile ) )
  57                         {
  58                                 goldStdFile.close();
  59                                 return;
  60                         }
  61
  62                         QDomElement root = doc.documentElement();
  63
  64                         QString tagName = root.tagName();
  65
  66                         if( root.tagName() != "GDS")
  67                         {
  68                                 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
  69                                 goldStdFile.close();
  70                                 return;
  71                         }
  72
  73                         // Read the header
  74                         QDomNode header = root.firstChild();
  75
  76                         QDomElement direction = header.nextSibling().toElement();
  77
  78                         // Read all content
  79                         QDomNode contentnode = direction.nextSibling();
  80                         QDomElement content = contentnode.toElement();
  81
  82                         tagName = content.tagName();
  83                         if( tagName != "content" )
  84                         {
  85                                 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + tagName, "OK" );
  86                                 goldStdFile.close();
  87                                 return;
  88                         }
  89
  90                         QString                         value;
  91                         int                                     supposedTotalNumberOfWords;
  92                         QString                         key;
  93                         QString                         wordComment;
  94                         int                                     numberOfPieces;
  95                         int                                     start;
  96                         int                                     type;
  97                         int                                     color;
  98                         int                                     score;
  99                         int                                     rootCount;
 100                         QString                         comment, allomorph;
 101
 102                         value = content.attribute( "number", "100" );
 103                         supposedTotalNumberOfWords = value.toInt();
 104
 105                         QDomNode onewordnode = content.firstChild();
 106
 107                         while (!onewordnode.isNull())
 108                         {
 109                                 QDomElement oneword = onewordnode.toElement();
 110                                 if( !oneword.isNull() )
 111                                 {
 112                                         if( oneword.tagName() == "word" )
 113                                         {
 114                                                 key = oneword.attribute( "key", "" );
 115                                                 if (key == "")
 116                                                 {
 117                                                         onewordnode = onewordnode.nextSibling();
 118                                                         continue;
 119                                                 }
 120
 121                                                 word = key;
 122                                                 pStem = new CStem( key );
 123                                                 goldStd.insert( word, pStem );
 124
 125                                                 value = oneword.attribute("morphemes", "0");
 126                                                 numberOfPieces = value.toInt();
 127
 128                                                 // Looking for goldStdCompounds, must be length 2
 129                                                 if( numberOfPieces < 2 )
 130                                                 {
 131                                                         onewordnode = onewordnode.nextSibling();
 132                                                         continue;
 133                                                 }
 134
 135                                                 wordComment = oneword.attribute( "comment", "" );
 136
 137                                                 QDomNode onepiecenode = oneword.firstChild();
 138
 139                                                 rootCount = 0; // Need at least two roots to be a compound
 140
 141                                                 while( !onepiecenode.isNull() )
 142                                                 {
 143                                                         QDomElement onepiece = onepiecenode.toElement();
 144                                                         if( !onepiece.isNull() )
 145                                                         {
 146                                                                 if( onepiece.tagName() == "morpheme" )
 147                                                                 {
 148                                                                         value = onepiece.attribute("start", "-1");
 149                                                                         start = value.toInt();
 150                                                                         value = onepiece.attribute("type", "-1");
 151                                                                         type = value.toInt();
 152
 153
 154                                                                         if( type == 0 || word[start] == '-' )
 155                                                                         {
 156                                                                                 goldStd.remove( word );
 157                                                                                 goldStdCompounds.remove( word );
 158                                                                                 if( pStem ) delete pStem;
 159                                                                                 pStem = NULL;
 160                                                                                 break;
 161                                                                         }
 162
 163                                                                         if( type == 1 ) rootCount++;
 164                                                                         if( rootCount >= 2 ) goldStdCompounds.insert( word, pStem );
 165
 166                                                                         value = onepiece.attribute("color", "-2");
 167                                                                         color = value.toInt();
 168                                                                         value = onepiece.attribute("score", "0");
 169                                                                         score = value.toInt();
 170                                                                         allomorph = onepiece.attribute("allomorph", "");
 171                                                                         comment = onepiece.attribute("comment", "");
 172
 173                                                                         if( score != 1 )
 174                                                                         {
 175                                                                                 goldStd.remove( word );
 176                                                                                 goldStdCompounds.remove( word );
 177                                                                                 if( pStem ) delete pStem;
 178                                                                                 pStem = NULL;
 179                                                                                 break;
 180                                                                         }
 181
 182                                                                         if( (start == -1) || (type == -1) || (color == -2) )
 183                                                                         {
 184                                                                                 onepiecenode = onepiecenode.nextSibling();
 185                                                                                 continue;
 186                                                                         }
 187
 188                                                                         pStem->CutRightBeforeHere( start );
 189                                                                 }
 190                                                         }
 191
 192                                                         onepiecenode = onepiecenode.nextSibling();
 193                                                 }
 194
 195                                                 if( rootCount < 2 )
 196                                                 {
 197                                                         goldStdCompounds.remove( word );
 198                                                 }
 199                                         }
 200                                 }
 201
 202                                 onewordnode = onewordnode.nextSibling();
 203                         }
 204
 205                         goldStdFile.close();
 206                 }
 207                 else
 208                 {
 209                         QMessageBox::information( NULL, "Attention", "Unable to open " + goldStdFileName + " .", "OK" );
 210                         return;
 211                 }
 212         }
 213         else return;
 214
 215         CMiniLexicon* mini = m_lexicon->GetMiniLexicon( m_lexicon->GetActiveMiniIndex() );
 216         CWordCollection* pWords = mini->GetWords();
 217         CCompoundCollection* pCompounds = mini->GetCompounds();
 218         CStem* pWord, * qWord;
 219         CCompound* pCompound;
 220         CSuffixCollection* pSuffixes = NULL;
 221         CPrefixCollection* pPrefixes = NULL;
 222         CSuffix* pSuffix;
 223         CPrefix* pPrefix;
 224         int truePos = 0, trueNeg = 0, falsePos = 0, falseNeg = 0;
 225
 226         QString outputFileName;
 227
 228         outputFileName = QFileDialog::getOpenFileName( m_projectDirectory,
 229                                                                                                    "Text Files (*.txt)",
 230                                                                                                    this,
 231                                                                                                    "open file dialog",
 232                                                                                                    "Choose an output file:" );
 233
 234         if( !outputFileName.isEmpty() )
 235         {
 236                 QFile outputFile( outputFileName );
 237                 if( outputFile.open( IO_WriteOnly ) )
 238                 {
 239                         QTextStream out( &outputFile );
 240                         out.setEncoding( QTextStream::Unicode );
 241
 242                         pWords->Sort(KEY);
 243
 244                         out.setf(2);
 245
 246                         out.width(20);
 247                         out << "WORD";
 248
 249                         out.width(2);
 250                         out << "  ";
 251
 252                         out.width(20);
 253                         out << "GOLD STD";
 254
 255                         out.width(2);
 256                         out << "  ";
 257
 258                         out.width(20);
 259                         out << "COMPOUND";
 260
 261                         out.width(2);
 262                         out << "  ";
 263
 264                         out.width(1);
 265                         out << "P";
 266
 267                         out.width(2);
 268                         out << "  ";
 269
 270                         out.width(1);
 271                         out << "S" << endl;
 272
 273                         for( int i = 0; i < pWords->GetCount(); i++ )
 274                         {
 275                                 bool notInGS = FALSE;
 276
 277                                 pWord = pWords->GetAtSort(i);
 278
 279                                 pCompound = *pCompounds ^= pWord->GetKey();
 280
 281                                 if( !pCompound )
 282                                 {
 283                                         pStem = pWord->GetStemPtr();
 284                                         if( pStem ) pCompound = *pCompounds ^= pStem->GetKey();
 285                                 }
 286
 287                                 if( goldStdCompounds.find( pWord->Display() ) != goldStdCompounds.end() )
 288                                 {
 289                                         qWord = goldStdCompounds[ pWord->Display() ];
 290                                 }
 291                                 else qWord = NULL;
 292
 293                                 if( goldStd.find( pWord->Display() ) == goldStd.end() )
 294                                 {
 295                                         notInGS = TRUE;
 296                                 }
 297                                 else
 298                                 {
 299                                         if( qWord && pCompound ) truePos++;
 300                                         if( qWord && !pCompound ) falseNeg++;
 301                                         if( !qWord && pCompound ) falsePos++;
 302
 303                                         if( !qWord && !pCompound )
 304                                         {
 305                                                 trueNeg++;
 306                                         }
 307                                 }
 308
 309                                 if( !qWord && !pCompound )
 310                                 {
 311                                         continue;
 312                                 }
 313
 314                                 if( notInGS )
 315                                 {
 316                                         out.width(3);
 317                                         out << "*  ";
 318                                 }
 319
 320                                 out.width(20);
 321                                 out << pWord->Display();
 322
 323                                 out.width(2);
 324                                 out << "  ";
 325
 326                                 out.width(20);
 327                                 if( qWord ) out << qWord->Display('+');
 328                                 else out << " ";
 329
 330                                 out.width(2);
 331                                 out << "  ";
 332
 333                                 out.width(20);
 334                                 if( pCompound ) out << pCompound->Display('+');
 335                                 else out << " ";
 336
 337                                 out.width(2);
 338                                 out << "  ";
 339
 340                                 out.width(1);
 341                                 CMiniLexicon* mini2;
 342                                 int j;
 343                                 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
 344                                 {
 345                                         mini2 = m_lexicon->GetMiniLexicon(j);
 346                                         if( mini2 )
 347                                         {
 348                                                 pPrefixes = mini2->GetPrefixes();
 349
 350                                                 if( pCompound && pPrefixes )
 351                                                 {
 352                                                         pPrefix = (*pPrefixes) ^= pCompound->GetPiece( 1 );
 353
 354                                                         if( pPrefix )
 355                                                         {
 356                                                                 out << "*";
 357                                                                 break;
 358                                                         }
 359                                                 }
 360                                         }
 361                                 }
 362
 363                                 out.width(2);
 364                                 out << "  ";
 365
 366                                 out.width(1);
 367                                 for( j = 0; j < m_lexicon->GetMiniSize(); j++ )
 368                                 {
 369                                         mini2 = m_lexicon->GetMiniLexicon(j);
 370                                         if( mini2 )
 371                                         {
 372                                                 pSuffixes = mini2->GetSuffixes();
 373
 374                                                 if( pCompound && pSuffixes )
 375                                                 {
 376                                                         pSuffix = (*pSuffixes) ^= pCompound->GetPiece( 1 );
 377
 378                                                         if( pSuffix )
 379                                                         {
 380                                                                 out << "*";
 381                                                                 break;
 382                                                         }
 383                                                 }
 384                                         }
 385                                 }
 386
 387                                 out << endl;
 388                         }
 389
 390                         double precision = (double) truePos / (double)( truePos + falsePos );
 391                         double recall = (double) truePos / (double)( truePos + falseNeg );
 392
 393                         out << endl << QString( "True Positive Count = %1" ).arg( truePos ) << endl;
 394                         out << QString( "True Negative Count = %1" ).arg( trueNeg ) << endl;
 395                         out << QString( "False Positive Count = %1" ).arg( falsePos ) << endl;
 396                         out << QString( "False Negative Count = %1" ).arg( falseNeg ) << endl;
 397                         out << endl << QString( "Precision = %1" ).arg( precision ) << endl;
 398                         out << QString( "Recall = %1" ).arg( recall ) << endl;
 399                         out << QString( "F-Score = %1" ).arg( ( 2.0 * precision * recall ) / ( precision + recall ) ) << endl;
 400
 401                         outputFile.close();
 402                 }
 403         }
 404 */
 405 }
 406
 407
 408 void LinguisticaMainWindow::compareGoldStdSlot()
 409 {
 410         StringToCStemList                       goldStdWords;
 411         StringToCStemList::Iterator     goldStdWordsIt;
 412         CStemList                                       stemList;
 413
 414         QString goldStdFileName = Q3FileDialog::getOpenFileName( m_projectDirectory,
 415                                                                                                                         "XML Files (*.xml)",
 416                                                                                                                         this,
 417                                                                                                                         "open file dialog",
 418                                                                                                                         "Choose a gold standard file to open" );
 419         if( !goldStdFileName.isEmpty() )
 420         {
 421
 422                 QFile goldStdFile( goldStdFileName );
 423                 if( goldStdFile.open( QIODevice::ReadOnly ) )
 424                 {
 425
 426                         QDomDocument doc( "Alchemist" ), author_data, document_data;
 427
 428                         QString errorMsg;
 429                         int errorLine, errorColumn;
 430                         if( !doc.setContent( &goldStdFile, &errorMsg, &errorLine, &errorColumn ) )
 431                         {
 432 //Maybe we should put this back in.
 433 //                              QMessageBox::warning( this, "Gold Standard : XML Error",
 434 //                                QString( errorMsg + "\nLine: %1" + "Col: %2" ).arg( errorLine ).arg( errorColumn ), QMessageBox::Ok, NULL, NULL );
 435
 436                                 return;
 437                         }
 438
 439                         QString feature_name;
 440
 441                         QDomElement alchemist_doc, element, word, string, gloss,
 442                                                 morph, piece, notes, morpheme, allomorph,
 443                                                 lmnt, feature, name, feature_id, instance_id;
 444                         QDomNodeList nodes;
 445                         QDomNode node1, node2, node3, node4;
 446                         QDomText text;
 447
 448                         CStem* pStem;
 449
 450                         QString strStem;
 451
 452                         bool skipWord = FALSE;
 453                         int pieceCount, lastEnd, start, length;
 454
 455                         alchemist_doc = doc.documentElement();
 456
 457                         if( alchemist_doc.tagName() != "alchemist-doc" )
 458                         {
 459                                 errorMsg = "The XML document \"" + alchemist_doc.tagName() + "\" is not an alchemist document.";
 460                                 QMessageBox::information( NULL, "Gold Standard : XML Error", errorMsg, "OK" );
 461                                 return;
 462                         }
 463
 464
 465                         // Author data (optional)
 466                         node1 = alchemist_doc.firstChild();
 467                         if( !node1.isNull() && node1.isElement() && node1.nodeName() == "author-data" )
 468                         {
 469                                 // Skip...
 470                                 node1 = node1.nextSibling();
 471                         }
 472
 473
 474                         // Document data (optional)
 475                         if( !node1.isNull() && node1.isElement() && node1.nodeName() == "document-data" )
 476                         {
 477                                 // Skip...
 478                                 node1 = node1.nextSibling();
 479                         }
 480
 481
 482                         // Feature list first of morphology description
 483                         if( node1.isNull() || !node1.isElement() || node1.nodeName() != "feature-list" )
 484                         {
 485                                 // TODO: add to error string
 486                         }
 487                         else
 488                         {
 489                                 // Skip...
 490                                 node1 = node1.nextSibling();
 491                         }
 492
 493
 494                         // Morpheme list second
 495                         if( node1.isNull() || !node1.isElement() || node1.nodeName() != "morpheme-list" )
 496                         {
 497                                 // TODO: add to error string
 498                         }
 499                         else
 500                         {
 501                                 // Skip...
 502                                 node1 = node1.nextSibling();
 503                         }
 504
 505
 506                         // Word list last.. this is what we need!
 507                         if( node1.isNull() || !node1.isElement() || node1.nodeName() != "word-list" )
 508                         {
 509                                 // TODO: add to error string
 510                         }
 511                         else
 512                         {
 513                                 node2 = node1.firstChild();
 514
 515                                 while( !node2.isNull() &&
 516                                            node2.isElement() &&
 517                                            node2.nodeName() == "word" )
 518                                 {
 519                                         word = node2.toElement();
 520                                         node2 = node2.nextSibling();
 521
 522                                         // score attribute
 523                                         if( !word.hasAttribute( "score" ) )
 524                                         {
 525                                                 // TODO: add to error string
 526                                                 continue;
 527                                         }
 528                                         else
 529                                         {
 530                                                 if( word.attribute( "score" ) == "Not Scored" ) continue;
 531                                                 else if( word.attribute( "score" ) == "Certain" ); // we want to look at these words
 532                                                 else if( word.attribute( "score" ) == "Somewhat Certain" ) continue;
 533                                                 else if( word.attribute( "score" ) == "Uncertain" ) continue;
 534                                         }
 535
 536                                         // string element
 537                                         node3 = word.firstChild();
 538                                         if( !node3.isElement() || node3.nodeName() != "string" )
 539                                         {
 540                                                 // TODO: add to error message
 541                                                 continue;
 542                                         }
 543                                         string = node3.toElement();
 544
 545                                         // Make new gold standard word
 546                                         strStem = string.text();
 547                                         pStem = new CStem( strStem );
 548
 549                                         // gloss element
 550                                         node3 = string.nextSibling();
 551                                         if( node3.isElement() && node3.nodeName() == "gloss" )
 552                                         {
 553                                                 // Skip...
 554                                                 node3 = node3.nextSibling();
 555                                         }
 556
 557                                         // affix, root, and piece elements
 558                                         skipWord = FALSE;
 559                                         while( !node3.isNull() &&
 560                                                    node3.isElement() &&
 561                                                    ( node3.nodeName() == "piece" ||
 562                                                          node3.nodeName() == "affix" ||
 563                                                          node3.nodeName() == "root" ) &&
 564                                                    !skipWord )
 565                                         {
 566                                                 if( node3.nodeName() == "affix" || node3.nodeName() == "root" )
 567                                                 {
 568                                                         morph = node3.toElement();
 569
 570                                                         // string element
 571                                                         node4 = morph.firstChild();
 572                                                         if( node4.isElement() && node4.nodeName() == "string" )
 573                                                         {
 574                                                                 string = node4.toElement();
 575
 576                                                                 // no need to do anything with this string
 577                                                                 node4 = string.nextSibling();
 578                                                         }
 579                                                         else
 580                                                         {
 581                                                                 // TODO: add to error string
 582                                                         }
 583
 584                                                         // piece elements
 585                                                         pieceCount = 0;
 586                                                         lastEnd = -1;
 587                                                         while( !node4.isNull() && node4.isElement() && node4.nodeName() == "piece" )
 588                                                         {
 589                                                                 piece = node4.toElement();
 590                                                                 pieceCount++;
 591
 592                                                                 // Not handling multi-piece morphemes yet
 593                                                                 if( pieceCount > 1 )
 594                                                                 {
 595                                                                         skipWord = TRUE;
 596                                                                         delete pStem; pStem = NULL;
 597                                                                         break;
 598                                                                 }
 599
 600                                                                 if( !piece.hasAttribute( "start" ) ||
 601                                                                         !piece.hasAttribute( "length" ) )
 602                                                                 {
 603                                                                         // TODO: add to error string
 604                                                                         skipWord = TRUE;
 605                                                                         delete pStem; pStem = NULL;
 606                                                                         break;
 607                                                                 }
 608
 609                                                                 // Maybe we'll want to handle these differently in the future
 610                                                                 // so I am leaving the distinction
 611                                                                 if( morph.tagName() == "affix" || morph.tagName() == "root" )
 612                                                                 {
 613                                                                         start = piece.attribute( "start" ).toInt();
 614                                                                         length = piece.attribute( "length" ).toInt();
 615
 616                                                                         // Not handling overlapping morphemes yet
 617                                                                         if( start <= lastEnd )
 618                                                                         {
 619                                                                                 skipWord = TRUE;
 620                                                                                 delete pStem; pStem = NULL;
 621                                                                                 break;
 622                                                                         }
 623                                                                         lastEnd = start + length - 1;
 624
 625                                                                         pStem->CutRightBeforeHere( start );
 626                                                                 }
 627
 628                                                                 node4 = node4.nextSibling();
 629                                                         }
 630                                                 }
 631                                                 else
 632                                                 {
 633                                                         // Word has unassigned pieces, skip...
 634                                                         skipWord = TRUE;
 635                                                         delete pStem; pStem = NULL;
 636                                                 }
 637
 638                                                 node3 = node3.nextSibling();
 639                                         }
 640
 641                                         if( skipWord ) continue;
 642
 643
 644                                         // Add word to gold standard words
 645                                         goldStdWordsIt = goldStdWords.find( strStem );
 646                                         if( goldStdWordsIt == goldStdWords.end() )
 647                                         {
 648                                                 // New word...
 649                                                 goldStdWordsIt = goldStdWords.insert( strStem, stemList );
 650                                                 //goldStdWordsIt.data().setAutoDelete( TRUE );    @@@ fix this, make sure there are no memory leaks created here.
 651                                         }
 652                                         goldStdWordsIt.data().append( pStem );
 653
 654
 655                                         // notes element
 656                                         if( !node3.isNull() && node3.isElement() && node3.nodeName() == "notes" )
 657                                         {
 658                                                 // Skip...
 659                                                 node3 = node3.nextSibling();
 660                                         }
 661
 662                                         if( !node3.isNull() )
 663                                         {
 664                                                 // TODO: add to error string
 665                                         }
 666                                 }
 667                         }
 668                 }
 669                 else
 670                 {
 671                         QMessageBox::information( NULL, "Attention", "Unable to open " + goldStdFileName + " .", "OK" );
 672                         return;
 673                 }
 674
 675                 goldStdFile.close();
 676         }
 677         else
 678         {
 679                 return;
 680         }
 681
 682
 683 // Yu Hu's code
 684 //----------------------------------------------------------------------
 685         QString                                                         theWord;
 686         CStem*                                                          theCStem;
 687         CParse*                                                         theParse;
 688         StringToParse*                                          TempSedCuts;
 689         StringToParse::Iterator                         StringToParseIt;
 690         StringToParse                                           SedCuts;
 691         CCorpusWordCollection*                          TempSFCut;
 692         StringToParse                                           SFCuts;
 693         CCorpusWord*                                            theCorpusWord;
 694
 695         // Get the Lingustica analyses result SF or PF
 696         if ( !m_lexicon) return;
 697         TempSFCut = m_lexicon->GetWords();
 698
 699         TempSFCut->Sort( KEY );
 700
 701         for( int i = 0; i < TempSFCut->GetCount(); i++ )
 702         {
 703                 theCorpusWord = TempSFCut->GetAt(i);
 704                 theWord = theCorpusWord->Display();
 705
 706                 SFCuts.insert(theWord, theCorpusWord);
 707
 708         }
 709
 710         // Get SED analyses
 711         if ( m_Words_Templates != NULL)
 712         {
 713                 TempSedCuts = m_Words_Templates ->GetParsedResult();
 714                 for ( StringToParseIt = TempSedCuts ->begin(); StringToParseIt != TempSedCuts ->end();  StringToParseIt++)
 715                 {
 716                         theWord = StringToParseIt.key();
 717                         theParse = StringToParseIt.data();
 718                         theCStem = new CStem(*theParse);
 719
 720                         SedCuts.insert(theWord, theCStem);
 721                 }
 722
 723
 724         }
 725
 726         // Goldstandard comparison output
 727         double TotalPrecision;
 728         double TotalRecall;
 729         double AveragePrecision;
 730         double AverageRecall;
 731
 732         double Ftot=0.0;
 733         double Fav=0.0;
 734         QString                                 outs;
 735
 736         ////////////// compute precision recall         SF
 737
 738         GetMorphPrecisionRecallByWord( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 739         Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 740         Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 741         // print out precision recall
 742         outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 743
 744         QMessageBox::information ( NULL, "SF Morpheme Precision/Recall By Word", outs );
 745
 746         GetMorphPrecisionRecall( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 747         Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 748         Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 749         // print out precision recall
 750         outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 751
 752         QMessageBox::information ( NULL, "SF Morpheme Precision/Recall", outs );
 753
 754         GetCutPrecisionRecall( goldStdWords, SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 755         Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 756         Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 757         // print out precision recall
 758         outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 759
 760         QMessageBox::information ( NULL, "SF Cut Precision/Recall", outs );
 761
 762
 763
 764
 765
 766         ///////////////////////////////// SED
 767         if ( m_Words_Templates != NULL)
 768         {
 769
 770                 GetMorphPrecisionRecallByWord( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 771
 772                 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 773                 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 774
 775                 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 776
 777                 QMessageBox::information ( NULL, "SED Morpheme Precision/Recall By Word", outs );
 778
 779                 GetMorphPrecisionRecall( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 780
 781                 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 782                 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 783
 784                 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 785
 786                 QMessageBox::information ( NULL, "SED Morpheme Precision/Recall By Word", outs );
 787
 788
 789                 GetCutPrecisionRecall( goldStdWords, SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
 790
 791                 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
 792                 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
 793
 794                 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
 795
 796                 QMessageBox::information ( NULL, "SED Cut Precision/Recall", outs );
 797         }
 798 }
 799
 800
 801
 802 void LinguisticaMainWindow::GetMorphPrecisionRecallByWord( StringToCStemList& goldStdWords,
 803                                                                                                                    StringToParse& lxaWords,
 804                                                                                                                    double& totalPrecision,
 805                                                                                                                    double& totalRecall,
 806                                                                                                                    double& averagePrecision,
 807                                                                                                                    double& averageRecall )
 808 {
 809         CParse* pGoldStdStem;
 810         int*    goldStdStemCuts;
 811         int     goldStdStemCutsPos;
 812
 813         CParse* pLxaStem;
 814         int*    lxaStemCuts;
 815         int     lxaStemCutsPos;
 816
 817         int     totalNumLxaWordsCompared = 0;
 818         int     totalNumGSWordsCompared = 0;
 819
 820         int     totalNumLxaMorphemes = 0;
 821         int     totalNumGSMorphemes = 0;
 822         int     totalNumCorrectMorphemes = 0;
 823         int     totalNumFoundMorphemes = 0;             // For precision, see generalization notes below in function
 824
 825                 averagePrecision = 0.0;
 826                 averageRecall = 0.0;
 827
 828         QString strWord;
 829
 830         StringToCStemList::Iterator goldStdIt;
 831         for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
 832         {
 833                 strWord = goldStdIt.key();
 834
 835                 // We only look through words that exist in both spaces
 836                 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
 837                 pLxaStem = lxaWords.find( strWord ).data();
 838
 839                 lxaStemCuts = pLxaStem->GetPieces();
 840
 841                 int numLxaMorphemes,
 842                         numGSMorphemes,
 843                         numCorrectMorphemes;
 844
 845                 totalNumLxaWordsCompared++;
 846
 847                 // There may be duplicates in gold standard, we should consider all
 848                 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
 849                 {
 850                         numLxaMorphemes = 0;
 851                         numGSMorphemes = 0;
 852                         numCorrectMorphemes = 0;
 853
 854                         totalNumGSWordsCompared++;
 855
 856                         goldStdStemCuts = pGoldStdStem->GetPieces();
 857
 858                         // The word strings should match now...
 859                         Q_ASSERT( pLxaStem->Display() == pGoldStdStem->Display() );
 860
 861                         // Therefore we can look at the cuts to compare the morphemes
 862                         lxaStemCutsPos = goldStdStemCutsPos = 0;
 863                         while( lxaStemCutsPos < pLxaStem->Size() && goldStdStemCutsPos < pGoldStdStem->Size() )
 864                         {
 865                                 if( lxaStemCuts[ lxaStemCutsPos ] == goldStdStemCuts[ goldStdStemCutsPos ] &&
 866                                         lxaStemCuts[ lxaStemCutsPos + 1 ] == goldStdStemCuts[ goldStdStemCutsPos + 1 ] )
 867                                 {
 868                                         // Morphemes match, increment everything
 869                                         numLxaMorphemes++;
 870                                         numGSMorphemes++;
 871                                         numCorrectMorphemes++;
 872
 873                                         // Move both positions
 874                                         lxaStemCutsPos++;
 875                                         goldStdStemCutsPos++;
 876                                 }
 877                                 else if( lxaStemCuts[ lxaStemCutsPos ] == goldStdStemCuts[ goldStdStemCutsPos ] )
 878                                 {
 879                                         if( lxaStemCuts[ lxaStemCutsPos + 1 ] < goldStdStemCuts[ goldStdStemCutsPos + 1 ] )
 880                                         {
 881                                                 numLxaMorphemes++;
 882                                                 lxaStemCutsPos++;
 883                                         }
 884                                         else
 885                                         {
 886                                                 numGSMorphemes++;
 887                                                 goldStdStemCutsPos++;
 888                                         }
 889                                 }
 890                                 else
 891                                 {
 892                                         if( lxaStemCuts[ lxaStemCutsPos ] < goldStdStemCuts[ goldStdStemCutsPos ] )
 893                                         {
 894                                                 numLxaMorphemes++;
 895                                                 lxaStemCutsPos++;
 896                                         }
 897                                         else
 898                                         {
 899                                                 numGSMorphemes++;
 900                                                 goldStdStemCutsPos++;
 901                                         }
 902                                 }
 903                         }
 904
 905                         // Handle remaining morphemes in either group
 906                         while( lxaStemCutsPos < pLxaStem->Size() )
 907                         {
 908                                 numLxaMorphemes++;
 909                                 lxaStemCutsPos++;
 910                         }
 911                         while( goldStdStemCutsPos < pGoldStdStem->Size() )
 912                         {
 913                                 numGSMorphemes++;
 914                                 goldStdStemCutsPos++;
 915                         }
 916
 917                         averageRecall += ( (double) numCorrectMorphemes / (double) numGSMorphemes );
 918
 919                         totalNumGSMorphemes += numGSMorphemes;
 920                         totalNumCorrectMorphemes += numCorrectMorphemes;
 921                 }
 922
 923
 924                 // Precision generalization: if Lxa finds a morpheme M in a word W, it
 925                 // gets credit for it if M appears in any of the analyses spelled W.
 926                 // From John's e-mail to Colin, July 27, 2006
 927
 928                 numLxaMorphemes = 0;
 929                 int numFoundMorphemes = 0;
 930                 int piece = 1;
 931                 while( piece <= pLxaStem->Size() )
 932                 {
 933                         numLxaMorphemes++;
 934
 935                         for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
 936                         {
 937                                 if( pGoldStdStem->Contains( pLxaStem->GetPiece( piece ) ) )
 938                                 {
 939                                         numFoundMorphemes++;
 940                                         break;
 941                                 }
 942                         }
 943
 944                         piece++;
 945                 }
 946
 947                 totalNumLxaMorphemes += numLxaMorphemes;
 948                 totalNumFoundMorphemes += numFoundMorphemes;
 949
 950                 averagePrecision += ( (double) numFoundMorphemes / (double) numLxaMorphemes );
 951         }
 952
 953         averagePrecision /= (double) totalNumLxaWordsCompared;
 954         averageRecall /= (double) totalNumGSWordsCompared;
 955
 956         totalPrecision = (double) totalNumFoundMorphemes / (double) totalNumLxaMorphemes;
 957         totalRecall = (double) totalNumCorrectMorphemes / (double) totalNumGSMorphemes;
 958 }
 959
 960
 961 void LinguisticaMainWindow::GetCutPrecisionRecall( StringToCStemList& goldStdWords,
 962                                                                                                    StringToParse& lxaWords,
 963                                                                                                    double& totalPrecision,
 964                                                                                                    double& totalRecall,
 965                                                                                                    double& averagePrecision,
 966                                                                                                    double& averageRecall )
 967 {
 968         CParse* pGoldStdStem;
 969         int*    goldStdStemCuts;
 970         int             goldStdStemCutsPos;
 971
 972         CParse* pLxaStem;
 973         int*    lxaStemCuts;
 974         int             lxaStemCutsPos;
 975
 976         int totalNumLxaWordsCompared = 0;
 977         int totalNumGSWordsCompared = 0;
 978
 979         int totalNumLxaCuts = 0;
 980         int totalNumGSCuts = 0;
 981         int totalNumCorrectCuts = 0;
 982         int totalNumFoundCuts = 0;              // Need different number for precision (using totalNumCorrectCuts for recall)
 983         int totalNumOnePieceWords = 0;  // One piece Lxa words are undefined for precision, we need to subtract when
 984                                                                         // totalling
 985
 986         averagePrecision = 0.0;
 987         averageRecall = 0.0;
 988
 989         QString strWord;
 990
 991         StringToCStemList::Iterator goldStdIt;
 992         for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
 993         {
 994                 strWord = goldStdIt.key();
 995
 996                 // We only look through words that exist in both spaces
 997                 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
 998                 pLxaStem = lxaWords.find( strWord ).data();
 999
1000                 lxaStemCuts = pLxaStem->GetPieces();
1001
1002                 int numLxaCuts = 0,
1003                         numGSCuts = 0,
1004                         numCorrectCuts = 0,
1005                         numFoundCuts = 0;
1006
1007                 totalNumLxaWordsCompared++;
1008
1009                 Q3ValueList<int> unionOfGSCuts;
1010
1011                 // There may be duplicates in gold standard, we need the union of all their cuts
1012                 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
1013                 {
1014                         totalNumGSWordsCompared++;
1015
1016                         goldStdStemCuts = pGoldStdStem->GetPieces();
1017
1018                         // The word strings should match here.
1019                         Q_ASSERT( pLxaStem->Display() == pGoldStdStem->Display() );
1020
1021                         goldStdStemCutsPos = 0;
1022                         while( goldStdStemCutsPos < pGoldStdStem->Size() )
1023                         {
1024                                 if( unionOfGSCuts.find( goldStdStemCuts[ goldStdStemCutsPos ] ) == unionOfGSCuts.end() )
1025                                 {
1026                                         unionOfGSCuts.append( goldStdStemCuts[ goldStdStemCutsPos ] );
1027                                 }
1028                                 goldStdStemCutsPos++;
1029                         }
1030                 }
1031
1032                 lxaStemCutsPos = 0;
1033                 while( lxaStemCutsPos < pLxaStem->Size() )
1034                 {
1035                         numLxaCuts++;
1036
1037                         if( unionOfGSCuts.find( lxaStemCuts[ lxaStemCutsPos ] ) != unionOfGSCuts.end() )
1038                         {
1039                                 numCorrectCuts++;
1040                                 numFoundCuts++;
1041                         }
1042                         lxaStemCutsPos++;
1043                 }
1044
1045                 numGSCuts = unionOfGSCuts.count();
1046
1047                 averageRecall += ( (double) numCorrectCuts / (double) numGSCuts );
1048
1049                 totalNumGSCuts += numGSCuts;
1050                 totalNumCorrectCuts += numCorrectCuts;
1051
1052                 if( pLxaStem->Size() < 2 )
1053                 {
1054                         totalNumOnePieceWords++;
1055
1056                         Q_ASSERT( numFoundCuts == 1 && numLxaCuts == 1 );
1057                         numFoundCuts--;
1058                         numLxaCuts--;
1059
1060                         if( numFoundCuts < 0 ) numFoundCuts = 0;
1061                         if( numLxaCuts < 0 ) numLxaCuts = 0;
1062                 }
1063
1064                 totalNumLxaCuts += numLxaCuts;
1065                 totalNumFoundCuts += numFoundCuts;
1066
1067                 if( numLxaCuts > 0 ) averagePrecision += ( (double) numCorrectCuts / (double) numLxaCuts );
1068         }
1069
1070         averagePrecision /= (double) ( totalNumLxaWordsCompared - totalNumOnePieceWords );
1071         averageRecall /= (double) totalNumGSWordsCompared;
1072
1073         totalPrecision = (double) totalNumFoundCuts / (double) totalNumLxaCuts;
1074         totalRecall = (double) totalNumCorrectCuts / (double) totalNumGSCuts;
1075 }
1076
1077
1078 void LinguisticaMainWindow::GetMorphPrecisionRecall( StringToCStemList& goldStdWords,
1079                                                                                                          StringToParse& lxaWords,
1080                                                                                                          double& totalPrecision,
1081                                                                                                          double& totalRecall,
1082                                                                                                          double& averagePrecision,
1083                                                                                                          double& averageRecall )
1084 {
1085         QStringList unionOfGoldStdMorphs,
1086                                 unionOfLxaMorphs;
1087
1088         QString strWord, strPiece;
1089
1090         CParse* pLxaStem, * pGoldStdStem;
1091
1092         int i,
1093                 totalNumLxaWordsCompared = 0,
1094                 totalNumGSWordsCompared = 0,
1095                 totalNumLxaMorphemes = 0,
1096                 totalNumGSMorphemes = 0,
1097                 totalNumCorrectMorphemes = 0;
1098
1099         StringToCStemList::Iterator goldStdIt;
1100         for( goldStdIt = goldStdWords.begin(); goldStdIt != goldStdWords.end(); goldStdIt++ )
1101         {
1102                 strWord = goldStdIt.key();
1103
1104                 // We only look through words that exist in both spaces
1105                 if( lxaWords.find( strWord ) == lxaWords.end() ) continue;
1106                 pLxaStem = lxaWords.find( strWord ).data();
1107
1108                 totalNumLxaWordsCompared++;
1109
1110                 for( i = 1; i <= pLxaStem->Size(); i++ )
1111                 {
1112                         strPiece = pLxaStem->GetPiece(i).Display();
1113                         if( unionOfLxaMorphs.findIndex( strPiece ) == -1 )
1114                         {
1115                                 unionOfLxaMorphs.append( strPiece );
1116                         }
1117                 }
1118
1119                 // There may be duplicates in gold standard, we need the union of all their morphemes
1120                 for( pGoldStdStem = goldStdIt.data().first(); pGoldStdStem; pGoldStdStem = goldStdIt.data().next() )
1121                 {
1122                         totalNumGSWordsCompared++;
1123
1124                         for( i = 1; i <= pGoldStdStem->Size(); i++ )
1125                         {
1126                                 strPiece = pGoldStdStem->GetPiece(i).Display();
1127                                 if( unionOfGoldStdMorphs.findIndex( strPiece ) == -1 )
1128                                 {
1129                                         unionOfGoldStdMorphs.append( strPiece );
1130                                 }
1131                         }
1132                 }
1133         }
1134
1135         unionOfLxaMorphs.sort();
1136         unionOfGoldStdMorphs.sort();
1137
1138         QStringList::Iterator lxaMorphsIt = unionOfLxaMorphs.begin(),
1139                                                   goldStdMorphsIt = unionOfGoldStdMorphs.begin();
1140
1141         while( lxaMorphsIt != unionOfLxaMorphs.end() &&
1142                    goldStdMorphsIt != unionOfGoldStdMorphs.end() )
1143         {
1144                 if( *goldStdMorphsIt == *lxaMorphsIt )
1145                 {
1146                         totalNumCorrectMorphemes++;
1147                         totalNumLxaMorphemes++;
1148                         totalNumGSMorphemes++;
1149
1150                         ++goldStdMorphsIt;
1151                         ++lxaMorphsIt;
1152                 }
1153                 else if( *goldStdMorphsIt > *lxaMorphsIt )
1154                 {
1155                         totalNumLxaMorphemes++;
1156
1157                         ++lxaMorphsIt;
1158                 }
1159                 else // *goldStdMorphsIt < *lxaMorphsIt
1160                 {
1161                         totalNumGSMorphemes++;
1162
1163                         ++goldStdMorphsIt;
1164                 }
1165         }
1166
1167         totalPrecision = (double) totalNumCorrectMorphemes / (double) totalNumLxaMorphemes;
1168         totalRecall = (double) totalNumCorrectMorphemes / (double) totalNumGSMorphemes;
1169
1170         averagePrecision = totalPrecision;
1171         averageRecall = totalRecall;
1172 }
1173
1174
1175
1176 /*  This is the old version that Yu Hu wrote before Alchemist 3.0
1177         I am keeping it here for reference. The last part of compareGoldStdSlot()
1178         was transferred to the new version.
1179
1180 void LinguisticaMainWindow::compareGoldStdSlot()
1181 {
1182         QString                                                         line;
1183         //int                                                                   FoundLoc;
1184         QString                                                         FirstPiece, RemainingPiece;
1185         QString                                                         theWord;
1186         CStem*                                                          theCStem;
1187         CParse*                                                         theParse;
1188         StringToParse*                                          TempSedCuts;
1189         StringToParse::Iterator                         StringToParseIt;
1190         StringToPtrCStem                                        GoldCuts;
1191         StringToParse                                           SedCuts;
1192         StringToPtrCStem::Iterator                      GoldStIt;
1193         StringToCStem::Iterator                         SFIt;
1194         CCorpusWordCollection*                          TempSFCut;
1195         StringToParse                                           SFCuts;
1196         CCorpusWord*                                            theCorpusWord;
1197
1198         // 1. Read the gold standard file
1199
1200         QString goldFileName;
1201
1202         goldFileName = QFileDialog::getOpenFileName( m_projectDirectory,
1203                                                                                                  "XML Files (*.xml)",
1204                                                                                                  this,
1205                                                                                                  "open file dialog",
1206                                                                                                  "Choose a file to open" );
1207
1208         if( !goldFileName.isEmpty() )
1209         {
1210
1211                 QFile goldFile( goldFileName );
1212                 if ( goldFile.open( IO_ReadOnly ) )
1213                 {
1214                         QDomDocument doc( "Goldstandard" );
1215
1216                          if( !doc.setContent(&goldFile) )
1217                          {
1218                                 goldFile.close();
1219                                 return;
1220                          }
1221
1222                         QDomElement root = doc.documentElement();
1223
1224                         QString rootTagName = root.tagName();
1225
1226                         if( root.tagName() != "GDS")
1227                         {
1228                                 QMessageBox::information( NULL, "Error", "There was an error reading the Gold Standard. The following XML tag cannot be read: " + rootTagName, "OK" );
1229                                 goldFile.close();
1230                                 return;
1231                         }
1232
1233                         // Read the header
1234                          QDomNode header = root.firstChild();
1235
1236                          QDomElement direction = header.nextSibling().toElement();
1237
1238                         // Filling into the docinfo
1239                          // Read the content
1240                          QDomNode contentnode = direction.nextSibling();
1241                          QDomElement content = contentnode.toElement();
1242
1243                          if( content.tagName() != "content" )
1244                  {
1245                                 QMessageBox::information( NULL, "Error", "Sorry, the xml file format is not supported", "OK" );
1246                                 goldFile.close();
1247                                 return;
1248                          }
1249
1250
1251
1252                          QString                                value;
1253                          int                                    supposedtotalnumberofwords;
1254                          QString                                key;
1255                          QString                                wordcomment;
1256                          int                                    numberofpieces;
1257                          int                                    start;
1258                          int                                    type;
1259                          int                                    color;
1260                          int                                    score;
1261                          QString                                allomorph, comment;
1262 //                       int                                    Index;
1263
1264
1265                          value = content.attribute("number", "100");
1266                          supposedtotalnumberofwords = value.toInt();
1267
1268                          QDomNode onewordnode = content.firstChild();
1269
1270                          while (!onewordnode.isNull())
1271                          {
1272                                 QDomElement oneword = onewordnode.toElement();
1273                                 if( !oneword.isNull() )
1274                                 {
1275                                         if( oneword.tagName() == "word" )
1276                                 {
1277                                                 key = oneword.attribute( "key", "" );
1278                                                 if (key == "")
1279                                                 {
1280                                                         onewordnode = onewordnode.nextSibling();
1281                                                         continue;
1282                                                 }
1283
1284                                                 theWord = key;
1285                                                 theCStem = new CStem(key);
1286                                                 GoldCuts.insert(theWord, theCStem);
1287
1288                                                 value = oneword.attribute("morphemes", "0");
1289                                                 numberofpieces = value.toInt();
1290
1291                                                 if (numberofpieces == 0)
1292                                                 {
1293                                                         onewordnode = onewordnode.nextSibling();
1294                                                         continue;
1295                                                 }
1296
1297                                                 wordcomment = oneword.attribute("comment", "");
1298
1299
1300                                                 QDomNode onepiecenode = oneword.firstChild();
1301
1302                                                 while (!onepiecenode.isNull())
1303                                                 {
1304                                                         QDomElement onepiece = onepiecenode.toElement();
1305                                                         if( !onepiece.isNull() )
1306                                                         {
1307                                                                 if( onepiece.tagName() == "morpheme" )
1308                                                         {
1309                                                                         value = onepiece.attribute("start", "-1");
1310                                                                         start = value.toInt();
1311                                                                         value =onepiece.attribute("type", "-1");
1312                                                                         type = value.toInt();
1313
1314
1315                                                                         if ( type == 0)
1316                                                                         {
1317                                                                                 GoldCuts.remove(theWord);
1318                                                                                 delete theCStem;
1319                                                                                 break;
1320                                                                         }
1321
1322
1323                                                                         value = onepiece.attribute("color", "-2");
1324                                                                         color = value.toInt();
1325                                                                         value = onepiece.attribute("score", "0");
1326                                                                         score = value.toInt();
1327                                                                         allomorph = onepiece.attribute("allomorph", "");
1328                                                                         comment = onepiece.attribute("comment", "");
1329
1330                                                                         if ( ( start == -1) || (type == -1) || (color == -2))
1331                                                                         {
1332                                                                                 onepiecenode = onepiecenode.nextSibling();
1333                                                                                 continue;
1334                                                                         }
1335
1336                                                                         theCStem ->CutRightBeforeHere(start);
1337
1338
1339                                                                         //m_WordCollection ->ParseOneWord(key, start, type, color, true, score, allomorph, comment, Index, wordcomment);
1340
1341
1342                                                                 }
1343                                                         }
1344
1345                                                         onepiecenode = onepiecenode.nextSibling();
1346                                                 }
1347
1348
1349                                         }
1350                                 }
1351
1352                                 onewordnode = onewordnode.nextSibling();
1353                          }
1354
1355                 }
1356                 else
1357                 {
1358                         QMessageBox::information( NULL, "Attention", "Unable to open " + goldFileName + " .", "OK" );
1359                         return;
1360                 }
1361
1362                 goldFile.close();
1363
1364         }
1365         else
1366         {
1367                 return;
1368         }
1369
1370         // Get the Linguistica analysis results (SF or PF)
1371
1372         if ( !m_lexicon) return;
1373         TempSFCut = m_lexicon->GetWords();
1374
1375         TempSFCut->Sort( KEY );
1376
1377         for( int i = 0; i < TempSFCut->GetCount(); i++ )
1378         {
1379                 theCorpusWord = TempSFCut->GetAt(i);
1380                 theWord = theCorpusWord->Display();
1381
1382                 SFCuts.insert(theWord, theCorpusWord);
1383
1384         }
1385
1386         // Get SED analyses
1387         if ( m_Words_Templates != NULL)
1388         {
1389                 TempSedCuts = m_Words_Templates ->GetParsedResult();
1390                 for ( StringToParseIt = TempSedCuts ->begin(); StringToParseIt != TempSedCuts ->end();  StringToParseIt++)
1391                 {
1392                         theWord = StringToParseIt.key();
1393                         theParse = StringToParseIt.data();
1394                         theCStem = new CStem(*theParse);
1395
1396                         SedCuts.insert(theWord, theCStem);
1397                 }
1398
1399
1400         }
1401
1402         // Goldstandard comparison output
1403         double TotalPrecision;
1404         double TotalRecall;
1405         double AveragePrecision;
1406         double AverageRecall;
1407
1408         double Ftot=0.0;
1409         double Fav=0.0;
1410         QString                                 outs;
1411
1412         ////////////// compute precision recall         SF
1413
1414         GetPrecisionRecall(GoldCuts,SFCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1415         Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1416         Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1417         // print out precision recall
1418         outs = QString("Total precision MiniLexicon = %1, total recall= %2 Ftot=%3 ").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1419
1420         QMessageBox::information ( NULL, "Irinaoutput", outs );
1421
1422
1423
1424         ///////////////////////////////// SED
1425         if ( m_Words_Templates != NULL)
1426         {
1427
1428                 GetPrecisionRecall(GoldCuts,SedCuts, TotalPrecision, TotalRecall,AveragePrecision,AverageRecall);
1429
1430                 Ftot=2*TotalPrecision*TotalRecall/(TotalPrecision+TotalRecall);
1431                 Fav=2*AveragePrecision*AverageRecall/(AveragePrecision+AverageRecall);
1432
1433                 outs = QString("Total precision SED= %1, total recall= %2 Ftot=%3").arg(TotalPrecision).arg(TotalRecall).arg(Ftot);
1434
1435                 QMessageBox::information ( NULL, "Irinaoutput", outs );
1436         }
1437
1438         // TODO: delete all QMap data that are pointers
1439 }
1440
1441
1442 void LinguisticaMainWindow::GetPrecisionRecall(StringToCStemList& GoldStM, StringToParse&  ResultM, double &TotalPrecision, double &TotalRecall, double &AveragePrecision, double &AverageRecall )
1443 {
1444
1445         // iterate through the gold standard QMap GoldStM
1446         // look up each word in the results QMap ResultM
1447         // get the CParses for each word from GoldStM and ResultM and compare them
1448
1449
1450         // precision and recall are computed for all cuts
1451         // average precision and recall: precision and recall are computed for each word and then averaged
1452
1453         StringToCStemList::Iterator             GoldStIt;
1454         CParse*                                                 ReferenceStem;
1455         CParse*                                                 ResultStem;
1456 //      CParse*                                                 ReferenceParse;
1457 //      CParse*                                                 ResultParse;
1458         int*                                                    ReferenceCuts;
1459         int*                                                    ResultCuts;
1460         QString                                                 Word;
1461
1462     // simultaneously collect the results for individual morphemes
1463
1464         int TotalNumReferenceCuts=0;
1465         int TotalNumResultCuts=0;
1466         int TotalNumRightCuts=0;// use for precision
1467         int TotalNumMissedReferenceCuts=0; // use for recall
1468
1469         AveragePrecision=0;
1470         AverageRecall=0;
1471         TotalPrecision=0;
1472         TotalRecall=0;
1473
1474         int NumWords=0;
1475
1476         for( GoldStIt = GoldStM.begin(); GoldStIt != GoldStM.end();GoldStIt++)
1477         {
1478
1479
1480         // do for each word
1481         // get the reference and the result cuts
1482
1483                 Word = GoldStIt.key();
1484                 for( ReferenceStem = GoldStIt.data().first(); ReferenceStem; ReferenceStem = GoldStIt.data().next() )
1485                 {
1486                         // For now, we are only handling one version of each word
1487                         // string. We need to escape after the first.
1488                         if( ReferenceStem != GoldStIt.data().first() )
1489                         {
1490                                 break;
1491                         }
1492
1493                         //ReferenceParse=ReferenceStem->GetStemPtr();
1494                         ReferenceCuts=ReferenceStem->GetPieces();
1495
1496
1497
1498                         if(ResultM.contains(Word))
1499                         {
1500                                 NumWords++;
1501
1502                                 ResultStem=ResultM[Word];
1503                         //      ResultParse=ResultStem->GetStemPtr();
1504                                 ResultCuts=ResultStem->GetPieces();
1505
1506
1507                                 // compare these two analyses
1508                                 int RefSz=ReferenceStem->Size();
1509                                 int ResSz=ResultStem->Size();
1510
1511
1512                                 int RefCount=1;// the first index is always 0
1513                                 int ResCount=1;// the first index is always 0
1514
1515                                 int NumRightCuts=0;
1516                                 int NumMissedCuts=0;
1517
1518                                 // compare the cuts
1519                                 // if there is only one morpheme=word in both analyses:
1520                                 if(RefSz==1&& ResSz==1)
1521                                         {
1522
1523                                                 AveragePrecision+=1;
1524                                                 AverageRecall+=1;
1525                                                 TotalNumRightCuts++;
1526                                                 TotalNumReferenceCuts+=(RefSz);
1527                                                 TotalNumResultCuts+=(ResSz);
1528
1529                                                 continue;
1530                                         }
1531                                         else if(RefSz==1) // all cuts are wrong
1532                                         {
1533
1534                                                 TotalNumResultCuts+=(ResSz-1);
1535                                                 continue;
1536                                         }
1537                                         else if(ResSz==1)
1538                                         {
1539                                                 TotalNumReferenceCuts+=(RefSz-1);
1540                                                 continue;
1541                                         }
1542
1543                                 TotalNumReferenceCuts+=(RefSz-1);
1544                                 TotalNumResultCuts+=(ResSz-1);
1545
1546                                 int begin=0;
1547                                 int end=0;
1548
1549                                 while(RefCount<(RefSz) && ResCount<(ResSz))
1550                                 {
1551                                   int NextRefIndex=ReferenceCuts[RefCount];
1552                                   int NextResIndex=ResultCuts[ResCount];
1553
1554                                         if(NextRefIndex == NextResIndex)// the right cut
1555                                         {
1556
1557
1558                                                         NumRightCuts++;
1559                                                         TotalNumRightCuts++;
1560
1561                                                         RefCount++;
1562                                                         ResCount++;
1563
1564                                         }
1565                                         else if(NextRefIndex < NextResIndex)//missed cut
1566                                         {
1567
1568                                                         //NumMissedCuts++;
1569                                                         //TotalNumMissedReferenceCuts++;
1570
1571                                                         RefCount++;
1572                                         }
1573                                         else if (NextRefIndex > NextResIndex)//wrong cut
1574                                         {
1575                                                         ResCount++;
1576                                         }
1577                                 }
1578
1579                                 double WordPrecision=(double)NumRightCuts/(double)(ResSz-1);
1580                                 double WordRecall=(double)NumRightCuts/(double)(RefSz-1);
1581                                 AveragePrecision+=WordPrecision;
1582                                 AverageRecall+=WordRecall;
1583
1584                         }
1585                 }
1586         }
1587
1588
1589         AveragePrecision/=(double)NumWords;
1590         AverageRecall/=(double)NumWords;
1591
1592         TotalPrecision=(double)TotalNumRightCuts/(double)TotalNumResultCuts;
1593         TotalRecall=(double)TotalNumRightCuts/(double)TotalNumReferenceCuts;
1594
1595 }
1596
1597 */
1598