// Compound discovery and analysis within the Lexicon class
// Copyright © 2009 The University of Chicago
#include "Lexicon.h"

#include <QTextStream>
#include <QList>
#include "linguisticamainwindow.h"
#include "ui/Status.h"
#include "EarleyParser.h"
#include "MiniLexicon.h"
#include "GrammarRule.h"
#include "Signature.h"
#include "Compound.h"
#include "Linker.h"
#include "Suffix.h"
#include "Prefix.h"
#include "Stem.h"
#include "Edge.h"
#include "TerminalRuleCollection.h"
#include "CompoundCollection.h"
#include "LinkerCollection.h"
#include "WordCollection.h"
#include "StemCollection.h"
#include "Typedefs.h"

/**
 * Displays all compounds in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>. The <i>separator</i> will be
 * placed between components of a compound in the first column.
 */
void CLexicon::CompoundListDisplay( Q3ListView* pView, StringToString* filter, QChar separator )
{
	CCompound* pCompound;
	int MostParses = 0;
	int n;

	pView->setRootIsDecorated( FALSE );

	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		if( !m_pCompounds->GetAt(i)->GetParses() ) continue;

		n = m_pCompounds->GetAt(i)->GetParses()->count();
		if( n > MostParses ) MostParses = n;
	}

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Compound" );
	pView->addColumn( "# Parses" );
	pView->addColumn( "Most Frequent Stem" );
	pView->addColumn( "Prefixness" );
	pView->addColumn( "Suffixness" );

	m_pCompounds->Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating compound list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetCount());
	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		status.progress = i;
		pCompound = m_pCompounds->GetAtSort(i);
		pCompound->CompoundListDisplay(pView, filter, separator);
	}
	status.progress.clear();
	status.major_operation.clear();
}

/**
 * Displays all linkers in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>.
 */
void CLexicon::LinkerListDisplay(Q3ListView* pView, QMap<QString, QString>* filter)
{
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Linker" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );
	pView->addColumn( "Compounds" );

	m_pLinkers->Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating linker list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pLinkers->GetCount());
	for (int i = 0; i < (int)m_pLinkers->GetCount(); i++) {
		status.progress = i;
		CLinker* pLinker = m_pLinkers->GetAtSort(i);
		pLinker->ListDisplay( pView, filter );
	}
	status.progress.clear();
	status.major_operation.clear();
}

/**
 * Displays all compound components in the collection view <i>pView</i>.
 * All joined characters are re-filtered on output to separate characters
 * with the lexicon’s filter.
 */
void CLexicon::CompoundComponentListDisplay(Q3ListView* pView)
{
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Compound component" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );

	linguistica::ui::status_user_agent& status = status_display();
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponents()->GetCount());
	for (int i = 0; i < (int)m_pCompounds->GetComponents()->GetCount(); i++) {
		status.progress = i;
		CStem* pStem = m_pCompounds->GetComponents()->GetAtSort(i);
		static_cast<void>(new Q3ListViewItem(
			pView, pStem->GetKey().Display(),
			IntToStringWithCommas(pStem->GetCorpusCount())));
	}
	status.progress.clear();

	// XXX. necessary?
	status.major_operation.clear();
}

void CLexicon::FromStemsFindFlatCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker, int maxNumberOfRoots )
{
	CMiniLexicon* pMini;
	int stemCount = 0;
	int j;

	int MaximumParseDepth = GetIntParameter( "EarleyParser\\MaximumParseDepth", 6 );

	// We need to have analyzed some stems.
	if( !compounds )
	{
		for (int i = 0; i < static_cast<int>( m_pMiniLexica->size() ); i++ )
		{
			pMini = (*m_pMiniLexica)[i];
			if( pMini ) stemCount += pMini->GetStems()->GetCount();
		}
		if( stemCount == 0 ) return;
	}

	CGrammarRule* pRule;
	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	CParse oneParse;
	CEdge* pEdge;
	CSS ssCompound;
	CMiniLexicon* mini;

	QTextStream* logFile = NULL;

	int longestCompound = 0;
	int shortestComponent = 0;

	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	if (!compounds) {
		for (int i = GetMiniSize()-1; i >= 0; i--) {
			mini = GetMiniLexicon(i);
			if( !mini ) continue;

			for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			{
				Stems << mini->GetStems()->GetAt(j);
			}

			for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			{
				// We don't want to parse analyzed words
				if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;

				// We don't want words that are too short
				if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;

				Stems << mini->GetWords()->GetAt(j);
			}
		}
	} else {
		for (int z = 0; z < compounds->size(); z++) {
			CStem* pStem = compounds->at(z);
			Stems << pStem;
			if (pStem->GetKeyLength() > longestCompound)
				longestCompound = pStem->GetKeyLength();
		}

		for (int y = 0; y < components->size(); y++) {
			CStem* pStem = components->at(y);
			Stems << pStem;
			if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
				shortestComponent = pStem->GetKeyLength();
		}
	}

	// We need a grammar to parse from
	RuleCollection grammar;

	// Start
	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Start Word";			// Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	// Word
	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Word Compound";			// Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	// Compound
	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );

	for (int i = 2; i <= maxNumberOfRoots; i++) {	// Compound --> Root Root+
		pRule = new CGrammarRule();

		rule = "Compound";
		for( j = 0; j < i; j++ ) rule += " Root";

		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );
	}
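
	// For illustration: with maxNumberOfRoots == 3, the loop above adds the flat
	// productions
	//     Compound -> Root Root
	//     Compound -> Root Root Root
	// where, as the inline comments suggest, the first token of each rule string
	// names the left-hand side and the remaining tokens form the right-hand side.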

	// Linker
	CTerminalRuleCollection linkers;

	if( linker == QString::null )
	{
		if( MAXIMUM_LINKER_LENGTH > 0 )
		{
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );

			for (int i = 1; i <= MAXIMUM_LINKER_LENGTH; i++) {
				rule = "Linker";
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
			}
		}
	}
	else
	{
		rule = "Linker";
		tRule = new CTerminalRule( rule );
		tRule->SetKey( CStringSurrogate( linker ) );
		Q_ASSERT( tRule == ( linkers << tRule ) );
	}

	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
	{
		for (int i = 2; i <= maxNumberOfRoots; i++) {	// Compound --> Root (Linker Root)+
			pRule = new CGrammarRule();

			rule = "Compound";
			for( j = 0; j < i - 1; j++ ) rule += " Root Linker";
			rule += " Root";

			pRule->Collapse( rule );
			grammar[ "Compound" ].append( pRule );
		}
	}
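
	// For illustration: with a linker available and maxNumberOfRoots == 3, this
	// block adds
	//     Compound -> Root Linker Root
	//     Compound -> Root Linker Root Linker Root
	// i.e. the same flat root sequences as above, but with a linker between each
	// pair of adjacent roots.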

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;

	if (!components) {
		Stems.Sort(KEY);
		for (int i = 0; i < Stems.GetCount(); i++) {
			CStem* pStem = Stems.GetAtSort(i);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	} else {
		for (int z = 0; z < components->size(); z++) {
			CStem* pStem = components->at(z);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}

	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	// Parse all stems
	Stems.Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());
	CEarleyParser* parser = NULL;
	for (int i = 0; i < Stems.GetCount(); i++) {
		status.progress = i;
		CStem* pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		key.SetBackwards();
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;
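
		// The two checks above use the forward and reverse stem tries as a cheap
		// pre-filter: a word only reaches the (comparatively expensive) Earley
		// parse if at least two known stems line up with its left edge and at
		// least two line up with its right edge (the word itself is in Stems, so
		// it presumably accounts for one match on each side).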

		// Log if desired
		logFile = LogFileOn() ? GetLogFileStream() : NULL;

		// Parse the word
		parser = new CEarleyParser(pStem, &grammar, &lexicon, logFile, MaximumParseDepth);
		pStem->SetMyEarleyParser(parser);

		if (parser->isValidGrammar())
			allParses = parser->Parse();

		if (allParses && !allParses->isEmpty()) {
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			double componentCount = 0.0,
			       linkerCount = 0.0;
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::FromStemsFindCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker )
{
	CMiniLexicon* pMini;
	int stemCount = 0;
	int i, j;

	int MaximumParseDepth = GetIntParameter( "EarleyParser\\MaximumParseDepth", 5 );

	// We need to have analyzed some stems.
	if( !compounds )
	{
		for( i = 0; i < static_cast<int>( m_pMiniLexica->size() ); i++ )
		{
			pMini = (*m_pMiniLexica)[i];
			if( pMini ) stemCount += pMini->GetStems()->GetCount();
		}
		if( stemCount == 0 ) return;
	}

	CStem* pStem;
	CGrammarRule* pRule;
	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	CParse oneParse;
	CEdge* pEdge;
	CSS ssCompound;
	CMiniLexicon* mini;
	QTextStream* logFile = NULL;

	int longestCompound = 0;
	int shortestComponent = 0;

	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	if( !compounds )
	{
		for( i = GetMiniSize()-1; i >= 0; i-- )
		{
			mini = GetMiniLexicon(i);
			if( !mini ) continue;

			for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			{
				Stems << mini->GetStems()->GetAt(j);
			}

			for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			{
				// We don't want to parse analyzed words
				if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;

				// We don't want words that are too short
				if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;

				Stems << mini->GetWords()->GetAt(j);
			}
		}
	}
	else
	{
		//for( pStem = compounds->first(); pStem; pStem = compounds->next() )
		for (int z = 0; z < compounds->size(); z++)
		{
			pStem = compounds->at(z);
			Stems << pStem;
			if( pStem->GetKeyLength() > longestCompound )
				longestCompound = pStem->GetKeyLength();
		}

		//for( pStem = components->first(); pStem; pStem = components->next() )
		for (int y = 0; y < components->size(); y++)
		{
			pStem = components->at(y);
			Stems << pStem;
			if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
				shortestComponent = pStem->GetKeyLength();
		}
	}

	// We need a grammar to parse from
	RuleCollection grammar;

	// Start
	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Start Word";			// Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	// Word
	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Word Compound";			// Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	// Compound
	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Compound Compound Compound";	// Compound --> Compound Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Compound Root";	// Compound --> Compound Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Compound";	// Compound --> Root Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Root";		// Compound --> Root Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );
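
	// Taken together, the four rules above define a recursive, binary-branching
	// compound grammar:
	//     Compound -> Compound Compound | Compound Root | Root Compound | Root Root
	// Unlike FromStemsFindFlatCompounds(), which enumerates flat Root^n productions
	// up to maxNumberOfRoots, this version licenses compounds with any number of
	// roots through recursion, presumably bounded in practice by the
	// MaximumParseDepth handed to the parser.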

	// Linker
	CTerminalRuleCollection linkers;

	if( linker == QString::null )
	{
		if( MAXIMUM_LINKER_LENGTH > 0 )
		{
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );

			for( i = 1; i <= MAXIMUM_LINKER_LENGTH; i++ )
			{
				rule = "Linker";
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
			}
		}
	}
	else
	{
		rule = "Linker";
		tRule = new CTerminalRule( rule );
		tRule->SetKey( CStringSurrogate( linker ) );
		Q_ASSERT( tRule == ( linkers << tRule ) );
	}

	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
	{
		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Compound";	// Compound --> Compound Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Root";		// Compound --> Compound Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Compound";		// Compound --> Root Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Root";		// Compound --> Root Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );
	}
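
	// When a linker is available, the block above adds the corresponding binary
	// rules with a linker between the two daughters:
	//     Compound -> Compound Linker Compound | Compound Linker Root
	//               | Root Linker Compound     | Root Linker Root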

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;

	if( !components )
	{
		Stems.Sort(KEY);
		for( i = 0; i < Stems.GetCount(); i++ )
		{
			pStem = Stems.GetAtSort(i);

			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}
	else
	{
		//for( pStem = components->first(); pStem; pStem = components->next() )
		for (int z = 0; z < components->size(); z++)
		{
			pStem = components->at(z);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}

	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	// Parse all stems
	Stems.Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());
	CEarleyParser* parser = NULL;
	for (i = 0; i < Stems.GetCount(); i++) {
		status.progress = i;
		pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		key.SetBackwards();
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;

		// Log if desired
		if( LogFileOn() )
		{
			logFile = GetLogFileStream();
		} else {
			logFile = NULL;
		}

		// Parse the word
		parser = new CEarleyParser( pStem, &grammar, &lexicon, logFile, MaximumParseDepth );
		pStem->SetMyEarleyParser( parser );

		if( parser->isValidGrammar() )
			allParses = parser->Parse();

		if( allParses && !allParses->isEmpty() )
		{
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			//-----------------------------------------------//
			// Register every piece of this parse as a compound component
			for (int m = 1; m <= oneParse.Size(); m++)
			{
				*m_pCompounds->GetComponents() << oneParse.GetPiece(m).Display();
			}
			//-----------------------------------------------//

			double componentCount = 0.0,
			       linkerCount = 0.0;
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::CalculateCoefficientsOfAffixness()
{
	int i, j, count = 0;
	int numberofwords;
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CSuffix* pSuffix;
	CPrefix* pPrefix;
	CStem* pStem, * pWord;
	StemSet* pStemSet = NULL;
	CLinker* pLinker = NULL;
	CCompound* pCompound;
	CEdge* pEdge;
	CParse parse;
	CSignature* pPrefixSignature;

	QList<CStem*>* pStems;

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Calculating affixness...";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponentMap()->count());
	ComponentMap::Iterator it;
	for (it = m_pCompounds->GetComponentMap()->begin(); it != m_pCompounds->GetComponentMap()->end(); ++it) {
		status.progress = ++count;
		double affix_prob = 0.0,
		       component_prob = 0.0;

		double sig_count, stem_count, affix_count;

		pSuffixSet = m_AllSuffixes[ it.key() ];

		if( pSuffixSet )
		{
			for( int suffixno = 0; suffixno < static_cast<int>( pSuffixSet->count() ); suffixno++ )
			{
				pSuffix = pSuffixSet->at(suffixno);

				pStems = pSuffix->GetStems();

				if( !pStems ) continue;

				for (int stemno = 0; stemno < pStems->size(); stemno++)
				{
					pStem = pStems->at(stemno);
					stem_count = pStem->GetCorpusCount();
					sig_count = pStem->GetSuffixSignature()->GetCorpusCount();
					CSignature* pSuffixSignature = pStem->GetSuffixSignature();
					affix_count = 0;
					numberofwords = pSuffixSignature->GetNumberOfWords();
					pWord = pSuffixSignature->GetWord(stemno, suffixno);
					if( pWord->GetSuffix() != pSuffix->GetKey() ) continue;
					affix_count += pWord->GetCorpusCount();
					affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
						      ( (double) stem_count / (double) sig_count ) *
						      ( (double) affix_count / (double) sig_count );
				}
			}
		}
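
		// For each (stem, suffix) pair attested in a signature, the loop above adds
		//     (sig_count / N) * (stem_count / sig_count) * (affix_count / sig_count)
		// to affix_prob, where N is the corpus count of the whole lexicon; the
		// prefix loop below accumulates the same estimate from the prefix side.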

		pPrefixSet = m_AllPrefixes[ it.key() ];

		if( pPrefixSet )
		{
			for( int prefixno = 0; prefixno < static_cast<int>( pPrefixSet->count() ); prefixno++ )
			{
				pPrefix = pPrefixSet->at(prefixno);

				pStems = pPrefix->GetStems();

				if( !pStems ) continue;

				//for( pStem = pStems->first(); pStem; pStem = pStems->next() )
				for (int stemno = 0; stemno < pStems->size(); stemno++)
				{
					pStem = pStems->at(stemno);
					stem_count = pStem->GetCorpusCount();
					sig_count = pStem->GetPrefixSignature()->GetCorpusCount();

					// XXX. Explain.
					// QList<CStem*>* pWords = pStem->GetPrefixSignature()->GetWordPtrList();
					pPrefixSignature = pStem->GetPrefixSignature();
					numberofwords = pPrefixSignature->GetNumberOfWords();
					// if( !pWords ) continue;

					affix_count = 0;
					pWord = pPrefixSignature->GetWord(stemno, prefixno);
					if( pWord->GetPrefix() != pPrefix->GetKey() ) continue;
					affix_count += pWord->GetCorpusCount();

					affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
						      ( (double) stem_count / (double) sig_count ) *
						      ( (double) affix_count / (double) sig_count );
				}
			}
		}
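
		// Prior probability that a corpus token is a compound at all; it weights
		// each candidate parse in the loop below.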
		double word_is_compound_prob = (double) m_pCompounds->GetCorpusCount() / (double) GetCorpusCount();

		for( i = 0; i < m_pCompounds->GetCount(); i++ )
		{
			pCompound = m_pCompounds->GetAt(i);

			double this_parse_prob,
			       piece_prob;
			for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
			//for (int z = 0; z < pCompound->GetParses()->size(); z++)
			{
				// pEdge = pCompound->GetParses()->at(z);
				pEdge->GetParse( &parse );

				if( parse.Find( it.key() ) == 0 ) continue;

				this_parse_prob = 1.0;

				for( j = 1; j < parse.Size(); j++ )
				{
					// A piece is either a known compound component or a linker;
					// reset both pointers before looking the piece up.
					pStemSet = NULL;
					pLinker = NULL;
					if( m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ) != m_pCompounds->GetComponentMap()->end() )
					{
						pStemSet = m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ).data();
					}
					else pLinker = *GetLinkers() ^= parse.GetPiece(j);

					if( pStemSet )
					{
						piece_prob = pStemSet->at(0)->GetCompoundCount() / m_pCompounds->GetComponentCount();
					}
					else if( pLinker ) piece_prob = pLinker->GetCompoundCount() / m_pCompounds->GetLinkerCount();
					else continue;

					this_parse_prob *= piece_prob;
				}

				component_prob += word_is_compound_prob * this_parse_prob;
			}
		}
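
		// component_prob now sums, over every compound parse that contains this
		// component, the probability of that parse (the product of its piece
		// probabilities) weighted by the compound prior computed above.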

		pStemSet = it.data();
		//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
		for (int y = 0; y < pStemSet->size(); y++)
		{
			pStem = pStemSet->at(y);
			if( affix_prob == 0.0 ) pStem->SetAffixness( 0.0 );
			else if( component_prob == 0.0 ) pStem->SetAffixness( 1.0 );
			else pStem->SetAffixness( affix_prob / ( affix_prob + component_prob ) );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::FromAffixnessUpdateSigsAndCompounds()
{
	bool ok;
	double UPPER_THRESHOLD = 0.70;
	UPPER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
			"Enter the upper affixness threshold:",
			UPPER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) UPPER_THRESHOLD = 0.70;

	double LOWER_THRESHOLD = 0.30;
	LOWER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
			"Enter the lower affixness threshold:",
			LOWER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) LOWER_THRESHOLD = 0.30;

	if( UPPER_THRESHOLD < LOWER_THRESHOLD ) UPPER_THRESHOLD = LOWER_THRESHOLD;
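
	// If the user cancels either dialog, the corresponding threshold falls back to
	// its default (0.70 upper, 0.30 lower), and the upper threshold is clamped so
	// it never ends up below the lower one.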

	double affixness;

	ComponentMap* pComponents = m_pCompounds->GetComponentMap();
	CCompound* pCompound;
	CEdge* pEdge;
	CParse parse;
	QString component, word;
	StemSet* pStemSet;
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CStem* pStem, *pWord;
	CSuffix* pSuffix;
	CPrefix* pPrefix;
	CMiniLexicon* pMini;

	QList<CCompound*> cmpdDeletions;
	QList<CMiniLexicon*> affectedMinis;

	for( int i = 0; i < m_pCompounds->GetCount(); i++ )
	{
		pCompound = m_pCompounds->GetAt(i);

		QList<CEdge*> edgeDeletions;

		for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
		//for (int z = 0; z < pCompound->GetParses()->size(); z++)
		{
			// pEdge = pCompound->GetParses()->at(z);
			pEdge->GetParse( &parse );

			for( int j = 1; j <= parse.Size(); j++ )
			{
				component = parse.GetPiece(j).Display();
				if( pComponents->find( component ) == pComponents->end() ) continue;

				pStemSet = pComponents->find( component ).data();

				affixness = pStemSet->at(0)->GetAffixness();

				// Remove compounds which have a component whose affixness
				// exceeds the upper affixness threshold
				if( affixness >= UPPER_THRESHOLD )
				{
					//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
					for (int y = 0; y < pStemSet->size(); y++)
					{
						pStem = pStemSet->at(y);
						pStem->SetCompoundCount( 0.0 );
						edgeDeletions.append( pEdge );
					}
				}

				// Remove suffixes whose corresponding component's affixness
				// is below the lower affixness threshold
				if( affixness <= LOWER_THRESHOLD && affixness > 0 )
				{
					// Must exist as suffix or prefix also
					pSuffixSet = m_AllSuffixes[ component ];
					bool isSuffix = TRUE;
					if( !pSuffixSet )
					{
						pPrefixSet = m_AllPrefixes[ component ];
						isSuffix = FALSE;
						if( !pPrefixSet ) continue;
					}

					if( isSuffix )
					{
						//for( pSuffix = pSuffixSet->first(); pSuffix; pSuffix = pSuffixSet->next() )
						for (int z = 0; z < pSuffixSet->size(); z++)
						{
							pSuffix = pSuffixSet->at(z);
							pMini = pSuffix->GetMyMini();
							pStemSet = pSuffix->GetStems();

							//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
							for (int y = 0; y < pStemSet->size(); y++)
							{
								pStem = pStemSet->at(y);
								word = pStem->Display() + pSuffix->Display();

								pWord = (*pMini->GetWords()) ^= CSS( word );

								pWord->ClearParseStructure();

								if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );
							}
						}
					}
					else
					{
						//for( pPrefix = pPrefixSet->first(); pPrefix; pPrefix = pPrefixSet->next() )
						for (int z = 0; z < pPrefixSet->size(); z++)
						{
							pPrefix = pPrefixSet->at(z);
							pMini = pPrefix->GetMyMini();
							pStemSet = pPrefix->GetStems();

							//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
							for (int y = 0; y < pStemSet->size(); y++)
							{
								pStem = pStemSet->at(y);
								word = pPrefix->Display() + pStem->Display();
								pWord = (*pMini->GetWords()) ^= CSS( word );
								pWord->ClearParseStructure();
								if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );
							}
						}
					}
				}
			}
		}

		// Remove all edges marked for deletion
		//for( pEdge = edgeDeletions.first(); pEdge; pEdge = edgeDeletions.next() )
		for (int z = 0; z < edgeDeletions.size(); z++)
		{
			pEdge = edgeDeletions.at(z);
			pCompound->RemoveParse( pEdge );
		}

		if( pCompound->GetParses()->count() == 0 )
		{
			cmpdDeletions.append( pCompound );
		}
		else if( pCompound->GetBestParse() < 0 ) pCompound->SetBestParse( 0 );
	}

	// Remove all compounds marked for deletion
	//for( pCompound = cmpdDeletions.first(); pCompound; pCompound = cmpdDeletions.next() )
	for (int z = 0; z < cmpdDeletions.size(); z++)
	{
		pCompound = cmpdDeletions.at(z);
		m_pCompounds->RemoveMember( pCompound );
	}

	// Update all affected minis
	QString strAffixness = "Affixness";
	CStringSurrogate cssAffixness( strAffixness );
	//for( pMini = affectedMinis.first(); pMini; pMini = affectedMinis.next() )
	for (int z = 0; z < affectedMinis.size(); z++)
	{
		pMini = affectedMinis.at(z);
		pMini->TakeSplitWords_ProduceStemsAndSigs( cssAffixness );
	}

	// Check validity of compounds
	m_pCompounds->CheckAndRecount();
}