MiniLexicon_Core.cpp

   1 // Suffix/signature-based discovery of morphology: core
   2 // Copyright © 2009 The University of Chicago
   3 #include "MiniLexicon.h"
   4
   5 #include <memory>
   6 #include <vector>
   7 #include <QList>
   8 #include "ui/Status.h"
   9 #include "Lexicon.h"
  10 #include "DLHistory.h"
  11 #include "Signature.h"
  12 #include "Suffix.h"
  13 #include "Prefix.h"
  14 #include "Stem.h"
  15 #include "SignatureCollection.h"
  16 #include "SuffixCollection.h"
  17 #include "PrefixCollection.h"
  18 #include "WordCollection.h"
  19 #include "StemCollection.h"
  20 #include "CompareFunc.h"
  21 #include "StringFunc.h"
  22 #include "HTML.h"
  23 #include "AffixLocation.h"
  24 #include "implicit_cast.h"
  25 using linguistica::implicit_cast;
  26
  27 /// Writes to status.details instead of status.major_operation.
  28 void CMiniLexicon::TakeSplitWords_ProduceStemsAndSigs( CStringSurrogate& Remark,
  29                                                        CWordCollection* pWords,
  30                                                        CStemCollection* pStems,
  31                                                        CPrefixCollection* pPrefixes,
  32                                                        CSuffixCollection* pSuffixes )
  33
  34 {
  35         if( pWords == NULL )    { pWords = m_pWords; };
  36         if( pStems == NULL )    { pStems = m_pStems; }
  37         if( pPrefixes == NULL ) { pPrefixes = m_pPrefixes; }
  38         if( pSuffixes == NULL ) { pSuffixes = m_pSuffixes; }
  39
  40         RebuildAffixesStemsAndSignaturesFromWordSplits( Remark );
  41 }
  42
  43 void CMiniLexicon::RebuildAffixesStemsAndSignaturesFromWordSplits(
  44                 CStringSurrogate& Remark)
  45 {
  46         int HowManySuffixes = 0, HowManyPrefixes = 0;
  47         CStemCollection* OldStems;
  48         CStringSurrogate ssPrefix, ssStem, ssSuffix;
  49         CStem* pWord = NULL;
  50         CSuffix* pSuffix = NULL;
  51         CStem* pStem;
  52         CStem* qStem;
  53         CPrefix* pPrefix = NULL;
  54         CSuffix* pNullSuffix = NULL;
  55         CPrefix* pNullPrefix = NULL;
  56
  57         CLexicon& lex = *m_pLexicon;
  58         linguistica::ui::status_user_agent& status = lex.status_display();
  59
  60         LogFileSmallTitle("Rebuilding all signature structures");
  61         status.details = "Rebuilding all signature structures.";
  62         status.progress.clear();
  63
  64         bool analyzingSuffixes = true;
  65         if (m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
  66                 analyzingSuffixes = false;
  67         if (analyzingSuffixes) {
  68                 m_pSuffixes->RemoveAll();
  69                 pNullSuffix = *m_pSuffixes << TheStringNULL;
  70         } else {
  71                 m_pPrefixes->RemoveAll();
  72                 pNullPrefix = *m_pPrefixes << TheStringNULL;
  73         }
  74         OldStems = m_pStems;
  75         m_pStems = new CStemCollection(this);
  76         status.details = "Rebuilding all signature structures: words.";
  77         status.progress.set_denominator(m_pWords->GetCount());
  78         for (int wordno = 0; wordno < (int)m_pWords->GetCount(); wordno++) {
  79                 pWord = m_pWords->GetAt(wordno);
  80                 if (!pWord)
  81                         continue;
  82
  83                 pWord->SetSuffixSignature(NULL);
  84                 pWord->SetPrefixSignature(NULL);
  85
  86                 status.progress = wordno;
  87
  88                 // commented out to avoid seg fault
  89                 if (
  90 #if 0
  91                     pWord->GetWordType() == CStem::BIWORD_COMPOUND ||
  92                     pWord->GetWordType() == CStem::MULTIPLE_COMPOUND ||
  93 #endif
  94                     pWord->GetWordType() == CStem::NUMBER)
  95                         continue;
  96
  97                 // rebuild stems, part 1.
  98
  99                 ssStem = pWord->GetStem();
 100
 101                 if (ssStem.GetLength() == 0) {
 102                         pWord->SetStemLoc(1);
 103                         pWord->SetSuffixLoc(0);
 104                         pWord->SetPrefixLoc(0);
 105                         continue;
 106                 }
 107
 108                 // rebuild affixes.
 109
 110                 if (analyzingSuffixes) {
 111                         ssSuffix = pWord->GetSuffix();
 112                         if (ssSuffix.GetLength() < 1)
 113                                 continue;
 114                         pSuffix = *m_pSuffixes << ssSuffix;
 115                         if (!pSuffix)
 116                                 continue;
 117                         pSuffix->IncrementCorpusCount(pWord->GetCorpusCount() - 1);
 118                         pSuffix->IncrementUseCount();
 119                         pWord->SetSuffixPtr(pSuffix);
 120                         HowManySuffixes++;
 121                 } else {
 122                         ssPrefix = pWord->GetPrefix();
 123                         if (ssPrefix.GetLength() < 1)
 124                                 continue;
 125                         pPrefix = *m_pPrefixes << ssPrefix;
 126                         pPrefix->IncrementCorpusCount(pWord->GetCorpusCount() - 1);
 127                         pPrefix->IncrementUseCount();
 128                         pWord->SetPrefixPtr(pPrefix);
 129                         HowManyPrefixes++;
 130                 }
 131
 132                 // attach affix to stem.
 133
 134                 if (analyzingSuffixes) {
 135                         pStem = *m_pStems << ssStem ;
 136                         if (!pStem)
 137                                 continue;
 138                         pWord->AttachWordAndSuffixalStem(pStem);
 139                         pSuffix = *m_pSuffixes ^= pWord->GetSuffix();
 140                         if (!pSuffix)
 141                                 continue;
 142                         pSuffix->AddStem(pStem);
 143                 } else {
 144                         pStem = *m_pStems << ssStem;
 145                         pWord->AttachWordAndPrefixalStem(pStem);
 146                         pPrefix = *m_pPrefixes ^= pWord->GetPrefix();
 147                         pPrefix->AddStem(pStem);
 148                 }
 149
 150                 pStem->SetConfidence(pWord->GetConfidence());
 151                 qStem = *OldStems ^= pStem->GetKey();
 152                 if (qStem)
 153                         pStem->SetConfidence(qStem->GetConfidence());
 154                 else
 155                         pStem->SetConfidence(Remark.Display());
 156
 157                 Q_ASSERT(pWord->IsValid());
 158
 159                 if (analyzingSuffixes)
 160                         pStem->AddSuffix(pSuffix);
 161                 else
 162                         pStem->AddPrefix(pPrefix);
 163                 LogFileEndRow();
 164         } // loop over wordno
 165         status.progress.clear();
 166         LogFileEndTable();
 167         LogFile1SimpleString("End of word loop");
 168
 169         // in what follows,
 170         // if a stem is also a word, then we add NULL
 171         // to the stem's factorization.
 172         // Stems Loop.
 173         LogFileSmallTitle("Stem Loop");
 174         LogFileStartTable();
 175         status.details = "Rebuilding all signature structures:stems.";
 176         status.progress.set_denominator(m_pStems->GetCount());
 177         for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
 178                 pStem = m_pStems->GetAt(stemno);
 179                 status.progress = stemno;
 180                 LogFile(pStem->Display());
 181                 if (analyzingSuffixes) {
 182                         if (pStem->GetNumberOfSuffixes() < 1)
 183                                 continue;
 184                 } else {
 185                         if (pStem->GetNumberOfPrefixes() < 1)
 186                                 continue;
 187                 }
 188                 pWord = *m_pWords ^= pStem->GetKey();
 189                 if (pWord) {
 190                         if (analyzingSuffixes) {
 191                                 pStem->AddNULLSuffix();
 192                                 pStem->AddWord(pWord);
 193                                 pNullSuffix->IncrementUseCount();
 194                                 pNullSuffix->IncrementCorpusCount(pWord->GetCorpusCount());
 195                                 pStem->IncrementWordCount();
 196                                 pNullSuffix->AddStem(pStem);
 197                         } else {
 198                                 pStem->AddNULLPrefix();
 199                                 pStem->AddWord(pWord);
 200                                 pNullPrefix->IncrementUseCount();
 201                                 pNullPrefix->IncrementCorpusCount(pWord->GetCorpusCount());
 202                                 pStem->IncrementWordCount();
 203                                 pNullPrefix->AddStem(pStem);
 204                         }
 205
 206                         pStem->IncrementCorpusCount(pWord->GetCorpusCount());
 207                 }
 208                 pStem->GetSuffixList()->Alphabetize();
 209                 LogFile(pStem->Display());
 210         } // end of stem loop
 211         LogFileEndTable();
 212         status.details.clear();
 213         delete OldStems;
 214         FindAllSignatures();
 215 }
 216
 217 //      Find All Signatures (from each Stem signatures)
 218
 219 /*  This scans the Stems, forming entries in This
 220  *  for each Class signature (the concatenation of
 221  *  suffixes taken by a stem
 222  *  It also makes a CPtrList for each signature
 223  *  It also makes a pointer to the Sig for each Stem. <<<<
 224  */
 225 void CMiniLexicon::FindAllSignatures()
 226 {
 227         CLexicon& lex = *m_pLexicon;
 228         linguistica::ui::status_user_agent& status = lex.status_display();
 229
 230         const int MinimumSignatureLength = m_pLexicon->GetIntParameter(
 231                         "Main\\MinimumSignatureLength", 1);
 232         const bool analyzingSuffixes = !is_initial(m_AffixLocation);
 233
 234         status.details = "Find all signatures.";
 235
 236         std::auto_ptr<CSignatureCollection> OldSignatures(m_pSignatures);
 237         m_pSignatures = analyzingSuffixes ?
 238                 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
 239                 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation);
 240
 241         for (int wordno = 0; wordno < m_pWords->GetCount(); wordno++)
 242         {
 243             m_pWords->GetAt(wordno)->SetPrefixSignature(NULL);
 244             m_pWords->GetAt(wordno)->SetSuffixSignature(NULL);
 245         }
 246
 247         // For each stem: generate signature from prefix/suffix list
 248         const int num_stems = m_pStems->GetCount();
 249         bool NewlyCreatedSignature = FALSE;
 250         status.progress.clear();
 251         status.progress.set_denominator(num_stems - 1);
 252         for (int stemno = num_stems - 1; stemno >= 0 ; --stemno) {
 253                 CStem& stem = *m_pStems->GetAt(stemno);
 254                 status.progress = num_stems-1 - stemno;
 255
 256                 CParse& affixes = analyzingSuffixes ?
 257                                 *stem.GetSuffixList() :
 258                                 *stem.GetPrefixList();
 259
 260                 QList<CSuffix*> suffixes;       // used if analyzingPrefixes
 261                 QList<CPrefix*> prefixes;       // used if !analyzingPrefixes
 262
 263                 // Look up prefixes/suffixes for the new signature’s
 264                 // prefix/suffix pointer list
 265                 for (int j = 1; j <= affixes.Size(); ++j) {
 266                         CStringSurrogate affix_text = affixes.GetPiece(j);
 267
 268                         analyzingSuffixes ? suffixes.append(*m_pSuffixes ^= affix_text):
 269                                             prefixes.append(*m_pPrefixes ^= affix_text);
 270                 }
 271
 272                 // If not enough affixes, clear this stem’s signature.
 273                 // Otherwise, record the new signature.
 274
 275                 CSignature* new_sig;
 276                 if (affixes.Size() < MinimumSignatureLength) new_sig = 0;
 277
 278                 new_sig = *m_pSignatures ^= &affixes;
 279                 if ( new_sig  )
 280                 {
 281                    NewlyCreatedSignature = FALSE;
 282                 }
 283                 else {
 284                     new_sig = *m_pSignatures << &affixes;
 285                     NewlyCreatedSignature = TRUE;
 286                 }
 287                 if (analyzingSuffixes) {
 288                         if (new_sig != 0)
 289                                 new_sig->AttachToSuffixSig(&stem);
 290                         stem.SetSuffixSignature(new_sig);
 291                         foreach (CStem* word, *stem.GetWordPtrList())
 292                                 word->SetSuffixSignature(new_sig);
 293                 } else {
 294                         if (new_sig != 0)
 295                                 new_sig->AttachToPrefixSig(&stem);
 296                         stem.SetPrefixSignature(new_sig);
 297                         foreach (CStem* word, *stem.GetWordPtrList())
 298                                 word->SetPrefixSignature(new_sig);
 299                 }
 300                 if (NewlyCreatedSignature)
 301                 {
 302                     // Record signature origin.
 303                     if (new_sig != 0)
 304                     {
 305                             if (CSignature* old_sig = *OldSignatures ^= new_sig)
 306                                 new_sig->SetRemark(old_sig->GetRemark());
 307                             else
 308                                 new_sig->SetRemark(
 309                                     QString("From known stem and %1")
 310                                     .arg(analyzingSuffixes ?
 311                                     "suffix" : "prefix"));
 312                     }
 313                 }
 314
 315         } // loop over stemno
 316         status.progress.clear();
 317
 318         for (int signo = 0; signo < m_pSignatures->GetCount(); signo++)
 319         {
 320             m_pSignatures      ->GetAt(signo)->RecalculateStemAndWordPointers();
 321         }
 322
 323         status.details.clear();
 324 }
 325
 326 // From Stems Find Suffixes
 327 void CMiniLexicon::FromStemsFindAffixes()
 328 {
 329         CStem* pStem;
 330         CAffix* pAffix;
 331         CAffix* qAffix;
 332         CStem* pWord;
 333         CSignature* pSig;
 334         int StemLength;
 335         int wordno = 0;
 336         CStringSurrogate ssStem, ssAffix, ssWord, ssTemp;
 337         CSuffixCollection TempSuffixes;
 338         CPrefixCollection TempPrefixes;
 339         CParse Word;
 340         const double RobustnessThreshold = m_pLexicon->GetIntParameter(
 341                 "FromStemsFindSuffixes\\RobustnessThreshold", 10);
 342         const int MinimumNumberOfOccurrences = m_pLexicon->GetIntParameter(
 343                 "FromStemsFindSuffixes\\MinimumNumberOfOccurrences", 3);
 344         const int MinimumStemLength = m_pLexicon->GetIntParameter(
 345                 "Main\\MinimumStemLength", 3);
 346         bool analyzingSuffixes = true;
 347
 348         if (m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
 349                 analyzingSuffixes = false;
 350
 351         std::auto_ptr<CSuffixCollection> TempSuffixesFinal;
 352         std::auto_ptr<CPrefixCollection> TempPrefixesFinal;
 353         if (analyzingSuffixes)
 354                 TempSuffixesFinal = std::auto_ptr<CSuffixCollection>(
 355                         new CSuffixCollection(this));
 356         else
 357                 TempPrefixesFinal = std::auto_ptr<CPrefixCollection>(
 358                         new CPrefixCollection(this));
 359
 360         if (m_pStems->GetCount() == 0)
 361                 return;
 362
 363         m_pStems->Sort(KEY);
 364         m_pWords->Sort(KEY);
 365
 366         m_pSignatures->CheckRobustness();       // TODO: check this for prefix compatibility
 367
 368         CLexicon& lex = *m_pLexicon;
 369         linguistica::ui::status_user_agent& status = lex.status_display();
 370
 371         status.major_operation = QString("Mini-Lexicon %1: From stems, find %2")
 372                 .arg(m_Index + 1).arg(analyzingSuffixes ?
 373                         QString("suffixes") : QString("prefixes"));
 374
 375         status.details = "From known stems, find new affixes";
 376         status.progress.clear();
 377         status.progress.set_denominator(m_pStems->GetCount());
 378         LogFileLargeTitle("Phase: From known stems, find new affixes");
 379         LogFileHeader("Stems", "Robustness", "Affix");
 380         for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
 381                 status.progress = stemno;
 382
 383                 pStem = m_pStems->GetAtSort(stemno);
 384                 ssStem = pStem->GetKey();
 385                 LogFile(pStem->Display());
 386
 387                 StemLength = ssStem.GetLength();
 388                 if ((int) StemLength < MinimumStemLength)
 389                         continue;
 390
 391                 // Here is where to test to see
 392                 // if the stem's signature is really robust.
 393                 if (analyzingSuffixes)
 394                         pSig = pStem->GetSuffixSignature();
 395                 else
 396                         pSig = pStem->GetPrefixSignature();
 397                 LogFile(pSig->GetRobustness());
 398                 if (pSig->GetRobustness() < RobustnessThreshold)
 399                         continue;
 400
 401                 // New formulation:
 402                 CParse StemContinuations;
 403                 if (analyzingSuffixes) {
 404                         m_pWords->GetTrie()->CreateSuffixSignatures(
 405                                 pStem->GetKey(), &StemContinuations);
 406                 } else {
 407                         ssTemp = pStem->GetKey();
 408                         ssTemp.SetBackwards(true);
 409                         m_pWords->GetReverseTrie()->CreateSuffixSignatures(
 410                                 &ssTemp, &StemContinuations);
 411                         ssTemp.SetBackwards(false);
 412                 }
 413
 414                 for (int suffixno = 1; suffixno < (int) StemContinuations.Size(); suffixno++) {
 415                         ssAffix  = StemContinuations.GetPiece(suffixno);
 416                         if (analyzingSuffixes) {
 417                                 Word = ssStem + ssAffix;
 418                                 pWord = *m_pWords ^= Word;
 419                                 if (m_pWords->GetAtSort(wordno)->GetSuffixLoc() > 0)
 420                                         continue;
 421                                 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
 422                                         wordno++;
 423                                         continue;
 424                                         // or use WordType
 425                                 }
 426                                 if (*m_pSuffixes ^= ssAffix)
 427                                         continue;
 428                                 pAffix = TempSuffixes << ssAffix;
 429                                 LogFile("", "", ssAffix.Display());
 430                         } else {
 431                                 ssAffix.SetBackwards(true);
 432                                 Word = ssAffix + ssStem;
 433                                 pWord = *m_pWords ^= Word;
 434                                 if (m_pWords->GetAtSort(wordno)->GetPrefixLoc() > 0)
 435                                         continue;
 436                                 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
 437                                         wordno++;
 438                                         continue;
 439                                         // or use WordType
 440                                 }
 441                                 if (*m_pPrefixes ^= ssAffix)
 442                                         continue;
 443                                 pAffix = TempPrefixes << ssAffix;
 444                                 ssAffix.SetBackwards(false);
 445                         }
 446                 } // end of suffixno loop
 447         } // end of stemno loop
 448         status.progress.clear();
 449         status.details.clear();
 450
 451         LogFileEndTable();
 452
 453         status.details = "Testing proposed affixes";
 454         status.progress.clear();
 455         if (analyzingSuffixes) {
 456                 LogFileSmallTitle("Testing proposed affixes");
 457                 LogFileHeader("Affix", "Count");
 458                 status.progress.set_denominator(TempSuffixes.GetCount());
 459                 for (int suffixno = 0; suffixno < (int)TempSuffixes.GetCount(); suffixno++) {
 460                         status.progress = suffixno;
 461                         pAffix = TempSuffixes.GetAt(suffixno);
 462                         LogFile(pAffix->Display(), pAffix->GetUseCount());
 463                         if (pAffix->GetUseCount() >= MinimumNumberOfOccurrences) {
 464                                 qAffix = *TempSuffixesFinal << pAffix->Display();
 465                                 if (qAffix)
 466                                         LogFile("Yes", pAffix->Display());
 467                                 else
 468                                         LogFile("No",  pAffix->Display());
 469                         }
 470                 }
 471                 LogFileEndTable();
 472         } else {
 473                 status.progress.set_denominator(TempPrefixes.GetCount());
 474                 for (int prefixno = 0; prefixno < (int)TempPrefixes.GetCount(); prefixno++) {
 475                         status.progress = prefixno;
 476                         pAffix = TempPrefixes.GetAt(prefixno);
 477                         if (pAffix->GetUseCount() >= MinimumNumberOfOccurrences) {
 478                                 qAffix = *TempPrefixesFinal << pAffix->GetKey();
 479                                 if (qAffix)
 480                                         LogFile("Yes", pAffix->Display());
 481                                 else
 482                                         LogFile ("No", pAffix->Display());
 483                         }
 484                 }
 485         }
 486         status.progress.clear();
 487         status.details.clear();
 488
 489         // Now we'll look to see how often these acceptable suffixes are
 490         // used on real stems
 491         LogFileSmallTitle("How acceptable are these suffixes?");
 492         LogFileHeader("Stems", "Direction", "Word");
 493         status.details = "Cutting words using stems and proposed affixes";
 494         status.progress.clear();
 495 //      GetDocument()->CountDownOnStatusBar(-1);
 496         if (analyzingSuffixes) {
 497                 wordno=0;
 498                 status.progress.set_denominator(m_pStems->GetCount());
 499                 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
 500                         status.progress = stemno;
 501                         pStem = m_pStems->GetAtSort(stemno);
 502                         ssStem = pStem->GetKey();
 503                         StemLength = ssStem.GetLength();
 504
 505                         if ((int)StemLength < MinimumStemLength)
 506                                 continue;
 507                         LogFile(ssStem.Display());
 508
 509                         // Make sure the Words-ptr
 510                         // is above Stem in the Word list.
 511
 512                         // Going up...
 513                         while (wordno > 0 &&
 514                                wordno < (int)m_pWords->GetCount() &&
 515                                m_pWords->GetAtSort(wordno)->GetKey().Display() >= ssStem.Display())
 516                                 wordno--;
 517
 518                         // Bring the Words-ptr down to Stem.
 519                         while (wordno < (int)(m_pWords->GetCount() - 1) &&
 520                                m_pWords->GetAtSort(wordno)->GetKey().Display() < ssStem.Display())
 521                                 wordno++;
 522
 523                         // Considering all words beginning with ssStem.
 524                         while (wordno < (int)m_pWords->GetCount() &&
 525                                m_pWords->GetAtSort(wordno)->GetKey().Left(StemLength) == ssStem) {
 526                                 LogFile("", m_pWords->GetAtSort(wordno)->Display());
 527                                 if (m_pWords->GetAtSort(wordno)->GetSuffixLoc() > 0) {
 528                                         wordno++;
 529                                         LogFile1SimpleString(" already has suffix ");
 530                                         LogFileEndRow();
 531                                         continue;
 532                                 } //the word is already analyzed
 533
 534                                 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
 535                                         wordno++;
 536                                         LogFile(" is a compound");
 537                                         continue;
 538                                 }
 539
 540                                 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
 541                                 ssAffix = ssWord.Right(ssWord.GetLength() - StemLength);
 542                                 LogFile("Suffix (a)", ssAffix.Display());
 543                                 if ((pAffix = (*TempSuffixesFinal ^= ssAffix))) {
 544                                         LogFile("already exists suffix ", ssAffix.Display());
 545                                         pWord = m_pWords->GetAtSort(wordno);
 546                                         pWord->CutRightBeforeHere(StemLength);
 547                                         pWord->AppendToConfidence("somewhat uncertain.");
 548                                         pWord->SetStemLoc(1);
 549                                         pWord->SetSuffixLoc(2);
 550                                         pAffix->IncrementUseCount();
 551                                         LogFile(pWord->Display());
 552                                         m_pLexicon->UpdateWord(pWord);
 553                                 } else
 554                                         LogFile("Suffix not found.");
 555
 556                                 wordno++;
 557                         }
 558                 } // end of loop through stems collection
 559                 LogFileEndTable();
 560         } else {        // Prefixes!
 561                 wordno=0;
 562                 m_pWords->Sort(REVERSE_KEY);
 563                 status.progress.set_denominator(m_pStems->GetCount());
 564                 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
 565                         status.progress = stemno;
 566                         pStem = m_pStems->GetAtSort(stemno);
 567                         ssStem = pStem->GetKey();
 568                         StemLength = ssStem.GetLength();
 569                         if ((int)StemLength < MinimumStemLength)
 570                                 continue;
 571
 572                         // Make sure the Words-ptr is above Stem in the Word list.
 573                         // This is rarely executed.
 574                         ssStem.SetBackwards(true);
 575                         while (wordno > 0 && wordno < (int)m_pWords->GetCount()) {
 576                                 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
 577                                 ssWord.SetBackwards(true);
 578                                 if (ssWord.Display() >= ssStem.Display())
 579                                         break;
 580                                 wordno--;
 581                         }
 582
 583                         // bring the Words-ptr down to Stem.
 584                         while (wordno < (int)(m_pWords->GetCount() - 1)) {
 585                                 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
 586                                 ssWord.SetBackwards(true);
 587                                 if (ssWord.Display() < ssStem.Display())
 588                                         break;
 589                                 wordno++;
 590                         }
 591
 592                         ssWord.SetBackwards(false);
 593                         ssStem.SetBackwards(false);
 594
 595                         while (wordno < (int)m_pWords->GetCount() &&
 596                                m_pWords->GetAtSort(wordno)->GetKey().Right(StemLength) == ssStem.Display()) {
 597                                 if (m_pWords->GetAtSort(wordno)->GetPrefixLoc() > 0) {
 598                                         wordno++;
 599                                         continue;
 600                                 }
 601                                 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
 602                                         wordno++;
 603                                         continue;
 604                                 } // or use WordType
 605                                 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
 606                                 ssAffix = ssWord.Left(ssWord.GetLength() - StemLength);
 607                                 if ((pAffix = (*TempPrefixesFinal ^= ssAffix))) {
 608                                         pWord = m_pWords->GetAtSort(wordno);
 609                                         pWord->CutRightBeforeHere(ssWord.GetLength() - StemLength);
 610                                         pWord->AppendToConfidence("somewhat uncertain.");
 611                                         pWord->SetStemLoc(2);
 612                                         pWord->SetPrefixLoc(1);
 613                                         pAffix->IncrementUseCount();
 614                                         LogFile("Cut : ", pWord->Display());
 615                                         m_pLexicon->UpdateWord(pWord);
 616                                 }
 617                                 wordno++;
 618                         }
 619                 } // end of stemno loop
 620         }
 621         status.progress.clear();
 622         status.details.clear();
 623
 624         if (analyzingSuffixes) {
 625                 QString Remark("From stems find suffixes");
 626                 CStringSurrogate ssRemark(Remark);
 627
 628                 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
 629                 // XXX. not an operation.
 630                 status.major_operation = "Mini-Lexicon " + QString("%1").arg( m_Index+1 ) + ": End -- from stems, find suffixes";
 631         } else {        // Prefixes!
 632
 633                 QString Remark("From stems find prefixes");
 634                 CStringSurrogate ssRemark(Remark);
 635                 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
 636                 status.major_operation = "Mini-Lexicon " + QString("%1").arg( m_Index+1 ) + ": End -- from stems, find prefixes";
 637         }
 638
 639         QString mini_name("Mini-Lexicon %1");
 640         mini_name = mini_name.arg(GetIndex() + 1);
 641         QString remark = "From stems: find affixes";
 642         GetDLHistory()->append(mini_name, remark, this);
 643 }
 644
 645 void CMiniLexicon::ExtendKnownStemsToKnownAffixes()
 646 {
 647         CLexicon& lex = *m_pLexicon;
 648         linguistica::ui::status_user_agent& status = lex.status_display();
 649
 650         // We will be iterating over stems and words, so sort them.
 651         m_pStems->Sort(KEY);
 652         m_pWords->Sort(KEY);
 653
 654         // Give the user some text to stare at
 655         status.progress.clear();
 656         status.major_operation =
 657                 QString("Mini-Lexicon %1: Extend known stems and suffixes")
 658                 .arg(m_Index + 1);
 659
 660         const bool analyzingSuffixes = !is_initial(m_AffixLocation);
 661         LogFileLargeTitle("Phase: Extending analysis \n based on known stems and known affixes.");
 662         LogFileStartTable();
 663         if (analyzingSuffixes) {LogFileHeader("Stem", "Suffix");}
 664                else {LogFileHeader("Prefix", "Stem");}
 665
 666
 667         // For each stem:
 668         const int TopCount = m_pStems->GetCount();
 669         status.details = "Going through the stems ";
 670         status.progress.set_denominator(TopCount);
 671         for (int t = 0; t < TopCount; ++t) {
 672                 CStem& stem = *m_pStems->GetAtSort(t);
 673
 674                 status.progress = t;
 675
 676                 CStringSurrogate affix_text = analyzingSuffixes ?
 677                                 stem.GetSuffixList()->GetPiece(1) :
 678                                 stem.GetPrefixList()->GetPiece(1);
 679                 const int stem_length = stem.GetKeyLength();
 680                 const CStringSurrogate stem_text = stem.GetKey();
 681
 682                 // Write "" instead of "NULL".
 683                 affix_text.ConvertNULL();
 684
 685                 const CParse word = analyzingSuffixes ?
 686                                 stem_text + affix_text :
 687                                 affix_text + stem_text;
 688
 689                 int SortIndex = 0;
 690                 CStem* found_word = *m_pWords ^= word;
 691                 if (found_word != 0)
 692                         SortIndex = found_word->GetSortIndex();
 693
 694                 // First word starting with (resp ending with) stem text
 695                 int TopIndex = SortIndex;
 696                 if (analyzingSuffixes) {
 697                         while (TopIndex >= 1 &&
 698                                         m_pWords->GetAtSort(TopIndex - 1)
 699                                         ->GetKey().Left(stem_length) == stem_text)
 700                                 --TopIndex;
 701                 } else {
 702                         while (TopIndex >= 1 &&
 703                                         m_pWords->GetAtSort(TopIndex - 1)
 704                                         ->GetKey().Right(stem_length) == stem_text)
 705                                 --TopIndex;
 706                 }
 707
 708                 // Last word starting with (resp ending with) stem text
 709                 int BottomIndex = SortIndex;
 710                 if (analyzingSuffixes) {
 711                         while (BottomIndex+1 < m_pWords->GetCount() &&
 712                                         m_pWords->GetAtSort(BottomIndex+1)
 713                                         ->GetKey().Left(stem_length) == stem_text)
 714                                 ++BottomIndex;
 715                 } else {
 716                         while (BottomIndex+1 < m_pWords->GetCount() &&
 717                                         m_pWords->GetAtSort(BottomIndex+1)
 718                                         ->GetKey().Right(stem_length) == stem_text)
 719                                 ++BottomIndex;
 720                 }
 721
 722                 // For each word starting with (resp ending with) stem text:
 723                 for (int w = TopIndex; w <= BottomIndex; ++w) {
 724                         CStem& word2 = *m_pWords->GetAtSort(w);
 725
 726                         // Ignore already analyzed words
 727                         const enum CStem::type word2_type = word2.GetWordType();
 728                         const int affix_loc = analyzingSuffixes ?
 729                                         word2.GetSuffixLoc() :
 730                                         word2.GetPrefixLoc();
 731                         if (affix_loc != 0 ||
 732                             word2_type == CStem::BIWORD_COMPOUND ||
 733                             word2_type == CStem::MULTIPLE_COMPOUND ||
 734                             word2_type == CStem::POSSIBLE_COMPOUND)
 735                                 continue;
 736
 737                         // Investigate possible affix
 738                         const CStringSurrogate word2_text = word2.GetKey();
 739                         const int affixlen = word2_text.GetLength() - stem_length;
 740                         const CStringSurrogate PossibleAffix =
 741                                         analyzingSuffixes ?
 742                                         word2_text.Right(affixlen) :
 743                                         word2_text.Left(affixlen);
 744                         CAffix* found_affix;
 745                         analyzingSuffixes ?
 746                                 found_affix = *m_pSuffixes ^= PossibleAffix:
 747                                 found_affix = *m_pPrefixes ^= PossibleAffix;
 748                         if (found_affix == 0)
 749                                 continue;
 750
 751                         // Affix present.  Rewrite word2 as stem + affix.
 752                         // (Do not update counts yet.)
 753                         if (analyzingSuffixes) {
 754                                 word2.CutRightBeforeHere(stem_length);
 755                                 word2.SetStemLoc(1);
 756                                 word2.SetSuffixLoc(2);
 757                                 stem.AddSuffix(static_cast<CSuffix*>(found_affix));
 758                                 found_affix->AddStem(&stem);
 759                         } else {
 760                                 word2.CutRightBeforeHere(affixlen);
 761                                 word2.SetStemLoc(2);
 762                                 word2.SetPrefixLoc(1);
 763                                 stem.AddPrefix(static_cast<CPrefix*>(found_affix));
 764                                 found_affix->AddStem(&stem);
 765                         }
 766                         if (analyzingSuffixes)  { LogFile(stem.Display(), found_affix->Display());}
 767                          else                    { LogFile(found_affix->Display(), stem.Display());}
 768                         m_pLexicon->UpdateWord(&word2);
 769                 }
 770         }
 771         status.progress.clear();
 772         status.details.clear();
 773         LogFileEndTable();
 774
 775         // Redo Signatures.
 776         QString Remark("From known stems and suffixes");
 777         CStringSurrogate ssRemark(Remark);
 778
 779         RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
 780
 781         // XXX. not an operation
 782         status.major_operation = QString("Mini-Lexicon %1: "
 783                         "End of using known stems and %2")
 784                         .arg(m_Index + 1)
 785                         .arg(analyzingSuffixes ? "suffixes" : "prefixes");
 786
 787         GetDLHistory()->append(
 788                 QString("Mini-Lexicon %1").arg(m_Index + 1),
 789                 QString("Known stems and affixes"),
 790                 this);
 791 }
 792
 793 /*
 794  * At the moment, this function decides too early whether to
 795  * allow a stem to be used. It ought to build up the TempSignatures,
 796  * and after all stems have been tried, decide which way to split
 797  * a stem is the best.
 798  */
 799
 800 /* Take a word W.
 801  *    Consider all splits for W such that its suffix is already in Suffixes.
 802  *      Put the stem in TempStems, and attach the suffix to the stem.
 803  *      Look at all words that start like that stem;
 804  *           Attach the suffixes found in those words to that stem.
 805  *      Done?
 806  *      Now, if there are more than 2 suffixes on that stem now,
 807  *           put the stem's signature in a TempSignature collection.
 808  *      Find cost = cost (Sig).
 809  *      If cost is negative, go ahead and make all those splits.
 810  *  End of loop.
 811  */
 812 void CMiniLexicon::LooseFit()
 813 {
 814         // Cost of a sig =
 815         //    Sum over all of its stems :
 816         //        log ( CorpusSize / Stem-count )      ( cost    )
 817         //        length ( stem ) * cost of a letter   ( savings )
 818         //
 819         //    Sum over all of its suffixes:
 820         //        log ( CorpusSize / suffix-count )    ( cost    )
 821         //        length ( suffix ) * cost of a letter ( savings )
 822         using std::auto_ptr;
 823         using std::vector;
 824
 825         CLexicon& lex = *m_pLexicon;
 826         linguistica::ui::status_user_agent& status = lex.status_display();
 827         const bool analyzingSuffixes = !is_initial(m_AffixLocation);
 828         const int MinimumAffixLength = m_pLexicon->GetIntParameter(
 829                         analyzingSuffixes ?
 830                         "Main\\MinimumSuffixLength" :
 831                         "Main\\MinimumPrefixLength", 2);
 832         const int MaximumAffixLength = m_pLexicon->GetIntParameter(
 833                         analyzingSuffixes ?
 834                         "Main\\MaximumSuffixLength" :
 835                         "Main\\MaximumPrefixLength", 7);
 836         int MinimumStemLength = m_pLexicon->GetIntParameter(
 837                         "Main\\MinimumStemLength", 3);
 838         if (MinimumStemLength < 1)
 839                 // do not allow empty stems
 840                 // XXX. notify user that value was ignored
 841                 MinimumStemLength = 1;
 842
 843         LogFileLargeTitle("Phase: Loose fit of known signatures");
 844         LogFileStartTable();
 845
 846         auto_ptr<CSignatureCollection> proposed_sigs(analyzingSuffixes ?
 847                 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
 848                 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
 849         auto_ptr<CSignatureCollection> existing_sigs(analyzingSuffixes ?
 850                 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
 851                 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
 852         auto_ptr<CSignatureCollection> novel_sigs(analyzingSuffixes ?
 853                 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
 854                 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
 855
 856         QList<CSignature*> ProposedSignatures;
 857         QList<CSignature*> ExistingSignatures;
 858         QList<CSignature*> NovelSignatures;
 859
 860         if (m_pWords->GetCount() == 0)
 861                 // Need to read corpus first.
 862                 return;
 863
 864         // We will  be iterating over words, so sort them.
 865         m_pWords->Sort(KEY);
 866
 867         // Give the user something to look at.
 868         status.progress.clear();
 869         status.major_operation =
 870                 QString("Mini-Lexicon %1: Loose fit.").arg(m_Index+1);
 871
 872         // For each word: look for known prefixes/suffixes and consider them.
 873         // Fill proposed_sigs with candidates.
 874         CStemCollection potential_stems;
 875         const int num_words = m_pWords->GetCount();
 876         status.progress.set_denominator(num_words);
 877         for (int wordno = 0; wordno < num_words; ++wordno) {
 878                 CStem& word = *m_pWords->GetAtSort(wordno);
 879                 status.progress = wordno;
 880
 881                 if (word.Size() > 1)
 882                         // already analyzed
 883                         continue;
 884
 885                 if (word.GetKeyLength() < 2)
 886                         // too small to analyze
 887                         continue;
 888
 889                 CStringSurrogate word_text = word.GetKey();
 890
 891                 // For each textual prefix/suffix of word: if known, consider.
 892                 for (int i = MaximumAffixLength; i >= MinimumAffixLength; --i) {
 893                         Q_ASSERT(MinimumStemLength >= 1);
 894                         if (word_text.GetLength() - i < MinimumStemLength)
 895                                 // corresponding stem is too short
 896                                 continue;
 897
 898                         CStringSurrogate affix_text = analyzingSuffixes ?
 899                                 word_text.Right(i) : word_text.Left(i);
 900                         CAffix* affix = analyzingSuffixes ?
 901                                 implicit_cast<CAffix*>(*m_pSuffixes ^= affix_text) :
 902                                 implicit_cast<CAffix*>(*m_pPrefixes ^= affix_text);
 903
 904                         if (affix == 0)
 905                                 // suffix not known
 906                                 continue;
 907
 908                         Q_ASSERT(word_text.GetLength() > i);
 909                         CStringSurrogate stem_text = analyzingSuffixes ?
 910                                 word_text.Left(word_text.GetLength()-i) :
 911                                 word_text.Right(word_text.GetLength()-i);
 912
 913                         if (*m_pStems ^= stem_text)
 914                                 // stem already known
 915                                 continue;
 916
 917                         if (potential_stems ^= stem_text)
 918                                 // stem already under consideration
 919                                 continue;
 920
 921                         LogFile(stem_text.Display());
 922                         CStem* stem = potential_stems << stem_text;
 923
 924                         // build signature from remaining words
 925                         CParse empirical_sig;
 926                         if (analyzingSuffixes) {
 927                                 m_pWords->CreateSuffixSignatures(
 928                                                 &stem_text, &empirical_sig);
 929                         } else {
 930                                 if (!m_pWords->GetReverseTrie())
 931                                         m_pWords->CreateReverseTrie();
 932
 933                                 // build list of reversed prefixes
 934                                 CParse backwards_sig;
 935                                 stem_text.SetBackwards();
 936                                 m_pWords->GetReverseTrie()->CreateSuffixSignatures(
 937                                                 &stem_text, &backwards_sig);
 938                                 stem_text.SetBackwards(false);
 939
 940                                 // reverse them to get actual prefixes
 941                                 for (int n = 1; n <= backwards_sig.Size(); ++n) {
 942                                         CStringSurrogate prefix =
 943                                                 backwards_sig.GetPiece(n);
 944
 945                                         prefix.SetBackwards();
 946                                         empirical_sig.Append(prefix);
 947                                         prefix.SetBackwards(false);
 948                                 }
 949                         }
 950
 951                         // for each prefix/suffix that appears with stem:
 952                         //  throw it out if it's obviously bad, but
 953                         //  otherwise record it in *stem.
 954                         for (int n = 1; n <= empirical_sig.Size(); ++n) {
 955                                 CStringSurrogate affix2 = empirical_sig.GetPiece(n);
 956
 957                                 // word this affix was taken from
 958                                 CParse word2_text = analyzingSuffixes ?
 959                                                 stem_text + affix2 :
 960                                                 affix2 + stem_text;
 961                                 CStem* word2 = *m_pWords ^= word2_text;
 962                                 Q_ASSERT(word2 != 0);
 963
 964                                 if (word2->Size() > 1)
 965                                         // previously analyzed
 966                                         continue;
 967
 968                                 // If affix is of reasonable size or is
 969                                 // known from elsewhere, add it to stem's list.
 970                                 const int affix2_len = affix2.GetLength();
 971                                 const bool reasonable_size =
 972                                         affix2_len >= MinimumAffixLength ||
 973                                         affix2_len <= MaximumAffixLength;
 974
 975                                 if (reasonable_size || (analyzingSuffixes ?
 976                                                 (*m_pSuffixes ^= affix2) != 0:
 977                                                 (*m_pPrefixes ^= affix2) != 0)) {
 978                                         if (analyzingSuffixes)
 979                                                 stem->AddSuffix(affix2);
 980                                         else
 981                                                 stem->AddPrefix(affix2);
 982                                 }
 983                         }
 984                         // special case: stem appears with null affix
 985                         if (CStem* word2 = *m_pWords ^= stem_text) {
 986                                 if (word2->Size() > 1) {
 987                                         // previously analyzed
 988                                 } else {
 989                                         if (analyzingSuffixes)
 990                                                 stem->AddNULLSuffix();
 991                                         else
 992                                                 stem->AddNULLPrefix();
 993                                 }
 994                         }
 995
 996                         // fetch recorded affixes
 997                         CParse& affixes = analyzingSuffixes ?
 998                                 *stem->GetSuffixList() :
 999                                 *stem->GetPrefixList();
1000
1001                         // At least the affix that prompted this search
1002                         // should be among them.
1003                         Q_ASSERT(affixes.Contains(affix_text));
1004                         Q_ASSERT(affixes.Size() > 0);
1005
1006                         // build new signature
1007                         CSignature* new_sig = *proposed_sigs << &affixes;
1008
1009                         if (analyzingSuffixes)
1010                                 new_sig->AttachToSuffixSig(stem, false);
1011                         else
1012                                 new_sig->AttachToPrefixSig(stem, false);
1013                         new_sig->SetAffixLocation(m_AffixLocation);
1014                         LogFile1SimpleString (new_sig->Display());
1015                 } // for each textual prefix/suffix
1016
1017                 LogFile("");  // Blank line between stem-families
1018         } // end of wordno loop
1019         status.progress.clear();
1020
1021         // Now we have, or might have, a set of candidate splittings,
1022         // so let's compare them.
1023         // If any of the corresponding signatures are already real,
1024         // then we take the stem corresponding to the sig with the
1025         // highest value for Robustness.
1026         // Otherwise we use the Cost function for signatures.
1027         LogFileSmallTitle("Proposed signatures", "Signature", "status");
1028
1029
1030         // First find the set of existing signatures (won't be many).
1031
1032         for (int signo = 0; signo < ProposedSignatures.count(); signo++)
1033         {
1034                 CSignature* sig = ProposedSignatures.at(signo);
1035
1036                 if (sig->Size() < 2 && sig->GetKeyLength() < 4) {// total length < 4!  Way too small.
1037                     LogFile("too small: ",  sig->Display() );
1038                         continue;
1039                 }
1040
1041                 // known signature?
1042                 if (*GetSignatures() ^= sig) ExistingSignatures.append(sig);
1043                 else NovelSignatures.append(sig);
1044         }
1045         LogFileEndTable();
1046
1047         {
1048                 // Sort existing signatures by robustness
1049                 const int size = existing_sigs->GetCount();
1050
1051                 LogFileStartTable(); LogFileHeader("Pre-existing signature", "Robustnes" );
1052
1053
1054                 // calculate each signature’s robustness
1055                 vector<double> Robustness;
1056                 Robustness.reserve(size);
1057                 for (int signo = 0; signo < size; ++signo) {
1058                         CSignature* sig = (*existing_sigs)[signo];
1059
1060                         Robustness.push_back(sig->GetRobustness());
1061                         LogFile(sig->Display(), Robustness[signo]);
1062
1063                 }
1064                  LogFileHeader("Signature", "Stem");
1065
1066                 // For each existing signature (sorted by robustness):
1067                 // analyze (cut) unanalyzed words according to this signature.
1068                 vector<int> shuffle(size);
1069                 SortVector(&shuffle[0], &Robustness[0], size);
1070                 for (int j = 0; j < size; ++j) {
1071                         CSignature* sig = (*existing_sigs)[shuffle[j]];
1072                         LogFile(sig->Display());
1073
1074
1075                         // Analyze (cut) unanalyzed words according to signature.
1076                         sig->CutMyWordsAsIDeclare();
1077
1078                 }
1079                 LogFileEndTable();
1080         }
1081
1082         {
1083                 // Consider novel signatures.
1084                 const int size = novel_sigs->GetCount();
1085                 LogFileSmallTitle ("Novel signatures: "); LogFileStartTable();
1086
1087
1088                 // for each signature: calculate cost function
1089                 vector<double> Cost;
1090                 Cost.reserve(size);
1091                 for (int i = 0; i < size; ++i)
1092                         Cost.push_back((*novel_sigs)[i]->FindCost(this));
1093
1094                 // for each novel signature, in least-cost-first order:
1095                 // Analyze (cut) unanalyzed words according to this signature.
1096                 // Stop when the cost becomes positive.
1097                 vector<int> shuffle(size);
1098                 SortVector(&shuffle[0], &Cost[0], size);
1099
1100                 for (int signo = size-1; signo >= 0 && Cost[shuffle[signo]] < 0; --signo) {
1101                         CSignature* sig = (*novel_sigs)[shuffle[signo]];
1102                         LogFile(sig->Display(), Cost[shuffle[signo]]);
1103                        // Analyze (cut) unanalyzed words according to signature.
1104                        sig->CutMyWordsAsIDeclare();
1105
1106                 } // end of signo loop
1107                 // XXX. report cost of first rejected signature to log
1108         }
1109         // what had followed has been eliminated, and replaced by:
1110         CStringSurrogate remark(QString("Loose fit"));
1111         RebuildAffixesStemsAndSignaturesFromWordSplits (remark);
1112
1113         // XXX. not an operation
1114         status.major_operation = QString("Mini-Lexicon %1: End of loose fit.")
1115                         .arg(m_Index+1);
1116         GetDLHistory()->append(
1117                         QString("Mini-Lexicon %1").arg(m_Index+1),
1118                         QString("Loose Fit"),
1119                         this);
1120 }
1121
1122 int CMiniLexicon::GetNumberOfStems()
1123 {
1124         return m_pStems->GetCount();
1125 }