HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / MiniLexicon_Core.cpp
blob6126a50da52632ab18cd6be77941eb6658ee1338
1 // Suffix/signature-based discovery of morphology: core
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <memory>
6 #include <vector>
7 #include <QList>
8 #include "ui/Status.h"
9 #include "Lexicon.h"
10 #include "DLHistory.h"
11 #include "Signature.h"
12 #include "Suffix.h"
13 #include "Prefix.h"
14 #include "Stem.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "CompareFunc.h"
21 #include "StringFunc.h"
22 #include "HTML.h"
23 #include "AffixLocation.h"
24 #include "implicit_cast.h"
25 using linguistica::implicit_cast;
27 /// Writes to status.details instead of status.major_operation.
28 void CMiniLexicon::TakeSplitWords_ProduceStemsAndSigs( CStringSurrogate& Remark,
29 CWordCollection* pWords,
30 CStemCollection* pStems,
31 CPrefixCollection* pPrefixes,
32 CSuffixCollection* pSuffixes )
35 if( pWords == NULL ) { pWords = m_pWords; };
36 if( pStems == NULL ) { pStems = m_pStems; }
37 if( pPrefixes == NULL ) { pPrefixes = m_pPrefixes; }
38 if( pSuffixes == NULL ) { pSuffixes = m_pSuffixes; }
40 RebuildAffixesStemsAndSignaturesFromWordSplits( Remark );
/// Rebuild the affix, stem and (via FindAllSignatures) signature
/// collections from scratch, using only the stem/affix split already
/// recorded on each word.  `Remark` supplies the confidence string for
/// stems that were absent from the previous stem collection.
/// NOTE(review): blank and brace-only lines appear to have been lost
/// when this listing was extracted, so several block boundaries below
/// are implicit -- verify against the original file before editing.
43 void CMiniLexicon::RebuildAffixesStemsAndSignaturesFromWordSplits(
44 CStringSurrogate& Remark)
46 int HowManySuffixes = 0, HowManyPrefixes = 0;
47 CStemCollection* OldStems;
48 CStringSurrogate ssPrefix, ssStem, ssSuffix;
49 CStem* pWord = NULL;
50 CSuffix* pSuffix = NULL;
51 CStem* pStem;
52 CStem* qStem;
53 CPrefix* pPrefix = NULL;
54 CSuffix* pNullSuffix = NULL;
55 CPrefix* pNullPrefix = NULL;
57 CLexicon& lex = *m_pLexicon;
58 linguistica::ui::status_user_agent& status = lex.status_display();
60 LogFileSmallTitle("Rebuilding all signature structures");
61 status.details = "Rebuilding all signature structures.";
62 status.progress.clear();
// Stem-/word-initial affix locations mean we are analyzing prefixes;
// every other location means suffixes.
64 bool analyzingSuffixes = true;
65 if (m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
66 analyzingSuffixes = false;
// Start the relevant affix collection over, seeded with the NULL affix.
67 if (analyzingSuffixes) {
68 m_pSuffixes->RemoveAll();
69 pNullSuffix = *m_pSuffixes << TheStringNULL;
70 } else {
71 m_pPrefixes->RemoveAll();
72 pNullPrefix = *m_pPrefixes << TheStringNULL;
// Keep the old stems so their confidence strings can be copied to the
// rebuilt stems; OldStems is deleted at the end of this function.
74 OldStems = m_pStems;
75 m_pStems = new CStemCollection(this);
76 status.details = "Rebuilding all signature structures: words.";
77 status.progress.set_denominator(m_pWords->GetCount());
// Word loop: re-derive the affix and stem entries from each word's
// recorded split.
78 for (int wordno = 0; wordno < (int)m_pWords->GetCount(); wordno++) {
79 pWord = m_pWords->GetAt(wordno);
80 if (!pWord)
81 continue;
// Signatures are recomputed later (FindAllSignatures), so detach now.
83 pWord->SetSuffixSignature(NULL);
84 pWord->SetPrefixSignature(NULL);
86 status.progress = wordno;
88 // commented out to avoid seg fault
89 if (
90 #if 0
91 pWord->GetWordType() == CStem::BIWORD_COMPOUND ||
92 pWord->GetWordType() == CStem::MULTIPLE_COMPOUND ||
93 #endif
94 pWord->GetWordType() == CStem::NUMBER)
95 continue;
97 // rebuild stems, part 1.
99 ssStem = pWord->GetStem();
// A word with no recorded stem is reset to "wholly unanalyzed".
101 if (ssStem.GetLength() == 0) {
102 pWord->SetStemLoc(1);
103 pWord->SetSuffixLoc(0);
104 pWord->SetPrefixLoc(0);
105 continue;
108 // rebuild affixes.
110 if (analyzingSuffixes) {
111 ssSuffix = pWord->GetSuffix();
112 if (ssSuffix.GetLength() < 1)
113 continue;
114 pSuffix = *m_pSuffixes << ssSuffix;
115 if (!pSuffix)
116 continue;
// Presumably the << insertion already counts one occurrence, hence
// the "- 1" here -- TODO confirm against CSuffixCollection.
117 pSuffix->IncrementCorpusCount(pWord->GetCorpusCount() - 1);
118 pSuffix->IncrementUseCount();
119 pWord->SetSuffixPtr(pSuffix);
120 HowManySuffixes++;
121 } else {
122 ssPrefix = pWord->GetPrefix();
// NOTE(review): unlike the suffix branch, pPrefix is not NULL-checked
// after insertion -- confirm << cannot fail here.
123 if (ssPrefix.GetLength() < 1)
124 continue;
125 pPrefix = *m_pPrefixes << ssPrefix;
126 pPrefix->IncrementCorpusCount(pWord->GetCorpusCount() - 1);
127 pPrefix->IncrementUseCount();
128 pWord->SetPrefixPtr(pPrefix);
129 HowManyPrefixes++;
132 // attach affix to stem.
134 if (analyzingSuffixes) {
135 pStem = *m_pStems << ssStem ;
136 if (!pStem)
137 continue;
138 pWord->AttachWordAndSuffixalStem(pStem);
139 pSuffix = *m_pSuffixes ^= pWord->GetSuffix();
140 if (!pSuffix)
141 continue;
142 pSuffix->AddStem(pStem);
143 } else {
144 pStem = *m_pStems << ssStem;
145 pWord->AttachWordAndPrefixalStem(pStem);
146 pPrefix = *m_pPrefixes ^= pWord->GetPrefix();
147 pPrefix->AddStem(pStem);
// Prefer the confidence recorded on the old stem entry, if there was
// one; otherwise fall back to the caller-supplied Remark.
150 pStem->SetConfidence(pWord->GetConfidence());
151 qStem = *OldStems ^= pStem->GetKey();
152 if (qStem)
153 pStem->SetConfidence(qStem->GetConfidence());
154 else
155 pStem->SetConfidence(Remark.Display());
157 Q_ASSERT(pWord->IsValid());
159 if (analyzingSuffixes)
160 pStem->AddSuffix(pSuffix);
161 else
162 pStem->AddPrefix(pPrefix);
163 LogFileEndRow();
164 } // loop over wordno
165 status.progress.clear();
166 LogFileEndTable();
167 LogFile1SimpleString("End of word loop");
169 // in what follows,
170 // if a stem is also a word, then we add NULL
171 // to the stem's factorization.
172 // Stems Loop.
173 LogFileSmallTitle("Stem Loop");
174 LogFileStartTable();
175 status.details = "Rebuilding all signature structures:stems.";
176 status.progress.set_denominator(m_pStems->GetCount());
177 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
178 pStem = m_pStems->GetAt(stemno);
179 status.progress = stemno;
180 LogFile(pStem->Display());
181 if (analyzingSuffixes) {
182 if (pStem->GetNumberOfSuffixes() < 1)
183 continue;
184 } else {
185 if (pStem->GetNumberOfPrefixes() < 1)
186 continue;
// If the stem itself occurs as a free-standing word, credit the NULL
// affix with that word's counts.
188 pWord = *m_pWords ^= pStem->GetKey();
189 if (pWord) {
190 if (analyzingSuffixes) {
191 pStem->AddNULLSuffix();
192 pStem->AddWord(pWord);
193 pNullSuffix->IncrementUseCount();
194 pNullSuffix->IncrementCorpusCount(pWord->GetCorpusCount());
195 pStem->IncrementWordCount();
196 pNullSuffix->AddStem(pStem);
197 } else {
198 pStem->AddNULLPrefix();
199 pStem->AddWord(pWord);
200 pNullPrefix->IncrementUseCount();
201 pNullPrefix->IncrementCorpusCount(pWord->GetCorpusCount());
202 pStem->IncrementWordCount();
203 pNullPrefix->AddStem(pStem);
206 pStem->IncrementCorpusCount(pWord->GetCorpusCount());
// NOTE(review): the suffix list is alphabetized even on the prefix
// path -- confirm this is intended rather than GetPrefixList().
208 pStem->GetSuffixList()->Alphabetize();
209 LogFile(pStem->Display());
210 } // end of stem loop
211 LogFileEndTable();
212 status.details.clear();
213 delete OldStems;
214 FindAllSignatures();
217 // Find All Signatures (from each stem's affix list)
219 /* This scans the stems, forming an entry in this collection
220 * for each class signature (the concatenation of the
221 * suffixes taken by a stem).
222 * It also makes a CPtrList for each signature,
223 * and a pointer to the signature for each stem. */
/// Rebuild m_pSignatures from the per-stem affix lists: one signature
/// per distinct affix combination, with back-pointers installed on
/// every stem and word.  Remarks are carried over from the previous
/// signature collection when the same signature existed before.
/// NOTE(review): std::auto_ptr is deprecated; blank/brace-only lines
/// were lost in extraction, so some block boundaries are implicit.
225 void CMiniLexicon::FindAllSignatures()
227 CLexicon& lex = *m_pLexicon;
228 linguistica::ui::status_user_agent& status = lex.status_display();
230 const int MinimumSignatureLength = m_pLexicon->GetIntParameter(
231 "Main\\MinimumSignatureLength", 1);
232 const bool analyzingSuffixes = !is_initial(m_AffixLocation);
234 status.details = "Find all signatures.";
// Old collection is kept alive (owned by auto_ptr) so remarks can be
// copied; it is destroyed when this function returns.
236 std::auto_ptr<CSignatureCollection> OldSignatures(m_pSignatures);
237 m_pSignatures = analyzingSuffixes ?
238 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
239 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation);
// Detach every word from the (now stale) old signatures.
241 for (int wordno = 0; wordno < m_pWords->GetCount(); wordno++)
243 m_pWords->GetAt(wordno)->SetPrefixSignature(NULL);
244 m_pWords->GetAt(wordno)->SetSuffixSignature(NULL);
247 // For each stem: generate signature from prefix/suffix list
248 const int num_stems = m_pStems->GetCount();
249 bool NewlyCreatedSignature = FALSE;
250 status.progress.clear();
251 status.progress.set_denominator(num_stems - 1);
252 for (int stemno = num_stems - 1; stemno >= 0 ; --stemno) {
253 CStem& stem = *m_pStems->GetAt(stemno);
254 status.progress = num_stems-1 - stemno;
256 CParse& affixes = analyzingSuffixes ?
257 *stem.GetSuffixList() :
258 *stem.GetPrefixList();
260 QList<CSuffix*> suffixes; // used if analyzingSuffixes
261 QList<CPrefix*> prefixes; // used if !analyzingSuffixes
263 // Look up prefixes/suffixes for the new signature’s
264 // prefix/suffix pointer list
265 for (int j = 1; j <= affixes.Size(); ++j) {
266 CStringSurrogate affix_text = affixes.GetPiece(j);
268 analyzingSuffixes ? suffixes.append(*m_pSuffixes ^= affix_text):
269 prefixes.append(*m_pPrefixes ^= affix_text);
272 // If not enough affixes, clear this stem’s signature.
273 // Otherwise, record the new signature.
275 CSignature* new_sig;
// NOTE(review): an `else` almost certainly preceded the lookup below
// in the original file (the line was dropped in extraction); as shown,
// the assignment would overwrite the `new_sig = 0` case.
276 if (affixes.Size() < MinimumSignatureLength) new_sig = 0;
278 new_sig = *m_pSignatures ^= &affixes;
279 if ( new_sig )
281 NewlyCreatedSignature = FALSE;
283 else {
284 new_sig = *m_pSignatures << &affixes;
285 NewlyCreatedSignature = TRUE;
// Install signature back-pointers on the stem and all of its words.
287 if (analyzingSuffixes) {
288 if (new_sig != 0)
289 new_sig->AttachToSuffixSig(&stem);
290 stem.SetSuffixSignature(new_sig);
291 foreach (CStem* word, *stem.GetWordPtrList())
292 word->SetSuffixSignature(new_sig);
293 } else {
294 if (new_sig != 0)
295 new_sig->AttachToPrefixSig(&stem);
296 stem.SetPrefixSignature(new_sig);
297 foreach (CStem* word, *stem.GetWordPtrList())
298 word->SetPrefixSignature(new_sig);
300 if (NewlyCreatedSignature)
302 // Record signature origin.
303 if (new_sig != 0)
// Carry the remark over from the previous collection when this
// signature already existed there.
305 if (CSignature* old_sig = *OldSignatures ^= new_sig)
306 new_sig->SetRemark(old_sig->GetRemark());
307 else
308 new_sig->SetRemark(
309 QString("From known stem and %1")
310 .arg(analyzingSuffixes ?
311 "suffix" : "prefix"));
315 } // loop over stemno
316 status.progress.clear();
// Refresh stem/word pointer lists inside every signature.
318 for (int signo = 0; signo < m_pSignatures->GetCount(); signo++)
320 m_pSignatures ->GetAt(signo)->RecalculateStemAndWordPointers();
323 status.details.clear();
326 // From stems, find affixes (suffixes or prefixes, per affix location)
/// From each sufficiently long stem whose signature is robust enough,
/// propose new affixes by scanning the word trie for continuations of
/// that stem, filter the proposals by frequency, then cut unanalyzed
/// words that consist of a known stem plus an accepted new affix.
/// Finishes by rebuilding stems/affixes/signatures and recording the
/// operation in the DL history.
/// NOTE(review): std::auto_ptr is deprecated; blank/brace-only lines
/// were lost in extraction, so some block boundaries are implicit.
327 void CMiniLexicon::FromStemsFindAffixes()
329 CStem* pStem;
330 CAffix* pAffix;
331 CAffix* qAffix;
332 CStem* pWord;
333 CSignature* pSig;
334 int StemLength;
335 int wordno = 0;
336 CStringSurrogate ssStem, ssAffix, ssWord, ssTemp;
337 CSuffixCollection TempSuffixes;
338 CPrefixCollection TempPrefixes;
339 CParse Word;
// NOTE(review): threshold is declared double but fetched with
// GetIntParameter -- fractional configured values would be truncated.
340 const double RobustnessThreshold = m_pLexicon->GetIntParameter(
341 "FromStemsFindSuffixes\\RobustnessThreshold", 10);
342 const int MinimumNumberOfOccurrences = m_pLexicon->GetIntParameter(
343 "FromStemsFindSuffixes\\MinimumNumberOfOccurrences", 3);
344 const int MinimumStemLength = m_pLexicon->GetIntParameter(
345 "Main\\MinimumStemLength", 3);
346 bool analyzingSuffixes = true;
348 if (m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
349 analyzingSuffixes = false;
// Only one of these two collections is actually allocated, matching
// the affix side being analyzed.
351 std::auto_ptr<CSuffixCollection> TempSuffixesFinal;
352 std::auto_ptr<CPrefixCollection> TempPrefixesFinal;
353 if (analyzingSuffixes)
354 TempSuffixesFinal = std::auto_ptr<CSuffixCollection>(
355 new CSuffixCollection(this));
356 else
357 TempPrefixesFinal = std::auto_ptr<CPrefixCollection>(
358 new CPrefixCollection(this));
360 if (m_pStems->GetCount() == 0)
361 return;
// The cursor-style scans below rely on sorted order.
363 m_pStems->Sort(KEY);
364 m_pWords->Sort(KEY);
366 m_pSignatures->CheckRobustness(); // TODO: check this for prefix compatibility
368 CLexicon& lex = *m_pLexicon;
369 linguistica::ui::status_user_agent& status = lex.status_display();
371 status.major_operation = QString("Mini-Lexicon %1: From stems, find %2")
372 .arg(m_Index + 1).arg(analyzingSuffixes ?
373 QString("suffixes") : QString("prefixes"));
375 status.details = "From known stems, find new affixes";
376 status.progress.clear();
377 status.progress.set_denominator(m_pStems->GetCount());
378 LogFileLargeTitle("Phase: From known stems, find new affixes");
379 LogFileHeader("Stems", "Robustness", "Affix");
// Pass 1: collect candidate affixes (into TempSuffixes/TempPrefixes)
// from the continuations of each robust stem.
380 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
381 status.progress = stemno;
383 pStem = m_pStems->GetAtSort(stemno);
384 ssStem = pStem->GetKey();
385 LogFile(pStem->Display());
387 StemLength = ssStem.GetLength();
388 if ((int) StemLength < MinimumStemLength)
389 continue;
391 // Here is where to test to see
392 // if the stem's signature is really robust.
393 if (analyzingSuffixes)
394 pSig = pStem->GetSuffixSignature();
395 else
396 pSig = pStem->GetPrefixSignature();
// NOTE(review): pSig is dereferenced without a NULL check -- confirm
// every stem reaching this point carries a signature.
397 LogFile(pSig->GetRobustness());
398 if (pSig->GetRobustness() < RobustnessThreshold)
399 continue;
401 // New formulation:
402 CParse StemContinuations;
403 if (analyzingSuffixes) {
404 m_pWords->GetTrie()->CreateSuffixSignatures(
405 pStem->GetKey(), &StemContinuations);
406 } else {
// Prefixes are found by scanning the reverse trie with the stem
// reversed; the results are un-reversed again afterwards.
407 ssTemp = pStem->GetKey();
408 ssTemp.SetBackwards(true);
409 m_pWords->GetReverseTrie()->CreateSuffixSignatures(
410 &ssTemp, &StemContinuations);
411 ssTemp.SetBackwards(false);
// NOTE(review): the checks below read m_pWords->GetAtSort(wordno), but
// wordno is not positioned to the word found via the ^= lookup above
// (pWord) -- this looks like it inspects an unrelated word; verify.
414 for (int suffixno = 1; suffixno < (int) StemContinuations.Size(); suffixno++) {
415 ssAffix = StemContinuations.GetPiece(suffixno);
416 if (analyzingSuffixes) {
417 Word = ssStem + ssAffix;
418 pWord = *m_pWords ^= Word;
419 if (m_pWords->GetAtSort(wordno)->GetSuffixLoc() > 0)
420 continue;
421 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
422 wordno++;
423 continue;
424 // or use WordType
// Skip affixes that are already in the permanent collection.
426 if (*m_pSuffixes ^= ssAffix)
427 continue;
428 pAffix = TempSuffixes << ssAffix;
429 LogFile("", "", ssAffix.Display());
430 } else {
431 ssAffix.SetBackwards(true);
432 Word = ssAffix + ssStem;
433 pWord = *m_pWords ^= Word;
434 if (m_pWords->GetAtSort(wordno)->GetPrefixLoc() > 0)
435 continue;
436 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
437 wordno++;
438 continue;
439 // or use WordType
441 if (*m_pPrefixes ^= ssAffix)
442 continue;
443 pAffix = TempPrefixes << ssAffix;
444 ssAffix.SetBackwards(false);
446 } // end of suffixno loop
447 } // end of stemno loop
448 status.progress.clear();
449 status.details.clear();
451 LogFileEndTable();
// Pass 2: keep only candidates that occurred often enough, copying
// them into the "Final" collection.
453 status.details = "Testing proposed affixes";
454 status.progress.clear();
455 if (analyzingSuffixes) {
456 LogFileSmallTitle("Testing proposed affixes");
457 LogFileHeader("Affix", "Count");
458 status.progress.set_denominator(TempSuffixes.GetCount());
459 for (int suffixno = 0; suffixno < (int)TempSuffixes.GetCount(); suffixno++) {
460 status.progress = suffixno;
461 pAffix = TempSuffixes.GetAt(suffixno);
462 LogFile(pAffix->Display(), pAffix->GetUseCount());
463 if (pAffix->GetUseCount() >= MinimumNumberOfOccurrences) {
464 qAffix = *TempSuffixesFinal << pAffix->Display();
465 if (qAffix)
466 LogFile("Yes", pAffix->Display());
467 else
468 LogFile("No", pAffix->Display());
471 LogFileEndTable();
472 } else {
473 status.progress.set_denominator(TempPrefixes.GetCount());
474 for (int prefixno = 0; prefixno < (int)TempPrefixes.GetCount(); prefixno++) {
475 status.progress = prefixno;
476 pAffix = TempPrefixes.GetAt(prefixno);
477 if (pAffix->GetUseCount() >= MinimumNumberOfOccurrences) {
478 qAffix = *TempPrefixesFinal << pAffix->GetKey();
479 if (qAffix)
480 LogFile("Yes", pAffix->Display());
481 else
482 LogFile ("No", pAffix->Display());
486 status.progress.clear();
487 status.details.clear();
489 // Now we'll look to see how often these acceptable suffixes are
490 // used on real stems
491 LogFileSmallTitle("How acceptable are these suffixes?")
492 LogFileHeader("Stems", "Direction", "Word");
493 status.details = "Cutting words using stems and proposed affixes";
494 status.progress.clear();
495 // GetDocument()->CountDownOnStatusBar(-1);
// Pass 3: cut each unanalyzed word that is a known stem plus an
// accepted new affix.  wordno is a cursor into the sorted word list,
// advanced/retreated so it tracks the current stem.
496 if (analyzingSuffixes) {
497 wordno=0;
498 status.progress.set_denominator(m_pStems->GetCount());
499 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
500 status.progress = stemno;
501 pStem = m_pStems->GetAtSort(stemno);
502 ssStem = pStem->GetKey();
503 StemLength = ssStem.GetLength();
505 if ((int)StemLength < MinimumStemLength)
506 continue;
507 LogFile(ssStem.Display());
509 // Make sure the Words-ptr
510 // is above Stem in the Word list.
512 // Going up...
513 while (wordno > 0 &&
514 wordno < (int)m_pWords->GetCount() &&
515 m_pWords->GetAtSort(wordno)->GetKey().Display() >= ssStem.Display())
516 wordno--;
518 // Bring the Words-ptr down to Stem.
519 while (wordno < (int)(m_pWords->GetCount() - 1) &&
520 m_pWords->GetAtSort(wordno)->GetKey().Display() < ssStem.Display())
521 wordno++;
523 // Considering all words beginning with ssStem.
524 while (wordno < (int)m_pWords->GetCount() &&
525 m_pWords->GetAtSort(wordno)->GetKey().Left(StemLength) == ssStem) {
526 LogFile("", m_pWords->GetAtSort(wordno)->Display());
527 if (m_pWords->GetAtSort(wordno)->GetSuffixLoc() > 0) {
528 wordno++;
529 LogFile1SimpleString(" already has suffix ");
530 LogFileEndRow();
531 continue;
532 } //the word is already analyzed
534 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
535 wordno++;
536 LogFile(" is a compound");
537 continue;
// The candidate suffix is whatever follows the stem in this word.
540 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
541 ssAffix = ssWord.Right(ssWord.GetLength() - StemLength);
542 LogFile("Suffix (a)", ssAffix.Display());
543 if ((pAffix = (*TempSuffixesFinal ^= ssAffix))) {
544 LogFile("already exists suffix ", ssAffix.Display());
545 pWord = m_pWords->GetAtSort(wordno);
546 pWord->CutRightBeforeHere(StemLength);
547 pWord->AppendToConfidence("somewhat uncertain.");
548 pWord->SetStemLoc(1);
549 pWord->SetSuffixLoc(2);
550 pAffix->IncrementUseCount();
551 LogFile(pWord->Display());
552 m_pLexicon->UpdateWord(pWord);
553 } else
554 LogFile("Suffix not found.");
556 wordno++;
558 } // end of loop through stems collection
559 LogFileEndTable();
560 } else { // Prefixes!
// Same cursor scan, but over the reverse-sorted word list so that
// words sharing a final stem are adjacent.
561 wordno=0;
562 m_pWords->Sort(REVERSE_KEY);
563 status.progress.set_denominator(m_pStems->GetCount());
564 for (int stemno = 0; stemno < (int)m_pStems->GetCount(); stemno++) {
565 status.progress = stemno;
566 pStem = m_pStems->GetAtSort(stemno);
567 ssStem = pStem->GetKey();
568 StemLength = ssStem.GetLength();
569 if ((int)StemLength < MinimumStemLength)
570 continue;
572 // Make sure the Words-ptr is above Stem in the Word list.
573 // This is rarely executed.
574 ssStem.SetBackwards(true);
575 while (wordno > 0 && wordno < (int)m_pWords->GetCount()) {
576 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
577 ssWord.SetBackwards(true);
578 if (ssWord.Display() >= ssStem.Display())
579 break;
580 wordno--;
583 // bring the Words-ptr down to Stem.
584 while (wordno < (int)(m_pWords->GetCount() - 1)) {
585 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
586 ssWord.SetBackwards(true);
587 if (ssWord.Display() < ssStem.Display())
588 break;
589 wordno++;
592 ssWord.SetBackwards(false);
593 ssStem.SetBackwards(false);
595 while (wordno < (int)m_pWords->GetCount() &&
596 m_pWords->GetAtSort(wordno)->GetKey().Right(StemLength) == ssStem.Display()) {
597 if (m_pWords->GetAtSort(wordno)->GetPrefixLoc() > 0) {
598 wordno++;
599 continue;
601 if (m_pWords->GetAtSort(wordno)->GetStem2Loc() > 0) {
602 wordno++;
603 continue;
604 } // or use WordType
// The candidate prefix is whatever precedes the stem in this word.
605 ssWord = m_pWords->GetAtSort(wordno)->GetKey();
606 ssAffix = ssWord.Left(ssWord.GetLength() - StemLength);
607 if ((pAffix = (*TempPrefixesFinal ^= ssAffix))) {
608 pWord = m_pWords->GetAtSort(wordno);
609 pWord->CutRightBeforeHere(ssWord.GetLength() - StemLength);
610 pWord->AppendToConfidence("somewhat uncertain.");
611 pWord->SetStemLoc(2);
612 pWord->SetPrefixLoc(1);
613 pAffix->IncrementUseCount();
614 LogFile("Cut : ", pWord->Display());
615 m_pLexicon->UpdateWord(pWord);
617 wordno++;
619 } // end of stemno loop
621 status.progress.clear();
622 status.details.clear();
// Rebuild everything from the new splits and report completion.
624 if (analyzingSuffixes) {
625 QString Remark("From stems find suffixes");
626 CStringSurrogate ssRemark(Remark);
628 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
629 // XXX. not an operation.
630 status.major_operation = "Mini-Lexicon " + QString("%1").arg( m_Index+1 ) + ": End -- from stems, find suffixes";
631 } else { // Prefixes!
633 QString Remark("From stems find prefixes");
634 CStringSurrogate ssRemark(Remark);
635 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
636 status.major_operation = "Mini-Lexicon " + QString("%1").arg( m_Index+1 ) + ": End -- from stems, find prefixes";
639 QString mini_name("Mini-Lexicon %1");
640 mini_name = mini_name.arg(GetIndex() + 1);
641 QString remark = "From stems: find affixes";
642 GetDLHistory()->append(mini_name, remark, this);
/// For every stem, scan the contiguous run of sorted words that start
/// (resp. end) with the stem's text; any unanalyzed word in that run
/// whose remainder is an already-known affix is cut into stem + affix.
/// Rebuilds stems/affixes/signatures afterwards and records the step
/// in the DL history.
/// NOTE(review): blank and brace-only lines were lost in extraction,
/// so some block boundaries below are implicit.
645 void CMiniLexicon::ExtendKnownStemsToKnownAffixes()
647 CLexicon& lex = *m_pLexicon;
648 linguistica::ui::status_user_agent& status = lex.status_display();
650 // We will be iterating over stems and words, so sort them.
651 m_pStems->Sort(KEY);
652 m_pWords->Sort(KEY);
654 // Give the user some text to stare at
655 status.progress.clear();
656 status.major_operation =
657 QString("Mini-Lexicon %1: Extend known stems and suffixes")
658 .arg(m_Index + 1);
660 const bool analyzingSuffixes = !is_initial(m_AffixLocation);
661 LogFileLargeTitle("Phase: Extending analysis \n based on known stems and known affixes.");
662 LogFileStartTable();
663 if (analyzingSuffixes) {LogFileHeader("Stem", "Suffix");}
664 else {LogFileHeader("Prefix", "Stem");}
667 // For each stem:
668 const int TopCount = m_pStems->GetCount();
669 status.details = "Going through the stems ";
670 status.progress.set_denominator(TopCount);
671 for (int t = 0; t < TopCount; ++t) {
672 CStem& stem = *m_pStems->GetAtSort(t);
674 status.progress = t;
// Use the stem's first affix to locate one word built on this stem,
// which anchors the sorted-index search below.
676 CStringSurrogate affix_text = analyzingSuffixes ?
677 stem.GetSuffixList()->GetPiece(1) :
678 stem.GetPrefixList()->GetPiece(1);
679 const int stem_length = stem.GetKeyLength();
680 const CStringSurrogate stem_text = stem.GetKey();
682 // Write "" instead of "NULL".
683 affix_text.ConvertNULL();
685 const CParse word = analyzingSuffixes ?
686 stem_text + affix_text :
687 affix_text + stem_text;
689 int SortIndex = 0;
690 CStem* found_word = *m_pWords ^= word;
691 if (found_word != 0)
692 SortIndex = found_word->GetSortIndex();
694 // First word starting with (resp ending with) stem text
695 int TopIndex = SortIndex;
696 if (analyzingSuffixes) {
697 while (TopIndex >= 1 &&
698 m_pWords->GetAtSort(TopIndex - 1)
699 ->GetKey().Left(stem_length) == stem_text)
700 --TopIndex;
701 } else {
702 while (TopIndex >= 1 &&
703 m_pWords->GetAtSort(TopIndex - 1)
704 ->GetKey().Right(stem_length) == stem_text)
705 --TopIndex;
708 // Last word starting with (resp ending with) stem text
709 int BottomIndex = SortIndex;
710 if (analyzingSuffixes) {
711 while (BottomIndex+1 < m_pWords->GetCount() &&
712 m_pWords->GetAtSort(BottomIndex+1)
713 ->GetKey().Left(stem_length) == stem_text)
714 ++BottomIndex;
715 } else {
716 while (BottomIndex+1 < m_pWords->GetCount() &&
717 m_pWords->GetAtSort(BottomIndex+1)
718 ->GetKey().Right(stem_length) == stem_text)
719 ++BottomIndex;
722 // For each word starting with (resp ending with) stem text:
723 for (int w = TopIndex; w <= BottomIndex; ++w) {
724 CStem& word2 = *m_pWords->GetAtSort(w);
726 // Ignore already analyzed words
727 const enum CStem::type word2_type = word2.GetWordType();
728 const int affix_loc = analyzingSuffixes ?
729 word2.GetSuffixLoc() :
730 word2.GetPrefixLoc();
731 if (affix_loc != 0 ||
732 word2_type == CStem::BIWORD_COMPOUND ||
733 word2_type == CStem::MULTIPLE_COMPOUND ||
734 word2_type == CStem::POSSIBLE_COMPOUND)
735 continue;
737 // Investigate possible affix
738 const CStringSurrogate word2_text = word2.GetKey();
739 const int affixlen = word2_text.GetLength() - stem_length;
740 const CStringSurrogate PossibleAffix =
741 analyzingSuffixes ?
742 word2_text.Right(affixlen) :
743 word2_text.Left(affixlen);
744 CAffix* found_affix;
745 analyzingSuffixes ?
746 found_affix = *m_pSuffixes ^= PossibleAffix:
747 found_affix = *m_pPrefixes ^= PossibleAffix;
748 if (found_affix == 0)
749 continue;
751 // Affix present. Rewrite word2 as stem + affix.
752 // (Do not update counts yet.)
753 if (analyzingSuffixes) {
754 word2.CutRightBeforeHere(stem_length);
755 word2.SetStemLoc(1);
756 word2.SetSuffixLoc(2);
757 stem.AddSuffix(static_cast<CSuffix*>(found_affix));
758 found_affix->AddStem(&stem);
759 } else {
760 word2.CutRightBeforeHere(affixlen);
761 word2.SetStemLoc(2);
762 word2.SetPrefixLoc(1);
763 stem.AddPrefix(static_cast<CPrefix*>(found_affix));
764 found_affix->AddStem(&stem);
766 if (analyzingSuffixes) { LogFile(stem.Display(), found_affix->Display());}
767 else { LogFile(found_affix->Display(), stem.Display());}
768 m_pLexicon->UpdateWord(&word2);
771 status.progress.clear();
772 status.details.clear();
773 LogFileEndTable();
775 // Redo Signatures.
776 QString Remark("From known stems and suffixes");
777 CStringSurrogate ssRemark(Remark);
779 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark);
781 // XXX. not an operation
782 status.major_operation = QString("Mini-Lexicon %1: "
783 "End of using known stems and %2")
784 .arg(m_Index + 1)
785 .arg(analyzingSuffixes ? "suffixes" : "prefixes");
787 GetDLHistory()->append(
788 QString("Mini-Lexicon %1").arg(m_Index + 1),
789 QString("Known stems and affixes"),
790 this);
/* At the moment, this function decides too early whether to
 * allow a stem to be used.  It ought to build up the TempSignatures,
 * and after all stems have been tried, decide which way to split
 * a stem is the best. */
/* Take a word W.
 * Consider all splits for W such that its suffix is already in Suffixes.
 * Put the stem in TempStems, and attach the suffix to the stem.
 * Look at all words that start like that stem;
 * attach the suffixes found in those words to that stem.
 * Done?
 * Now, if there are more than 2 suffixes on that stem now,
 * put the stem's signature in a TempSignature collection.
 * Find cost = cost(Sig).
 * If cost is negative, go ahead and make all those splits.
 * End of loop. */
/// "Loose fit": propose new stems by splitting unanalyzed words at any
/// known affix, collect the resulting candidate signatures, then cut
/// words -- first by pre-existing signatures (most robust first), then
/// by novel signatures in least-cost-first order while the MDL cost
/// stays negative.  Ends by rebuilding all collections and recording
/// the step in the DL history.
/// NOTE(review): std::auto_ptr is deprecated; blank/brace-only lines
/// were lost in extraction, so some block boundaries are implicit.
812 void CMiniLexicon::LooseFit()
814 // Cost of a sig =
815 // Sum over all of its stems :
816 // log ( CorpusSize / Stem-count ) ( cost )
817 // length ( stem ) * cost of a letter ( savings )
819 // Sum over all of its suffixes:
820 // log ( CorpusSize / suffix-count ) ( cost )
821 // length ( suffix ) * cost of a letter ( savings )
822 using std::auto_ptr;
823 using std::vector;
825 CLexicon& lex = *m_pLexicon;
826 linguistica::ui::status_user_agent& status = lex.status_display();
827 const bool analyzingSuffixes = !is_initial(m_AffixLocation);
828 const int MinimumAffixLength = m_pLexicon->GetIntParameter(
829 analyzingSuffixes ?
830 "Main\\MinimumSuffixLength" :
831 "Main\\MinimumPrefixLength", 2);
832 const int MaximumAffixLength = m_pLexicon->GetIntParameter(
833 analyzingSuffixes ?
834 "Main\\MaximumSuffixLength" :
835 "Main\\MaximumPrefixLength", 7);
836 int MinimumStemLength = m_pLexicon->GetIntParameter(
837 "Main\\MinimumStemLength", 3);
838 if (MinimumStemLength < 1)
839 // do not allow empty stems
840 // XXX. notify user that value was ignored
841 MinimumStemLength = 1;
843 LogFileLargeTitle("Phase: Loose fit of known signatures");
844 LogFileStartTable();
// Three working signature collections: all proposals, those that
// already exist in the mini-lexicon, and the genuinely novel ones.
846 auto_ptr<CSignatureCollection> proposed_sigs(analyzingSuffixes ?
847 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
848 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
849 auto_ptr<CSignatureCollection> existing_sigs(analyzingSuffixes ?
850 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
851 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
852 auto_ptr<CSignatureCollection> novel_sigs(analyzingSuffixes ?
853 new CSignatureCollection(this, m_pSuffixes, m_AffixLocation) :
854 new CSignatureCollection(this, m_pPrefixes, m_AffixLocation));
// NOTE(review): nothing in the visible code appends to
// ProposedSignatures before it is iterated below -- confirm whether
// population code was lost in extraction or the list is dead.
856 QList<CSignature*> ProposedSignatures;
857 QList<CSignature*> ExistingSignatures;
858 QList<CSignature*> NovelSignatures;
860 if (m_pWords->GetCount() == 0)
861 // Need to read corpus first.
862 return;
864 // We will be iterating over words, so sort them.
865 m_pWords->Sort(KEY);
867 // Give the user something to look at.
868 status.progress.clear();
869 status.major_operation =
870 QString("Mini-Lexicon %1: Loose fit.").arg(m_Index+1);
872 // For each word: look for known prefixes/suffixes and consider them.
873 // Fill proposed_sigs with candidates.
874 CStemCollection potential_stems;
875 const int num_words = m_pWords->GetCount();
876 status.progress.set_denominator(num_words);
877 for (int wordno = 0; wordno < num_words; ++wordno) {
878 CStem& word = *m_pWords->GetAtSort(wordno);
879 status.progress = wordno;
881 if (word.Size() > 1)
882 // already analyzed
883 continue;
885 if (word.GetKeyLength() < 2)
886 // too small to analyze
887 continue;
889 CStringSurrogate word_text = word.GetKey();
891 // For each textual prefix/suffix of word: if known, consider.
892 for (int i = MaximumAffixLength; i >= MinimumAffixLength; --i) {
893 Q_ASSERT(MinimumStemLength >= 1);
894 if (word_text.GetLength() - i < MinimumStemLength)
895 // corresponding stem is too short
896 continue;
898 CStringSurrogate affix_text = analyzingSuffixes ?
899 word_text.Right(i) : word_text.Left(i);
900 CAffix* affix = analyzingSuffixes ?
901 implicit_cast<CAffix*>(*m_pSuffixes ^= affix_text) :
902 implicit_cast<CAffix*>(*m_pPrefixes ^= affix_text);
904 if (affix == 0)
905 // suffix not known
906 continue;
908 Q_ASSERT(word_text.GetLength() > i);
909 CStringSurrogate stem_text = analyzingSuffixes ?
910 word_text.Left(word_text.GetLength()-i) :
911 word_text.Right(word_text.GetLength()-i);
913 if (*m_pStems ^= stem_text)
914 // stem already known
915 continue;
917 if (potential_stems ^= stem_text)
918 // stem already under consideration
919 continue;
921 LogFile(stem_text.Display());
922 CStem* stem = potential_stems << stem_text;
924 // build signature from remaining words
925 CParse empirical_sig;
926 if (analyzingSuffixes) {
927 m_pWords->CreateSuffixSignatures(
928 &stem_text, &empirical_sig);
929 } else {
// The reverse trie treats prefixes as suffixes of reversed words, so
// build it on demand and un-reverse the results afterwards.
930 if (!m_pWords->GetReverseTrie())
931 m_pWords->CreateReverseTrie();
933 // build list of reversed prefixes
934 CParse backwards_sig;
935 stem_text.SetBackwards();
936 m_pWords->GetReverseTrie()->CreateSuffixSignatures(
937 &stem_text, &backwards_sig);
938 stem_text.SetBackwards(false);
940 // reverse them to get actual prefixes
941 for (int n = 1; n <= backwards_sig.Size(); ++n) {
942 CStringSurrogate prefix =
943 backwards_sig.GetPiece(n);
945 prefix.SetBackwards();
946 empirical_sig.Append(prefix);
947 prefix.SetBackwards(false);
951 // for each prefix/suffix that appears with stem:
952 // throw it out if it's obviously bad, but
953 // otherwise record it in *stem.
954 for (int n = 1; n <= empirical_sig.Size(); ++n) {
955 CStringSurrogate affix2 = empirical_sig.GetPiece(n);
957 // word this affix was taken from
958 CParse word2_text = analyzingSuffixes ?
959 stem_text + affix2 :
960 affix2 + stem_text;
961 CStem* word2 = *m_pWords ^= word2_text;
962 Q_ASSERT(word2 != 0);
964 if (word2->Size() > 1)
965 // previously analyzed
966 continue;
968 // If affix is of reasonable size or is
969 // known from elsewhere, add it to stem's list.
970 const int affix2_len = affix2.GetLength();
// NOTE(review): with ||, reasonable_size is true for every length
// (anything >= Min or <= Max) -- && was presumably intended; confirm
// before changing, since it widens which affixes are accepted.
971 const bool reasonable_size =
972 affix2_len >= MinimumAffixLength ||
973 affix2_len <= MaximumAffixLength;
975 if (reasonable_size || (analyzingSuffixes ?
976 (*m_pSuffixes ^= affix2) != 0:
977 (*m_pPrefixes ^= affix2) != 0)) {
978 if (analyzingSuffixes)
979 stem->AddSuffix(affix2);
980 else
981 stem->AddPrefix(affix2);
984 // special case: stem appears with null affix
985 if (CStem* word2 = *m_pWords ^= stem_text) {
986 if (word2->Size() > 1) {
987 // previously analyzed
988 } else {
989 if (analyzingSuffixes)
990 stem->AddNULLSuffix();
991 else
992 stem->AddNULLPrefix();
996 // fetch recorded affixes
997 CParse& affixes = analyzingSuffixes ?
998 *stem->GetSuffixList() :
999 *stem->GetPrefixList();
1001 // At least the affix that prompted this search
1002 // should be among them.
1003 Q_ASSERT(affixes.Contains(affix_text));
1004 Q_ASSERT(affixes.Size() > 0);
1006 // build new signature
1007 CSignature* new_sig = *proposed_sigs << &affixes;
1009 if (analyzingSuffixes)
1010 new_sig->AttachToSuffixSig(stem, false);
1011 else
1012 new_sig->AttachToPrefixSig(stem, false);
1013 new_sig->SetAffixLocation(m_AffixLocation);
1014 LogFile1SimpleString (new_sig->Display());
1015 } // for each textual prefix/suffix
1017 LogFile(""); // Blank line between stem-families
1018 } // end of wordno loop
1019 status.progress.clear();
1021 // Now we have, or might have, a set of candidate splittings,
1022 // so let's compare them.
1023 // If any of the corresponding signatures are already real,
1024 // then we take the stem corresponding to the sig with the
1025 // highest value for Robustness.
1026 // Otherwise we use the Cost function for signatures.
1027 LogFileSmallTitle("Proposed signatures", "Signature", "status");
1030 // First find the set of existing signatures (won't be many).
1032 for (int signo = 0; signo < ProposedSignatures.count(); signo++)
1034 CSignature* sig = ProposedSignatures.at(signo);
1036 if (sig->Size() < 2 && sig->GetKeyLength() < 4) {// total length < 4! Way too small.
1037 LogFile("too small: ", sig->Display() );
1038 continue;
1041 // known signature?
1042 if (*GetSignatures() ^= sig) ExistingSignatures.append(sig);
1043 else NovelSignatures.append(sig);
1045 LogFileEndTable();
1048 // Sort existing signatures by robustness
1049 const int size = existing_sigs->GetCount();
1051 LogFileStartTable(); LogFileHeader("Pre-existing signature", "Robustnes" );
1054 // calculate each signature’s robustness
1055 vector<double> Robustness;
1056 Robustness.reserve(size);
1057 for (int signo = 0; signo < size; ++signo) {
1058 CSignature* sig = (*existing_sigs)[signo];
1060 Robustness.push_back(sig->GetRobustness());
1061 LogFile(sig->Display(), Robustness[signo]);
1064 LogFileHeader("Signature", "Stem");
1066 // For each existing signature (sorted by robustness):
1067 // analyze (cut) unanalyzed words according to this signature.
1068 vector<int> shuffle(size);
1069 SortVector(&shuffle[0], &Robustness[0], size);
1070 for (int j = 0; j < size; ++j) {
1071 CSignature* sig = (*existing_sigs)[shuffle[j]];
1072 LogFile(sig->Display());
1075 // Analyze (cut) unanalyzed words according to signature.
1076 sig->CutMyWordsAsIDeclare();
1079 LogFileEndTable();
1083 // Consider novel signatures.
1084 const int size = novel_sigs->GetCount();
1085 LogFileSmallTitle ("Novel signatures: "); LogFileStartTable();
1088 // for each signature: calculate cost function
1089 vector<double> Cost;
1090 Cost.reserve(size);
1091 for (int i = 0; i < size; ++i)
1092 Cost.push_back((*novel_sigs)[i]->FindCost(this));
1094 // for each novel signature, in least-cost-first order:
1095 // Analyze (cut) unanalyzed words according to this signature.
1096 // Stop when the cost becomes positive.
1097 vector<int> shuffle(size);
1098 SortVector(&shuffle[0], &Cost[0], size);
1100 for (int signo = size-1; signo >= 0 && Cost[shuffle[signo]] < 0; --signo) {
1101 CSignature* sig = (*novel_sigs)[shuffle[signo]];
1102 LogFile(sig->Display(), Cost[shuffle[signo]]);
1103 // Analyze (cut) unanalyzed words according to signature.
1104 sig->CutMyWordsAsIDeclare();
1106 } // end of signo loop
1107 // XXX. report cost of first rejected signature to log
1109 // what had followed has been eliminated, and replaced by:
1110 CStringSurrogate remark(QString("Loose fit"));
1111 RebuildAffixesStemsAndSignaturesFromWordSplits (remark);
1113 // XXX. not an operation
1114 status.major_operation = QString("Mini-Lexicon %1: End of loose fit.")
1115 .arg(m_Index+1);
1116 GetDLHistory()->append(
1117 QString("Mini-Lexicon %1").arg(m_Index+1),
1118 QString("Loose Fit"),
1119 this);
1122 int CMiniLexicon::GetNumberOfStems()
1124 return m_pStems->GetCount();