1 // Suffix/signature-based discovery of morphology: core
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
10 #include "DLHistory.h"
11 #include "Signature.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "CompareFunc.h"
21 #include "StringFunc.h"
23 #include "AffixLocation.h"
24 #include "implicit_cast.h"
25 using linguistica::implicit_cast
;
27 /// Writes to status.details instead of status.major_operation.
28 void CMiniLexicon::TakeSplitWords_ProduceStemsAndSigs( CStringSurrogate
& Remark
,
29 CWordCollection
* pWords
,
30 CStemCollection
* pStems
,
31 CPrefixCollection
* pPrefixes
,
32 CSuffixCollection
* pSuffixes
)
35 if( pWords
== NULL
) { pWords
= m_pWords
; };
36 if( pStems
== NULL
) { pStems
= m_pStems
; }
37 if( pPrefixes
== NULL
) { pPrefixes
= m_pPrefixes
; }
38 if( pSuffixes
== NULL
) { pSuffixes
= m_pSuffixes
; }
40 RebuildAffixesStemsAndSignaturesFromWordSplits( Remark
);
43 void CMiniLexicon::RebuildAffixesStemsAndSignaturesFromWordSplits(
44 CStringSurrogate
& Remark
)
46 int HowManySuffixes
= 0, HowManyPrefixes
= 0;
47 CStemCollection
* OldStems
;
48 CStringSurrogate ssPrefix
, ssStem
, ssSuffix
;
50 CSuffix
* pSuffix
= NULL
;
53 CPrefix
* pPrefix
= NULL
;
54 CSuffix
* pNullSuffix
= NULL
;
55 CPrefix
* pNullPrefix
= NULL
;
57 CLexicon
& lex
= *m_pLexicon
;
58 linguistica::ui::status_user_agent
& status
= lex
.status_display();
60 LogFileSmallTitle("Rebuilding all signature structures");
61 status
.details
= "Rebuilding all signature structures.";
62 status
.progress
.clear();
64 bool analyzingSuffixes
= true;
65 if (m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
66 analyzingSuffixes
= false;
67 if (analyzingSuffixes
) {
68 m_pSuffixes
->RemoveAll();
69 pNullSuffix
= *m_pSuffixes
<< TheStringNULL
;
71 m_pPrefixes
->RemoveAll();
72 pNullPrefix
= *m_pPrefixes
<< TheStringNULL
;
75 m_pStems
= new CStemCollection(this);
76 status
.details
= "Rebuilding all signature structures: words.";
77 status
.progress
.set_denominator(m_pWords
->GetCount());
78 for (int wordno
= 0; wordno
< (int)m_pWords
->GetCount(); wordno
++) {
79 pWord
= m_pWords
->GetAt(wordno
);
83 pWord
->SetSuffixSignature(NULL
);
84 pWord
->SetPrefixSignature(NULL
);
86 status
.progress
= wordno
;
88 // commented out to avoid seg fault
91 pWord
->GetWordType() == CStem::BIWORD_COMPOUND
||
92 pWord
->GetWordType() == CStem::MULTIPLE_COMPOUND
||
94 pWord
->GetWordType() == CStem::NUMBER
)
97 // rebuild stems, part 1.
99 ssStem
= pWord
->GetStem();
101 if (ssStem
.GetLength() == 0) {
102 pWord
->SetStemLoc(1);
103 pWord
->SetSuffixLoc(0);
104 pWord
->SetPrefixLoc(0);
110 if (analyzingSuffixes
) {
111 ssSuffix
= pWord
->GetSuffix();
112 if (ssSuffix
.GetLength() < 1)
114 pSuffix
= *m_pSuffixes
<< ssSuffix
;
117 pSuffix
->IncrementCorpusCount(pWord
->GetCorpusCount() - 1);
118 pSuffix
->IncrementUseCount();
119 pWord
->SetSuffixPtr(pSuffix
);
122 ssPrefix
= pWord
->GetPrefix();
123 if (ssPrefix
.GetLength() < 1)
125 pPrefix
= *m_pPrefixes
<< ssPrefix
;
126 pPrefix
->IncrementCorpusCount(pWord
->GetCorpusCount() - 1);
127 pPrefix
->IncrementUseCount();
128 pWord
->SetPrefixPtr(pPrefix
);
132 // attach affix to stem.
134 if (analyzingSuffixes
) {
135 pStem
= *m_pStems
<< ssStem
;
138 pWord
->AttachWordAndSuffixalStem(pStem
);
139 pSuffix
= *m_pSuffixes
^= pWord
->GetSuffix();
142 pSuffix
->AddStem(pStem
);
144 pStem
= *m_pStems
<< ssStem
;
145 pWord
->AttachWordAndPrefixalStem(pStem
);
146 pPrefix
= *m_pPrefixes
^= pWord
->GetPrefix();
147 pPrefix
->AddStem(pStem
);
150 pStem
->SetConfidence(pWord
->GetConfidence());
151 qStem
= *OldStems
^= pStem
->GetKey();
153 pStem
->SetConfidence(qStem
->GetConfidence());
155 pStem
->SetConfidence(Remark
.Display());
157 Q_ASSERT(pWord
->IsValid());
159 if (analyzingSuffixes
)
160 pStem
->AddSuffix(pSuffix
);
162 pStem
->AddPrefix(pPrefix
);
164 } // loop over wordno
165 status
.progress
.clear();
167 LogFile1SimpleString("End of word loop");
170 // if a stem is also a word, then we add NULL
171 // to the stem's factorization.
173 LogFileSmallTitle("Stem Loop");
175 status
.details
= "Rebuilding all signature structures:stems.";
176 status
.progress
.set_denominator(m_pStems
->GetCount());
177 for (int stemno
= 0; stemno
< (int)m_pStems
->GetCount(); stemno
++) {
178 pStem
= m_pStems
->GetAt(stemno
);
179 status
.progress
= stemno
;
180 LogFile(pStem
->Display());
181 if (analyzingSuffixes
) {
182 if (pStem
->GetNumberOfSuffixes() < 1)
185 if (pStem
->GetNumberOfPrefixes() < 1)
188 pWord
= *m_pWords
^= pStem
->GetKey();
190 if (analyzingSuffixes
) {
191 pStem
->AddNULLSuffix();
192 pStem
->AddWord(pWord
);
193 pNullSuffix
->IncrementUseCount();
194 pNullSuffix
->IncrementCorpusCount(pWord
->GetCorpusCount());
195 pStem
->IncrementWordCount();
196 pNullSuffix
->AddStem(pStem
);
198 pStem
->AddNULLPrefix();
199 pStem
->AddWord(pWord
);
200 pNullPrefix
->IncrementUseCount();
201 pNullPrefix
->IncrementCorpusCount(pWord
->GetCorpusCount());
202 pStem
->IncrementWordCount();
203 pNullPrefix
->AddStem(pStem
);
206 pStem
->IncrementCorpusCount(pWord
->GetCorpusCount());
208 pStem
->GetSuffixList()->Alphabetize();
209 LogFile(pStem
->Display());
210 } // end of stem loop
212 status
.details
.clear();
217 // Find All Signatures (from each Stem signatures)
219 /* This scans the Stems, forming entries in This
220 * for each Class signature (the concatenation of
221 * suffixes taken by a stem
222 * It also makes a CPtrList for each signature
223 * It also makes a pointer to the Sig for each Stem. <<<<
225 void CMiniLexicon::FindAllSignatures()
227 CLexicon
& lex
= *m_pLexicon
;
228 linguistica::ui::status_user_agent
& status
= lex
.status_display();
230 const int MinimumSignatureLength
= m_pLexicon
->GetIntParameter(
231 "Main\\MinimumSignatureLength", 1);
232 const bool analyzingSuffixes
= !is_initial(m_AffixLocation
);
234 status
.details
= "Find all signatures.";
236 std::auto_ptr
<CSignatureCollection
> OldSignatures(m_pSignatures
);
237 m_pSignatures
= analyzingSuffixes
?
238 new CSignatureCollection(this, m_pSuffixes
, m_AffixLocation
) :
239 new CSignatureCollection(this, m_pPrefixes
, m_AffixLocation
);
241 for (int wordno
= 0; wordno
< m_pWords
->GetCount(); wordno
++)
243 m_pWords
->GetAt(wordno
)->SetPrefixSignature(NULL
);
244 m_pWords
->GetAt(wordno
)->SetSuffixSignature(NULL
);
247 // For each stem: generate signature from prefix/suffix list
248 const int num_stems
= m_pStems
->GetCount();
249 bool NewlyCreatedSignature
= FALSE
;
250 status
.progress
.clear();
251 status
.progress
.set_denominator(num_stems
- 1);
252 for (int stemno
= num_stems
- 1; stemno
>= 0 ; --stemno
) {
253 CStem
& stem
= *m_pStems
->GetAt(stemno
);
254 status
.progress
= num_stems
-1 - stemno
;
256 CParse
& affixes
= analyzingSuffixes
?
257 *stem
.GetSuffixList() :
258 *stem
.GetPrefixList();
260 QList
<CSuffix
*> suffixes
; // used if analyzingPrefixes
261 QList
<CPrefix
*> prefixes
; // used if !analyzingPrefixes
263 // Look up prefixes/suffixes for the new signature’s
264 // prefix/suffix pointer list
265 for (int j
= 1; j
<= affixes
.Size(); ++j
) {
266 CStringSurrogate affix_text
= affixes
.GetPiece(j
);
268 analyzingSuffixes
? suffixes
.append(*m_pSuffixes
^= affix_text
):
269 prefixes
.append(*m_pPrefixes
^= affix_text
);
272 // If not enough affixes, clear this stem’s signature.
273 // Otherwise, record the new signature.
276 if (affixes
.Size() < MinimumSignatureLength
) new_sig
= 0;
278 new_sig
= *m_pSignatures
^= &affixes
;
281 NewlyCreatedSignature
= FALSE
;
284 new_sig
= *m_pSignatures
<< &affixes
;
285 NewlyCreatedSignature
= TRUE
;
287 if (analyzingSuffixes
) {
289 new_sig
->AttachToSuffixSig(&stem
);
290 stem
.SetSuffixSignature(new_sig
);
291 foreach (CStem
* word
, *stem
.GetWordPtrList())
292 word
->SetSuffixSignature(new_sig
);
295 new_sig
->AttachToPrefixSig(&stem
);
296 stem
.SetPrefixSignature(new_sig
);
297 foreach (CStem
* word
, *stem
.GetWordPtrList())
298 word
->SetPrefixSignature(new_sig
);
300 if (NewlyCreatedSignature
)
302 // Record signature origin.
305 if (CSignature
* old_sig
= *OldSignatures
^= new_sig
)
306 new_sig
->SetRemark(old_sig
->GetRemark());
309 QString("From known stem and %1")
310 .arg(analyzingSuffixes
?
311 "suffix" : "prefix"));
315 } // loop over stemno
316 status
.progress
.clear();
318 for (int signo
= 0; signo
< m_pSignatures
->GetCount(); signo
++)
320 m_pSignatures
->GetAt(signo
)->RecalculateStemAndWordPointers();
323 status
.details
.clear();
326 // From Stems Find Suffixes
327 void CMiniLexicon::FromStemsFindAffixes()
336 CStringSurrogate ssStem
, ssAffix
, ssWord
, ssTemp
;
337 CSuffixCollection TempSuffixes
;
338 CPrefixCollection TempPrefixes
;
340 const double RobustnessThreshold
= m_pLexicon
->GetIntParameter(
341 "FromStemsFindSuffixes\\RobustnessThreshold", 10);
342 const int MinimumNumberOfOccurrences
= m_pLexicon
->GetIntParameter(
343 "FromStemsFindSuffixes\\MinimumNumberOfOccurrences", 3);
344 const int MinimumStemLength
= m_pLexicon
->GetIntParameter(
345 "Main\\MinimumStemLength", 3);
346 bool analyzingSuffixes
= true;
348 if (m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
349 analyzingSuffixes
= false;
351 std::auto_ptr
<CSuffixCollection
> TempSuffixesFinal
;
352 std::auto_ptr
<CPrefixCollection
> TempPrefixesFinal
;
353 if (analyzingSuffixes
)
354 TempSuffixesFinal
= std::auto_ptr
<CSuffixCollection
>(
355 new CSuffixCollection(this));
357 TempPrefixesFinal
= std::auto_ptr
<CPrefixCollection
>(
358 new CPrefixCollection(this));
360 if (m_pStems
->GetCount() == 0)
366 m_pSignatures
->CheckRobustness(); // TODO: check this for prefix compatibility
368 CLexicon
& lex
= *m_pLexicon
;
369 linguistica::ui::status_user_agent
& status
= lex
.status_display();
371 status
.major_operation
= QString("Mini-Lexicon %1: From stems, find %2")
372 .arg(m_Index
+ 1).arg(analyzingSuffixes
?
373 QString("suffixes") : QString("prefixes"));
375 status
.details
= "From known stems, find new affixes";
376 status
.progress
.clear();
377 status
.progress
.set_denominator(m_pStems
->GetCount());
378 LogFileLargeTitle("Phase: From known stems, find new affixes");
379 LogFileHeader("Stems", "Robustness", "Affix");
380 for (int stemno
= 0; stemno
< (int)m_pStems
->GetCount(); stemno
++) {
381 status
.progress
= stemno
;
383 pStem
= m_pStems
->GetAtSort(stemno
);
384 ssStem
= pStem
->GetKey();
385 LogFile(pStem
->Display());
387 StemLength
= ssStem
.GetLength();
388 if ((int) StemLength
< MinimumStemLength
)
391 // Here is where to test to see
392 // if the stem's signature is really robust.
393 if (analyzingSuffixes
)
394 pSig
= pStem
->GetSuffixSignature();
396 pSig
= pStem
->GetPrefixSignature();
397 LogFile(pSig
->GetRobustness());
398 if (pSig
->GetRobustness() < RobustnessThreshold
)
402 CParse StemContinuations
;
403 if (analyzingSuffixes
) {
404 m_pWords
->GetTrie()->CreateSuffixSignatures(
405 pStem
->GetKey(), &StemContinuations
);
407 ssTemp
= pStem
->GetKey();
408 ssTemp
.SetBackwards(true);
409 m_pWords
->GetReverseTrie()->CreateSuffixSignatures(
410 &ssTemp
, &StemContinuations
);
411 ssTemp
.SetBackwards(false);
414 for (int suffixno
= 1; suffixno
< (int) StemContinuations
.Size(); suffixno
++) {
415 ssAffix
= StemContinuations
.GetPiece(suffixno
);
416 if (analyzingSuffixes
) {
417 Word
= ssStem
+ ssAffix
;
418 pWord
= *m_pWords
^= Word
;
419 if (m_pWords
->GetAtSort(wordno
)->GetSuffixLoc() > 0)
421 if (m_pWords
->GetAtSort(wordno
)->GetStem2Loc() > 0) {
426 if (*m_pSuffixes
^= ssAffix
)
428 pAffix
= TempSuffixes
<< ssAffix
;
429 LogFile("", "", ssAffix
.Display());
431 ssAffix
.SetBackwards(true);
432 Word
= ssAffix
+ ssStem
;
433 pWord
= *m_pWords
^= Word
;
434 if (m_pWords
->GetAtSort(wordno
)->GetPrefixLoc() > 0)
436 if (m_pWords
->GetAtSort(wordno
)->GetStem2Loc() > 0) {
441 if (*m_pPrefixes
^= ssAffix
)
443 pAffix
= TempPrefixes
<< ssAffix
;
444 ssAffix
.SetBackwards(false);
446 } // end of suffixno loop
447 } // end of stemno loop
448 status
.progress
.clear();
449 status
.details
.clear();
453 status
.details
= "Testing proposed affixes";
454 status
.progress
.clear();
455 if (analyzingSuffixes
) {
456 LogFileSmallTitle("Testing proposed affixes");
457 LogFileHeader("Affix", "Count");
458 status
.progress
.set_denominator(TempSuffixes
.GetCount());
459 for (int suffixno
= 0; suffixno
< (int)TempSuffixes
.GetCount(); suffixno
++) {
460 status
.progress
= suffixno
;
461 pAffix
= TempSuffixes
.GetAt(suffixno
);
462 LogFile(pAffix
->Display(), pAffix
->GetUseCount());
463 if (pAffix
->GetUseCount() >= MinimumNumberOfOccurrences
) {
464 qAffix
= *TempSuffixesFinal
<< pAffix
->Display();
466 LogFile("Yes", pAffix
->Display());
468 LogFile("No", pAffix
->Display());
473 status
.progress
.set_denominator(TempPrefixes
.GetCount());
474 for (int prefixno
= 0; prefixno
< (int)TempPrefixes
.GetCount(); prefixno
++) {
475 status
.progress
= prefixno
;
476 pAffix
= TempPrefixes
.GetAt(prefixno
);
477 if (pAffix
->GetUseCount() >= MinimumNumberOfOccurrences
) {
478 qAffix
= *TempPrefixesFinal
<< pAffix
->GetKey();
480 LogFile("Yes", pAffix
->Display());
482 LogFile ("No", pAffix
->Display());
486 status
.progress
.clear();
487 status
.details
.clear();
489 // Now we'll look to see how often these acceptable suffixes are
490 // used on real stems
491 LogFileSmallTitle("How acceptable are these suffixes?");
492 LogFileHeader("Stems", "Direction", "Word");
493 status
.details
= "Cutting words using stems and proposed affixes";
494 status
.progress
.clear();
495 // GetDocument()->CountDownOnStatusBar(-1);
496 if (analyzingSuffixes
) {
498 status
.progress
.set_denominator(m_pStems
->GetCount());
499 for (int stemno
= 0; stemno
< (int)m_pStems
->GetCount(); stemno
++) {
500 status
.progress
= stemno
;
501 pStem
= m_pStems
->GetAtSort(stemno
);
502 ssStem
= pStem
->GetKey();
503 StemLength
= ssStem
.GetLength();
505 if ((int)StemLength
< MinimumStemLength
)
507 LogFile(ssStem
.Display());
509 // Make sure the Words-ptr
510 // is above Stem in the Word list.
514 wordno
< (int)m_pWords
->GetCount() &&
515 m_pWords
->GetAtSort(wordno
)->GetKey().Display() >= ssStem
.Display())
518 // Bring the Words-ptr down to Stem.
519 while (wordno
< (int)(m_pWords
->GetCount() - 1) &&
520 m_pWords
->GetAtSort(wordno
)->GetKey().Display() < ssStem
.Display())
523 // Considering all words beginning with ssStem.
524 while (wordno
< (int)m_pWords
->GetCount() &&
525 m_pWords
->GetAtSort(wordno
)->GetKey().Left(StemLength
) == ssStem
) {
526 LogFile("", m_pWords
->GetAtSort(wordno
)->Display());
527 if (m_pWords
->GetAtSort(wordno
)->GetSuffixLoc() > 0) {
529 LogFile1SimpleString(" already has suffix ");
532 } //the word is already analyzed
534 if (m_pWords
->GetAtSort(wordno
)->GetStem2Loc() > 0) {
536 LogFile(" is a compound");
540 ssWord
= m_pWords
->GetAtSort(wordno
)->GetKey();
541 ssAffix
= ssWord
.Right(ssWord
.GetLength() - StemLength
);
542 LogFile("Suffix (a)", ssAffix
.Display());
543 if ((pAffix
= (*TempSuffixesFinal
^= ssAffix
))) {
544 LogFile("already exists suffix ", ssAffix
.Display());
545 pWord
= m_pWords
->GetAtSort(wordno
);
546 pWord
->CutRightBeforeHere(StemLength
);
547 pWord
->AppendToConfidence("somewhat uncertain.");
548 pWord
->SetStemLoc(1);
549 pWord
->SetSuffixLoc(2);
550 pAffix
->IncrementUseCount();
551 LogFile(pWord
->Display());
552 m_pLexicon
->UpdateWord(pWord
);
554 LogFile("Suffix not found.");
558 } // end of loop through stems collection
560 } else { // Prefixes!
562 m_pWords
->Sort(REVERSE_KEY
);
563 status
.progress
.set_denominator(m_pStems
->GetCount());
564 for (int stemno
= 0; stemno
< (int)m_pStems
->GetCount(); stemno
++) {
565 status
.progress
= stemno
;
566 pStem
= m_pStems
->GetAtSort(stemno
);
567 ssStem
= pStem
->GetKey();
568 StemLength
= ssStem
.GetLength();
569 if ((int)StemLength
< MinimumStemLength
)
572 // Make sure the Words-ptr is above Stem in the Word list.
573 // This is rarely executed.
574 ssStem
.SetBackwards(true);
575 while (wordno
> 0 && wordno
< (int)m_pWords
->GetCount()) {
576 ssWord
= m_pWords
->GetAtSort(wordno
)->GetKey();
577 ssWord
.SetBackwards(true);
578 if (ssWord
.Display() >= ssStem
.Display())
583 // bring the Words-ptr down to Stem.
584 while (wordno
< (int)(m_pWords
->GetCount() - 1)) {
585 ssWord
= m_pWords
->GetAtSort(wordno
)->GetKey();
586 ssWord
.SetBackwards(true);
587 if (ssWord
.Display() < ssStem
.Display())
592 ssWord
.SetBackwards(false);
593 ssStem
.SetBackwards(false);
595 while (wordno
< (int)m_pWords
->GetCount() &&
596 m_pWords
->GetAtSort(wordno
)->GetKey().Right(StemLength
) == ssStem
.Display()) {
597 if (m_pWords
->GetAtSort(wordno
)->GetPrefixLoc() > 0) {
601 if (m_pWords
->GetAtSort(wordno
)->GetStem2Loc() > 0) {
605 ssWord
= m_pWords
->GetAtSort(wordno
)->GetKey();
606 ssAffix
= ssWord
.Left(ssWord
.GetLength() - StemLength
);
607 if ((pAffix
= (*TempPrefixesFinal
^= ssAffix
))) {
608 pWord
= m_pWords
->GetAtSort(wordno
);
609 pWord
->CutRightBeforeHere(ssWord
.GetLength() - StemLength
);
610 pWord
->AppendToConfidence("somewhat uncertain.");
611 pWord
->SetStemLoc(2);
612 pWord
->SetPrefixLoc(1);
613 pAffix
->IncrementUseCount();
614 LogFile("Cut : ", pWord
->Display());
615 m_pLexicon
->UpdateWord(pWord
);
619 } // end of stemno loop
621 status
.progress
.clear();
622 status
.details
.clear();
624 if (analyzingSuffixes
) {
625 QString
Remark("From stems find suffixes");
626 CStringSurrogate
ssRemark(Remark
);
628 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark
);
629 // XXX. not an operation.
630 status
.major_operation
= "Mini-Lexicon " + QString("%1").arg( m_Index
+1 ) + ": End -- from stems, find suffixes";
631 } else { // Prefixes!
633 QString
Remark("From stems find prefixes");
634 CStringSurrogate
ssRemark(Remark
);
635 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark
);
636 status
.major_operation
= "Mini-Lexicon " + QString("%1").arg( m_Index
+1 ) + ": End -- from stems, find prefixes";
639 QString
mini_name("Mini-Lexicon %1");
640 mini_name
= mini_name
.arg(GetIndex() + 1);
641 QString remark
= "From stems: find affixes";
642 GetDLHistory()->append(mini_name
, remark
, this);
645 void CMiniLexicon::ExtendKnownStemsToKnownAffixes()
647 CLexicon
& lex
= *m_pLexicon
;
648 linguistica::ui::status_user_agent
& status
= lex
.status_display();
650 // We will be iterating over stems and words, so sort them.
654 // Give the user some text to stare at
655 status
.progress
.clear();
656 status
.major_operation
=
657 QString("Mini-Lexicon %1: Extend known stems and suffixes")
660 const bool analyzingSuffixes
= !is_initial(m_AffixLocation
);
661 LogFileLargeTitle("Phase: Extending analysis \n based on known stems and known affixes.");
663 if (analyzingSuffixes
) {LogFileHeader("Stem", "Suffix");}
664 else {LogFileHeader("Prefix", "Stem");}
668 const int TopCount
= m_pStems
->GetCount();
669 status
.details
= "Going through the stems ";
670 status
.progress
.set_denominator(TopCount
);
671 for (int t
= 0; t
< TopCount
; ++t
) {
672 CStem
& stem
= *m_pStems
->GetAtSort(t
);
676 CStringSurrogate affix_text
= analyzingSuffixes
?
677 stem
.GetSuffixList()->GetPiece(1) :
678 stem
.GetPrefixList()->GetPiece(1);
679 const int stem_length
= stem
.GetKeyLength();
680 const CStringSurrogate stem_text
= stem
.GetKey();
682 // Write "" instead of "NULL".
683 affix_text
.ConvertNULL();
685 const CParse word
= analyzingSuffixes
?
686 stem_text
+ affix_text
:
687 affix_text
+ stem_text
;
690 CStem
* found_word
= *m_pWords
^= word
;
692 SortIndex
= found_word
->GetSortIndex();
694 // First word starting with (resp ending with) stem text
695 int TopIndex
= SortIndex
;
696 if (analyzingSuffixes
) {
697 while (TopIndex
>= 1 &&
698 m_pWords
->GetAtSort(TopIndex
- 1)
699 ->GetKey().Left(stem_length
) == stem_text
)
702 while (TopIndex
>= 1 &&
703 m_pWords
->GetAtSort(TopIndex
- 1)
704 ->GetKey().Right(stem_length
) == stem_text
)
708 // Last word starting with (resp ending with) stem text
709 int BottomIndex
= SortIndex
;
710 if (analyzingSuffixes
) {
711 while (BottomIndex
+1 < m_pWords
->GetCount() &&
712 m_pWords
->GetAtSort(BottomIndex
+1)
713 ->GetKey().Left(stem_length
) == stem_text
)
716 while (BottomIndex
+1 < m_pWords
->GetCount() &&
717 m_pWords
->GetAtSort(BottomIndex
+1)
718 ->GetKey().Right(stem_length
) == stem_text
)
722 // For each word starting with (resp ending with) stem text:
723 for (int w
= TopIndex
; w
<= BottomIndex
; ++w
) {
724 CStem
& word2
= *m_pWords
->GetAtSort(w
);
726 // Ignore already analyzed words
727 const enum CStem::type word2_type
= word2
.GetWordType();
728 const int affix_loc
= analyzingSuffixes
?
729 word2
.GetSuffixLoc() :
730 word2
.GetPrefixLoc();
731 if (affix_loc
!= 0 ||
732 word2_type
== CStem::BIWORD_COMPOUND
||
733 word2_type
== CStem::MULTIPLE_COMPOUND
||
734 word2_type
== CStem::POSSIBLE_COMPOUND
)
737 // Investigate possible affix
738 const CStringSurrogate word2_text
= word2
.GetKey();
739 const int affixlen
= word2_text
.GetLength() - stem_length
;
740 const CStringSurrogate PossibleAffix
=
742 word2_text
.Right(affixlen
) :
743 word2_text
.Left(affixlen
);
746 found_affix
= *m_pSuffixes
^= PossibleAffix
:
747 found_affix
= *m_pPrefixes
^= PossibleAffix
;
748 if (found_affix
== 0)
751 // Affix present. Rewrite word2 as stem + affix.
752 // (Do not update counts yet.)
753 if (analyzingSuffixes
) {
754 word2
.CutRightBeforeHere(stem_length
);
756 word2
.SetSuffixLoc(2);
757 stem
.AddSuffix(static_cast<CSuffix
*>(found_affix
));
758 found_affix
->AddStem(&stem
);
760 word2
.CutRightBeforeHere(affixlen
);
762 word2
.SetPrefixLoc(1);
763 stem
.AddPrefix(static_cast<CPrefix
*>(found_affix
));
764 found_affix
->AddStem(&stem
);
766 if (analyzingSuffixes
) { LogFile(stem
.Display(), found_affix
->Display());}
767 else { LogFile(found_affix
->Display(), stem
.Display());}
768 m_pLexicon
->UpdateWord(&word2
);
771 status
.progress
.clear();
772 status
.details
.clear();
776 QString
Remark("From known stems and suffixes");
777 CStringSurrogate
ssRemark(Remark
);
779 RebuildAffixesStemsAndSignaturesFromWordSplits(ssRemark
);
781 // XXX. not an operation
782 status
.major_operation
= QString("Mini-Lexicon %1: "
783 "End of using known stems and %2")
785 .arg(analyzingSuffixes
? "suffixes" : "prefixes");
787 GetDLHistory()->append(
788 QString("Mini-Lexicon %1").arg(m_Index
+ 1),
789 QString("Known stems and affixes"),
794 * At the moment, this function decides too early whether to
795 * allow a stem to be used. It ought to build up the TempSignatures,
796 * and after all stems have been tried, decide which way to split
797 * a stem is the best.
801 * Consider all splits for W such that its suffix is already in Suffixes.
802 * Put the stem in TempStems, and attach the suffix to the stem.
803 * Look at all words that start like that stem;
804 * Attach the suffixes found in those words to that stem.
806 * Now, if there are more than 2 suffixes on that stem now,
807 * put the stem's signature in a TempSignature collection.
808 * Find cost = cost (Sig).
809 * If cost is negative, go ahead and make all those splits.
812 void CMiniLexicon::LooseFit()
815 // Sum over all of its stems :
816 // log ( CorpusSize / Stem-count ) ( cost )
817 // length ( stem ) * cost of a letter ( savings )
819 // Sum over all of its suffixes:
820 // log ( CorpusSize / suffix-count ) ( cost )
821 // length ( suffix ) * cost of a letter ( savings )
825 CLexicon
& lex
= *m_pLexicon
;
826 linguistica::ui::status_user_agent
& status
= lex
.status_display();
827 const bool analyzingSuffixes
= !is_initial(m_AffixLocation
);
828 const int MinimumAffixLength
= m_pLexicon
->GetIntParameter(
830 "Main\\MinimumSuffixLength" :
831 "Main\\MinimumPrefixLength", 2);
832 const int MaximumAffixLength
= m_pLexicon
->GetIntParameter(
834 "Main\\MaximumSuffixLength" :
835 "Main\\MaximumPrefixLength", 7);
836 int MinimumStemLength
= m_pLexicon
->GetIntParameter(
837 "Main\\MinimumStemLength", 3);
838 if (MinimumStemLength
< 1)
839 // do not allow empty stems
840 // XXX. notify user that value was ignored
841 MinimumStemLength
= 1;
843 LogFileLargeTitle("Phase: Loose fit of known signatures");
846 auto_ptr
<CSignatureCollection
> proposed_sigs(analyzingSuffixes
?
847 new CSignatureCollection(this, m_pSuffixes
, m_AffixLocation
) :
848 new CSignatureCollection(this, m_pPrefixes
, m_AffixLocation
));
849 auto_ptr
<CSignatureCollection
> existing_sigs(analyzingSuffixes
?
850 new CSignatureCollection(this, m_pSuffixes
, m_AffixLocation
) :
851 new CSignatureCollection(this, m_pPrefixes
, m_AffixLocation
));
852 auto_ptr
<CSignatureCollection
> novel_sigs(analyzingSuffixes
?
853 new CSignatureCollection(this, m_pSuffixes
, m_AffixLocation
) :
854 new CSignatureCollection(this, m_pPrefixes
, m_AffixLocation
));
856 QList
<CSignature
*> ProposedSignatures
;
857 QList
<CSignature
*> ExistingSignatures
;
858 QList
<CSignature
*> NovelSignatures
;
860 if (m_pWords
->GetCount() == 0)
861 // Need to read corpus first.
864 // We will be iterating over words, so sort them.
867 // Give the user something to look at.
868 status
.progress
.clear();
869 status
.major_operation
=
870 QString("Mini-Lexicon %1: Loose fit.").arg(m_Index
+1);
872 // For each word: look for known prefixes/suffixes and consider them.
873 // Fill proposed_sigs with candidates.
874 CStemCollection potential_stems
;
875 const int num_words
= m_pWords
->GetCount();
876 status
.progress
.set_denominator(num_words
);
877 for (int wordno
= 0; wordno
< num_words
; ++wordno
) {
878 CStem
& word
= *m_pWords
->GetAtSort(wordno
);
879 status
.progress
= wordno
;
885 if (word
.GetKeyLength() < 2)
886 // too small to analyze
889 CStringSurrogate word_text
= word
.GetKey();
891 // For each textual prefix/suffix of word: if known, consider.
892 for (int i
= MaximumAffixLength
; i
>= MinimumAffixLength
; --i
) {
893 Q_ASSERT(MinimumStemLength
>= 1);
894 if (word_text
.GetLength() - i
< MinimumStemLength
)
895 // corresponding stem is too short
898 CStringSurrogate affix_text
= analyzingSuffixes
?
899 word_text
.Right(i
) : word_text
.Left(i
);
900 CAffix
* affix
= analyzingSuffixes
?
901 implicit_cast
<CAffix
*>(*m_pSuffixes
^= affix_text
) :
902 implicit_cast
<CAffix
*>(*m_pPrefixes
^= affix_text
);
908 Q_ASSERT(word_text
.GetLength() > i
);
909 CStringSurrogate stem_text
= analyzingSuffixes
?
910 word_text
.Left(word_text
.GetLength()-i
) :
911 word_text
.Right(word_text
.GetLength()-i
);
913 if (*m_pStems
^= stem_text
)
914 // stem already known
917 if (potential_stems
^= stem_text
)
918 // stem already under consideration
921 LogFile(stem_text
.Display());
922 CStem
* stem
= potential_stems
<< stem_text
;
924 // build signature from remaining words
925 CParse empirical_sig
;
926 if (analyzingSuffixes
) {
927 m_pWords
->CreateSuffixSignatures(
928 &stem_text
, &empirical_sig
);
930 if (!m_pWords
->GetReverseTrie())
931 m_pWords
->CreateReverseTrie();
933 // build list of reversed prefixes
934 CParse backwards_sig
;
935 stem_text
.SetBackwards();
936 m_pWords
->GetReverseTrie()->CreateSuffixSignatures(
937 &stem_text
, &backwards_sig
);
938 stem_text
.SetBackwards(false);
940 // reverse them to get actual prefixes
941 for (int n
= 1; n
<= backwards_sig
.Size(); ++n
) {
942 CStringSurrogate prefix
=
943 backwards_sig
.GetPiece(n
);
945 prefix
.SetBackwards();
946 empirical_sig
.Append(prefix
);
947 prefix
.SetBackwards(false);
951 // for each prefix/suffix that appears with stem:
952 // throw it out if it's obviously bad, but
953 // otherwise record it in *stem.
954 for (int n
= 1; n
<= empirical_sig
.Size(); ++n
) {
955 CStringSurrogate affix2
= empirical_sig
.GetPiece(n
);
957 // word this affix was taken from
958 CParse word2_text
= analyzingSuffixes
?
961 CStem
* word2
= *m_pWords
^= word2_text
;
962 Q_ASSERT(word2
!= 0);
964 if (word2
->Size() > 1)
965 // previously analyzed
968 // If affix is of reasonable size or is
969 // known from elsewhere, add it to stem's list.
970 const int affix2_len
= affix2
.GetLength();
971 const bool reasonable_size
=
972 affix2_len
>= MinimumAffixLength
||
973 affix2_len
<= MaximumAffixLength
;
975 if (reasonable_size
|| (analyzingSuffixes
?
976 (*m_pSuffixes
^= affix2
) != 0:
977 (*m_pPrefixes
^= affix2
) != 0)) {
978 if (analyzingSuffixes
)
979 stem
->AddSuffix(affix2
);
981 stem
->AddPrefix(affix2
);
984 // special case: stem appears with null affix
985 if (CStem
* word2
= *m_pWords
^= stem_text
) {
986 if (word2
->Size() > 1) {
987 // previously analyzed
989 if (analyzingSuffixes
)
990 stem
->AddNULLSuffix();
992 stem
->AddNULLPrefix();
996 // fetch recorded affixes
997 CParse
& affixes
= analyzingSuffixes
?
998 *stem
->GetSuffixList() :
999 *stem
->GetPrefixList();
1001 // At least the affix that prompted this search
1002 // should be among them.
1003 Q_ASSERT(affixes
.Contains(affix_text
));
1004 Q_ASSERT(affixes
.Size() > 0);
1006 // build new signature
1007 CSignature
* new_sig
= *proposed_sigs
<< &affixes
;
1009 if (analyzingSuffixes
)
1010 new_sig
->AttachToSuffixSig(stem
, false);
1012 new_sig
->AttachToPrefixSig(stem
, false);
1013 new_sig
->SetAffixLocation(m_AffixLocation
);
1014 LogFile1SimpleString (new_sig
->Display());
1015 } // for each textual prefix/suffix
1017 LogFile(""); // Blank line between stem-families
1018 } // end of wordno loop
1019 status
.progress
.clear();
1021 // Now we have, or might have, a set of candidate splittings,
1022 // so let's compare them.
1023 // If any of the corresponding signatures are already real,
1024 // then we take the stem corresponding to the sig with the
1025 // highest value for Robustness.
1026 // Otherwise we use the Cost function for signatures.
1027 LogFileSmallTitle("Proposed signatures", "Signature", "status");
1030 // First find the set of existing signatures (won't be many).
1032 for (int signo
= 0; signo
< ProposedSignatures
.count(); signo
++)
1034 CSignature
* sig
= ProposedSignatures
.at(signo
);
1036 if (sig
->Size() < 2 && sig
->GetKeyLength() < 4) {// total length < 4! Way too small.
1037 LogFile("too small: ", sig
->Display() );
1042 if (*GetSignatures() ^= sig
) ExistingSignatures
.append(sig
);
1043 else NovelSignatures
.append(sig
);
1048 // Sort existing signatures by robustness
1049 const int size
= existing_sigs
->GetCount();
1051 LogFileStartTable(); LogFileHeader("Pre-existing signature", "Robustnes" );
1054 // calculate each signature’s robustness
1055 vector
<double> Robustness
;
1056 Robustness
.reserve(size
);
1057 for (int signo
= 0; signo
< size
; ++signo
) {
1058 CSignature
* sig
= (*existing_sigs
)[signo
];
1060 Robustness
.push_back(sig
->GetRobustness());
1061 LogFile(sig
->Display(), Robustness
[signo
]);
1064 LogFileHeader("Signature", "Stem");
1066 // For each existing signature (sorted by robustness):
1067 // analyze (cut) unanalyzed words according to this signature.
1068 vector
<int> shuffle(size
);
1069 SortVector(&shuffle
[0], &Robustness
[0], size
);
1070 for (int j
= 0; j
< size
; ++j
) {
1071 CSignature
* sig
= (*existing_sigs
)[shuffle
[j
]];
1072 LogFile(sig
->Display());
1075 // Analyze (cut) unanalyzed words according to signature.
1076 sig
->CutMyWordsAsIDeclare();
1083 // Consider novel signatures.
1084 const int size
= novel_sigs
->GetCount();
1085 LogFileSmallTitle ("Novel signatures: "); LogFileStartTable();
1088 // for each signature: calculate cost function
1089 vector
<double> Cost
;
1091 for (int i
= 0; i
< size
; ++i
)
1092 Cost
.push_back((*novel_sigs
)[i
]->FindCost(this));
1094 // for each novel signature, in least-cost-first order:
1095 // Analyze (cut) unanalyzed words according to this signature.
1096 // Stop when the cost becomes positive.
1097 vector
<int> shuffle(size
);
1098 SortVector(&shuffle
[0], &Cost
[0], size
);
1100 for (int signo
= size
-1; signo
>= 0 && Cost
[shuffle
[signo
]] < 0; --signo
) {
1101 CSignature
* sig
= (*novel_sigs
)[shuffle
[signo
]];
1102 LogFile(sig
->Display(), Cost
[shuffle
[signo
]]);
1103 // Analyze (cut) unanalyzed words according to signature.
1104 sig
->CutMyWordsAsIDeclare();
1106 } // end of signo loop
1107 // XXX. report cost of first rejected signature to log
1109 // what had followed has been eliminated, and replaced by:
1110 CStringSurrogate
remark(QString("Loose fit"));
1111 RebuildAffixesStemsAndSignaturesFromWordSplits (remark
);
1113 // XXX. not an operation
1114 status
.major_operation
= QString("Mini-Lexicon %1: End of loose fit.")
1116 GetDLHistory()->append(
1117 QString("Mini-Lexicon %1").arg(m_Index
+1),
1118 QString("Loose Fit"),
1122 int CMiniLexicon::GetNumberOfStems()
1124 return m_pStems
->GetCount();