CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Lexicon_Allomorphy.cpp
blobe2c874eb20c763f5f653e9d46b51d7816a5ff003
1 // Some methods for discovering allomorphs of stems
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <memory>
6 #include <QList>
7 #include <QMap>
8 #include "Signature.h"
9 #include "Compound.h"
10 #include "Suffix.h"
11 #include "Stem.h"
12 #include "SignatureCollection.h"
13 #include "CompoundCollection.h"
14 #include "SuffixCollection.h"
15 #include "WordCollection.h"
16 #include "StemCollection.h"
17 #include "CollectionTemplate.h"
18 #include "StringSurrogate.h"
19 #include "StringFunc.h"
20 #include "HTML.h"
21 #include "Typedefs.h"
// String-to-string map; used in this file as the type of the `Remapper`
// parameter of one CreateADeletingSignature() overload.
23 typedef QMap<QString,QString> StringToString;
27 Finding rules: first we find pairs of similar stems, and see HOW they differ.
30 Suppose we find a lot of stems that differ by final -e. Then we look at the smaller stem's
31 signature. If it contains a high incidence of
32 (a) suffix -e;
33 (b) one or two other suffixes F1, F2
34 (c) F1 and F2 "go well" with the signature of the larger stem(s);
36 then:
37 (d) Consider the hypothesis that F1 and F2 are of the form <e>F1 and <e>F2
// Pairs a signature with a letter. RelateStems() allocates these with `new`
// and queues them in SignaturesToFixList; each entry is later handed to
// MoveWordsStemSuffixBoundaryToRight() as (signature, letter).
42 struct SigLetter {
// The signature to be reworked. Stored as a raw pointer; nothing in this
// struct deletes it.
43 CSignature* m_SigPointer;
// The letter (a one-character suffix key in RelateStems()) that is to be
// moved from the suffix side to the stem side.
44 CParse m_Letter;
// Stores the pointer as given and builds m_Letter from the string surrogate.
46 SigLetter(CSignature* Sig, const CStringSurrogate& SS)
47 : m_SigPointer(Sig), m_Letter(SS) { }
50 void CMiniLexicon::RelateStems()
52 const CStringSurrogate CSSofQNULL(TheStringNULL);
53 const int MinimumSuffixCount = 5;
55 LogFileLargeTitle( "Finding allomorphy (Version 2002)");
56 m_pSuffixes->Sort(COUNT);
58 // XXX. consider using boost::optional instead.
59 std::auto_ptr<CParse> pPossibleDeletingSuffixes;
61 // Major loop over deletable pseudo-suffixes
62 QList<SigLetter*> SignaturesToFixList;
63 //SignaturesToFixList.setAutoDelete(true); //fix this -- so it doesn't become a memory leak. @@@
64 CSuffixCollection SuffixesToEliminate;
65 for (int suffixno = 0; suffixno < m_pSuffixes->GetCount(); ++suffixno) {
66 CSuffix* qSuffix = m_pSuffixes->GetAtSort(suffixno);
67 if (qSuffix->GetKeyLength() != 1)
68 continue;
69 if (qSuffix->GetUseCount() < MinimumSuffixCount)
70 continue;
72 /// possible suffix (like 'e' in English)
73 CStringSurrogate ssDiffLetter = qSuffix->GetKey();
74 LogFileSmallTitle(ssDiffLetter.Display());
76 pPossibleDeletingSuffixes = std::auto_ptr<CParse>(new CParse);
78 // Find all suffix candidates that MIGHT delete this DiffLetter
79 // if DiffLetter == 'e', SuffixCandidates includes ing, ity.
80 LogFileHeader("Suffixes that do not follow this suffix", "Count");
82 bool found = false;
83 for (int suffixno2 = 1; suffixno2 < m_pSuffixes->GetCount(); ++suffixno2) {
84 CSuffix* pSuffix = m_pSuffixes->GetAt(suffixno2);
86 if (pSuffix->GetUseCount() < MinimumSuffixCount)
87 continue;
89 CStringSurrogate ssSuffix = pSuffix->GetKey();
90 int TotalStemsWithSuffix;
91 int HowManyEndWithThisLetter;
92 HowManyStemsWithThisSuffixEndInThisLetter(
93 ssSuffix,
94 ssDiffLetter,
95 TotalStemsWithSuffix,
96 HowManyEndWithThisLetter);
98 // XXX. Use “Threshold = 0.05” instead?
100 const int MinimumStemCount = 5;
101 const int MaximumExceptionCount = 5;
103 // if suffix doesn’t begin with DiffLetter
104 // and its count is very small, ignore it.
105 if (TotalStemsWithSuffix <= MinimumStemCount &&
106 ssSuffix.Left(ssDiffLetter.GetLength())
107 != ssDiffLetter)
108 continue;
109 LogFile(ssSuffix.Display(), HowManyEndWithThisLetter);
111 if (HowManyEndWithThisLetter <= MaximumExceptionCount) {
112 pPossibleDeletingSuffixes->Append(ssSuffix);
113 found = true;
117 if (!found) LogFileSmallTitle( "None found");
120 // Simple signatures (2 affixes)
121 // We now have a set of candidates that might delete DiffLetter
122 // We will look at each signature, and see whether it
123 // erroneously has a suffix that should really be thought
124 // of as deleting from the stem rather than as a suffix.
126 // 1. In our first test, we will look for signatures of the
127 // form: Letter.OtherSuffix, where OtherSuffix does not
128 // appear after stems ending in Letter, and where
129 // NULL.OtherSuffix does occur as a signature.
131 LogFileHeader("Suspicious signature","Paired signature" );
133 bool found = false;
134 for (int signo = 0; signo < m_pSignatures->GetCount(); ++signo) {
135 CSignature* pSig = m_pSignatures->GetAt(signo);
137 if (pSig -> Size() != 2)
138 continue;
139 if (!pSig -> Contains(ssDiffLetter))
140 continue;
142 QString OtherSuffix;
143 if (pSig -> GetPiece(1) == ssDiffLetter)
144 OtherSuffix = pSig->GetPiece(2).Display();
145 else
146 OtherSuffix = pSig->GetPiece(1).Display();
148 if (OtherSuffix == TheStringNULL)
149 continue;
150 if (!pPossibleDeletingSuffixes->Contains(OtherSuffix))
151 // OtherSuffix doesn't appear after DiffLetter
152 continue;
153 LogFile(pSig->Display());
154 found = true;
156 CParse TestSig;
157 TestSig.Append(TheStringNULL);
158 TestSig.Append(OtherSuffix);
159 TestSig.Alphabetize();
161 if (*m_pSignatures ^= TestSig) {
162 SuffixesToEliminate << ssDiffLetter;
164 std::auto_ptr<SigLetter> pSigLetter(
165 new SigLetter(pSig, ssDiffLetter));
166 SignaturesToFixList.append(
167 pSigLetter.release());
168 LogFile(TestSig.Display());
169 } else {
170 LogFile(TestSig.Display(), " not found.");
174 if (!found) LogFile("None found.");
175 LogFileEndTable();
178 // NB: We should do the preceding test in a more
179 // general way, that is:
180 // Look for pairs of signatures of the form
181 // A.B and A.xB, where x is the deleting letter;
182 // the previous case is like this and where B is NULL.
183 // Then A deletes x, i.e., A is really <x>A.
185 // 2. In our other test, we look at the signatures
186 // that contain both DiffLetter and one of the deleters,
187 // and see, first of all, if a suffix appears more often
188 // with NULL than with DiffLetter;
189 // or, the suffix actually is of the form aX,
190 // where a = DiffLetter, X occurs with NULL more
191 // than aX occurs with a.
193 if (!(SuffixesToEliminate ^= ssDiffLetter))
194 continue;
196 for (int signo = 0; signo < m_pSignatures->GetCount(); ++signo) {
197 CSignature* pSig = m_pSignatures->GetAt(signo);
198 if (!pSig -> Contains(ssDiffLetter))
199 continue;
201 for (int affixno = 1; affixno <= pSig->Size(); ++affixno) {
202 CStringSurrogate ssSuffix = pSig->GetPiece(affixno);
203 CSuffix* pSuffix = *m_pSuffixes ^= ssSuffix;
205 if (pSuffix->GetDeletees()->Contains(ssDiffLetter))
206 // we've already determined it deletes DiffLetter
207 continue;
209 if (StemsWithBothSuffixes(TheStringNULL, ssSuffix) >
210 StemsWithBothSuffixes(
211 ssDiffLetter, ssSuffix))
212 // probably a deleter
213 pSuffix->AddDeletee(ssDiffLetter);
215 if (ssSuffix.Left(ssDiffLetter.GetLength()) ==
216 ssDiffLetter) {
217 // probably cut wrong, like "ement"
218 CStringSurrogate ssTruncatedForm =
219 ssSuffix.Mid(
220 ssDiffLetter.GetLength());
222 if (StemsWithBothSuffixes(TheStringNULL,
223 ssTruncatedForm) >
224 StemsWithBothSuffixes(
225 ssDiffLetter,
226 ssSuffix))
227 pSuffix->AddDeletee(ssDiffLetter);
232 // Now put onto SignaturesToFixList all of the sigs that
233 // are composed ONLY of deleting suffixes.
234 LogFileHeader("Signatures to fix");
235 for (int signo = 0; signo < m_pSignatures->GetCount(); ++signo) {
236 CSignature* pSig = m_pSignatures->GetAt(signo);
238 if (!pSig->Contains(ssDiffLetter))
239 continue;
241 bool bFoundDeletee = false;
242 bool bFoundDeleter = false;
243 for (int affixno = 1; affixno <= pSig->Size(); ++affixno) {
244 const CStringSurrogate ssSuffix = pSig->GetPiece(affixno);
245 if (ssSuffix == ssDiffLetter) {
246 bFoundDeletee = true;
247 continue;
250 CSuffix* pSuffix = *m_pSuffixes ^= ssSuffix;
251 Q_ASSERT(pSuffix != 0);
252 if (!pSuffix->GetDeletees()
253 ->Contains(ssDiffLetter)) {
254 // break;
255 } else {
256 bFoundDeleter = true;
260 if (bFoundDeletee && bFoundDeleter) {
261 std::auto_ptr<SigLetter> pSigLetter(new SigLetter(
262 pSig, ssDiffLetter));
263 LogFile(pSigLetter->m_SigPointer->Display());
264 SignaturesToFixList.append(pSigLetter.release());
268 LogFileEndTable();
269 } // end of checking this letter to see if it ever deletes.
270 // End of Major loop over deletable pseudo-suffixes
271 LogFileSmallTitle("Now we do the changing for the simple signatures.");
274 foreach (SigLetter* pSigLetter, SignaturesToFixList ) {
275 CSignature* pSig = pSigLetter->m_SigPointer;
276 CStringSurrogate ssDiffLetter = pSigLetter->m_Letter;
278 // XXX. log:
279 // "1. Moving material from suffixes to stems:
280 // ${pSigLetter->m_Letter}, in signature:
281 // ${*pSigLetter->m_SigPointer}
283 // this is where the real work happens
284 // pPossibleDeletingSuffixes =
285 // SuffixesWhichMightDeleteKey.find(ssDiffLetter.Display()));
287 // XXX. pPossibleDeletingSuffixes kept on getting recreated
288 // before, so something is probably awry.
289 if (pPossibleDeletingSuffixes.get())
290 MoveWordsStemSuffixBoundaryToRight(pSig,
291 ssDiffLetter, pPossibleDeletingSuffixes.get());
294 // XXX. log: Suffixes to eliminate: ${SuffixesToEliminate}
296 m_pSignatures->CleanUp();
297 SignaturesToFixList.clear();
298 m_pSignatures->CleanUp();
300 // go through all the signatures,
301 // and identify those that should be fixed.
302 // For example, the signature e.<e>ing.es
303 // will be marked to become NULL.<e>ing.s. /
304 LogFileSmallTitle("Bigger signatures");
305 LogFile ("How many suffixes to eliminate? ", SuffixesToEliminate.GetCount());
307 for (int suffixno = 0; suffixno < SuffixesToEliminate.GetCount(); ++suffixno) {
308 CSuffix* const pSuffixToEliminate = SuffixesToEliminate[suffixno];
309 const CStringSurrogate ssSuffixToEliminate =
310 pSuffixToEliminate->GetKey();
312 LogFileSmallTitle(ssSuffixToEliminate.Display());
313 LogFileHeader("Signature", "Suffix", "Disposition");
315 for (int signo = 0; signo < m_pSignatures->GetCount(); ++signo) {
316 CSignature* const pSig = m_pSignatures->GetAt(signo);
318 bool ThisSigContainsDeletingSuffix = false;
319 if (pSig->Contains(ssSuffixToEliminate) &&
320 pSig->Size() > 2) {
321 LogFile(pSig->Display());
323 for (int affixno = 1; affixno <= pSig->Size(); ++affixno) {
324 CStringSurrogate ssSuffix =
325 pSig->GetPiece(affixno);
326 LogFile(ssSuffix.Display());
328 if (ssSuffix == ssSuffixToEliminate) {
329 // 2. Suffix is the one we want
330 // to make into NULL
331 LogFile("Found the deleting (pseudo-) suffix ");
332 continue;
335 if (pPossibleDeletingSuffixes.get() &&
336 pPossibleDeletingSuffixes
337 ->Contains(ssSuffix)) {
338 // 3. Suffix deletes the suffix
339 // we want to make NULL --
340 ThisSigContainsDeletingSuffix = true;
342 // XXX. CStringSurrogate::StartsWith
343 if (ssSuffix.Left(
344 ssSuffixToEliminate.GetLength())
345 == ssSuffixToEliminate) {
346 // this is the "ement" case
347 // -- we want to change the
348 // suffix,
349 // not make it an e-deleter
351 // XXX. SuffixChangesToMake
352 // map never populated
353 QString NewSuffix = "";
356 LogFile("We found the pseudosuffix at the beginning of a poor suffix; will change to: ",NewSuffix);
357 continue;
359 LogFile("We found the pseudosuffix deleting before ", ssSuffix.Display());
360 continue;
361 } else if (ssSuffix.Left( ssSuffixToEliminate.GetLength()) == ssSuffixToEliminate)
363 // 4. Suffix starts with the suffix
364 // we want to make NULL
365 LogFile("We found ", ssSuffix.Display());
366 continue;
368 LogFile("Not a deleter:", (ssSuffix.Display()));
369 } // end of checking each suffix in the signature
372 if (!ThisSigContainsDeletingSuffix)
373 continue;
375 std::auto_ptr<SigLetter> pSigLetter(new SigLetter(
376 pSig, ssSuffixToEliminate));
377 SignaturesToFixList.append(pSigLetter.release());
378 } // end of signature loop
380 LogFileEndTable();
381 } // end of loop over suffixes being eliminated.
383 LogFileSmallTitle("Now we do the changing for more complex signatures");
385 foreach (SigLetter* pSigLetter, SignaturesToFixList) {
386 CSignature* pSig = pSigLetter->m_SigPointer;
387 CStringSurrogate ssDiffLetter = pSigLetter->m_Letter;
389 LogFile (QString(" Moving material from suffixes to stems: "),pSigLetter->m_Letter.Display(), QString(", in signature: "),
390 pSigLetter->m_SigPointer->Display());
392 // this is where the real work happens:
394 MoveWordsStemSuffixBoundaryToRight(pSig,
395 pSigLetter->m_Letter.Display(),
396 pPossibleDeletingSuffixes.get());
398 LogFileHeader ("Suffixes to eliminate: ");
399 for (int suffixno = 0; suffixno < SuffixesToEliminate.GetCount(); ++suffixno){
400 LogFile( SuffixesToEliminate[suffixno]->Display() );}
401 LogFileEndTable();
403 m_pSignatures->CleanUp();
409 //////////////////////////////////////////////////////////////////////////////////////
410 //////////////////////////////////////////////////////////////////////////////////////
412 /* The following function takes a stem, and adds FinalLetter to it, and then
413 adds all the other suffixes in the stem's old SuffixList to the bigger
414 stem -- whether that stem already existed or not.
415 We should be sure to get rid of this stem only if ALL of its suffixes
416 are shifted -- and many may not be.
419 //////////////////////////////////////////////////////////////////////////////////////
420 //////////////////////////////////////////////////////////////////////////////////////
// For each suffix in pStem's suffix list other than FinalLetter itself, this
// builds the word (stem + suffix), rewrites the suffix text as
// "<FinalLetter>suffix", and looks up the longer stem (stem + FinalLetter).
// If the longer stem exists, the rewritten suffix is added to it and its
// suffix list is entered into the signature collection.
// NOTE(review): several lines of this function are not visible in this view;
// the remarks below cover only what can be seen.
422 void CMiniLexicon::ShiftFinalLetterToStem (CStem* pStem, QString& FinalLetter )
424 QString Word,
425 Suffix,
426 NewStem,
427 Stem = pStem->GetKey().Display();
428 CSS ssWord,
429 ssSuffix,
430 ssStem;
431 CStem* pWord;
432 CParse* SuffixList = pStem->GetSuffixList();
433 CSignature* pNewSig;
434 CSuffix* pSuffix;
437 ssStem = *pStem;
// Pieces of the suffix list are addressed with 1-based indices.
441 for (int suffixno = 1; suffixno <= SuffixList->Size(); suffixno++)
443 Suffix = SuffixList->GetPiece(suffixno).Display();
// The final letter itself is not re-attached as a suffix.
444 if ( Suffix == FinalLetter )
446 continue;
450 Word = Stem + Suffix;
// NOTE(review): the lookup result is not checked for null before pWord is
// used further down.
451 pWord = *m_pWords ^= Word;
452 Suffix = "<" + FinalLetter + ">" + Suffix;
// NOTE(review): ssSuffix is not assigned anywhere in the visible lines, so
// this appears to insert an empty surrogate rather than the rewritten
// `Suffix` -- confirm against the full file.
453 pSuffix = *m_pSuffixes << ssSuffix;
455 NewStem = ssStem.Display() + FinalLetter;
456 CStem* pBiggerStem = *m_pStems ^= NewStem;
458 if (pBiggerStem != 0) {
459 // the bigger stem already exists....
460 pBiggerStem->AddSuffix( Suffix );
461 pNewSig = *m_pSignatures << pBiggerStem->GetSuffixList();
463 // ...get rid of OLD signature in Signatures;
466 // Fix word structure
471 else
473 // the bigger stem is new...
// NOTE(review): if this call sits inside the `else` branch, pBiggerStem is
// null here; the enclosing braces are not visible, so verify.
477 pWord->AttachWordAndSuffixalStem(pBiggerStem);
484 void CMiniLexicon::HowManyStemsWithThisSuffixEndInThisLetter (
485 CStringSurrogate& Suffix,
486 CStringSurrogate& Letter,
487 int& TotalStemsWithSuffix,
488 int& HowManyEndWithThisLetter )
491 CSignature* pSig;
492 CSS ssStem;
493 CStem* pStem;
495 TotalStemsWithSuffix = 0;
496 HowManyEndWithThisLetter = 0;
498 Q_ASSERT (Letter.Display() != "e" || Suffix.Display() != "ing");
500 for (int signo = 0; signo < m_pSignatures->GetCount(); signo++)
502 pSig = m_pSignatures->GetAt(signo);
503 if ( ! pSig->Contains ( Suffix ) ) { continue; }
505 for (int stemno= 0; stemno < pSig->GetStemPtrList()->size(); stemno++)
506 { pStem = pSig->GetStemPtrList()->at(stemno);
507 ssStem = pStem->GetKey();
508 TotalStemsWithSuffix++;
509 if ( ssStem.Right (Letter.GetLength() ) == Letter )
511 HowManyEndWithThisLetter++;
517 int CMiniLexicon::StemsWithBothSuffixes(CStringSurrogate& ssSuffix1, CStringSurrogate& ssSuffix2)
519 int count = 0;
520 CSignature* pSig;
523 for (int signo = 0; signo < m_pSignatures->GetCount(); signo++)
525 pSig = m_pSignatures->GetAt(signo);
526 if ( pSig->Contains (ssSuffix1) && pSig->Contains(ssSuffix2) )
528 count += pSig->GetNumberOfStems();
531 return count;
533 int CMiniLexicon::StemsWithBothSuffixes(QString Suffix1, CStringSurrogate& ssSuffix2)
535 int count = 0;
536 CSignature* pSig;
539 for (int signo = 0; signo < m_pSignatures->GetCount(); signo++)
541 pSig = m_pSignatures->GetAt(signo);
542 if ( pSig->Contains (Suffix1) && pSig->Contains(ssSuffix2) )
544 count += pSig->GetNumberOfStems();
547 return count;
// Variant of MoveWordsStemSuffixBoundaryToRight() that takes a string-to-string
// remapping table instead of a list of candidate suffixes. For every stem of
// pThisSig it forms the longer stem (old stem + ssDeletee), attaches it to a
// rewritten signature obtained from CreateADeletingSignature(), and re-links
// the affected words.
// NOTE(review): this version uses CString, POSITION, CTypedPtrMap/CTypedPtrList
// and members named Signatures / Stems_Suffixed / Words / Suffixes, none of
// which appear elsewhere in this file; it looks like legacy code that may be
// disabled -- confirm before relying on it.
551 void CMiniLexicon::MoveWordsStemSuffixBoundaryToRight(CSignature* pThisSig,
552 CStringSurrogate& ssDeletee,
553 CTypedPtrMap<CMapStringToString, CString, CString>& Remapper
556 CParse PNewSig,
557 POldSig,
558 PMergedSig,
559 PNewStem,
560 PDeletee = ssDeletee;
562 POSITION Pos;
563 bool bModifiedSigExisted = FALSE;
564 CSignature *pLargerModifiedSig,
565 *pModifiedSig = NULL,
566 *pOldSigBecameSig = NULL,
567 *pOlderSig = NULL;
569 CStringSurrogate ssStem,
570 ssSuffix,
571 ssNewSuffix;
572 CStem* pOldStem;
574 CStem* pNewStem;
575 CParse DummyParse,
576 Word,
577 WhatSigWillBecome,
578 SuffixChanges;
579 CSuffix* pSuffix;
580 CStem * pWord;
581 bool bStemShouldRemain = FALSE;
582 bool bOldSigRemains = FALSE;
583 bool bNewStemAlreadyExisted = FALSE;
584 bool val;
585 int i;
586 //----------------------------------------------------------------//
// One status slot per affix, indexed from 1 (slot 0 is unused).
// NOTE(review): no matching delete[] is visible in this function.
592 int* bStatus = new int [ pThisSig->Size() + 1];
// NOTE(review): this message is written before PNewSig and WhatSigWillBecome
// have been computed, so both display as freshly constructed values; the same
// message is written again after CreateADeletingSignature() below.
594 if ( m_LogFile )
596 *m_LogFile << "\n\tF1: Creating new signature: " <<
597 PNewSig.Display() <<
598 " Residue of old sig: "<<
599 WhatSigWillBecome.Display();
603 PNewSig = CreateADeletingSignature(
604 pThisSig,
605 PDeletee,
606 SuffixChanges,
607 bStatus,
608 WhatSigWillBecome,
609 Remapper
611 if ( m_LogFile )
613 *m_LogFile << "\n\tF1: Creating new signature: " <<
614 PNewSig.Display() <<
615 " Residue of old sig: "<<
616 WhatSigWillBecome.Display();
619 // TODO: fix this later. Eg local: NULL.e.ly, creates a signature just NULL with locale. Not useful.
620 if ( PNewSig.Size() == 1 ) {return; }
622 // TODO: fix this later. Problem is what's left over is just NULL: e.g. NULL.e.ed
623 if ( CSS( WhatSigWillBecome ) == CSS ( CString("NULL") ) ) { return; }
// Find the rewritten signature, creating it if it is not yet in the collection.
626 pModifiedSig = *Signatures ^= PNewSig;
627 if (! pModifiedSig )
629 pModifiedSig = *Signatures << &PNewSig ;
632 pModifiedSig->SetRemark ( pThisSig->GetRemark() + " + allomorphy" );
// If some affixes stay with the old stem, they form their own signature.
634 if ( WhatSigWillBecome.Size() > 0 )
636 bOldSigRemains = TRUE;
637 pOldSigBecameSig = *Signatures << &WhatSigWillBecome;
638 pOldSigBecameSig ->SetRemark( CString("Allomorphy2") );
641 //----------------------------------------------------------------//
642 // Loop through stems:
643 //----------------------------------------------------------------//
645 Pos = pThisSig->GetStemPtrList()->GetHeadPosition();
646 CTypedPtrList<CPtrList, CStem*>* Temp = pThisSig->GetStemPtrList();
// NOTE(review): stems are removed from pThisSig inside this loop while its
// stem list is being traversed -- verify that the list tolerates this.
647 while (Pos)
649 pOldStem = pThisSig->GetStemPtrList()->GetNext(Pos);
650 PNewStem = pOldStem->GetPiece() + ssDeletee;
651 PNewStem .SimplifyParseStructure();
652 pNewStem = *Stems_Suffixed ^= PNewStem;
656 if ( pNewStem ) // -- if the larger stem ("love") already existed
658 if ( m_LogFile ) { *m_LogFile << "\t NewStem: "<< pNewStem->GetKey() << " existed."; }
660 bNewStemAlreadyExisted = TRUE;
661 pLargerModifiedSig =
662 ModifyAnExistingSignatureWithANewStem ( pOldStem, pNewStem, pThisSig );
664 else
666 if ( m_LogFile ) { *m_LogFile << "\t NewStem: "<< PNewStem.Display() << " did not exist; it does now."; }
668 bNewStemAlreadyExisted = FALSE;
669 pNewStem = CreateANewSignatureForThisStem ( PNewStem, pModifiedSig );
671 val = pThisSig ->RemoveStem ( pOldStem );
672 Q_ASSERT (val);
675 if ( bOldSigRemains )
677 pOldStem ->SetSuffixList ( &WhatSigWillBecome );
678 pOldStem ->SetSuffixSignature ( pOldSigBecameSig );
683 if ( m_LogFile ) { *m_LogFile << "\n\t\tF1: Changing stem: "<< pOldStem->GetKey() ; }
685 // go through each word associated with the OldStem, and change its suffix pointer.
686 for ( i = 1; i <= (int) pThisSig->Size(); i++)
// bStatus[i] == FALSE: affix i stays with the old stem.
688 if ( bStatus[i] == FALSE )
// NOTE(review): bStemShouldRemain is set here but never reset to FALSE for
// the next stem.
690 bStemShouldRemain = TRUE;
691 ssSuffix = pThisSig->GetPiece(i);
692 if ( ssSuffix.IsNULL() )
694 Word = *pOldStem;
696 else
698 Word = pOldStem->GetPiece() + ssSuffix ;
701 LinkThisWordToThisSignature ( Word, pOldSigBecameSig );
// NOTE(review): pWord is only assigned further down (from the Words lookup),
// so on this path it holds either an uninitialized value or the word of an
// earlier affix -- looks like a defect.
702 pThisSig ->RemoveWord (pWord);
704 if ( !pOldSigBecameSig->GetStemPtrList()->Find( pOldStem ) )
706 pOldSigBecameSig->GetStemPtrList()->AddTail (pOldStem);
709 continue;
710 } // this means the suffix doesn't delete the deleting suffix or anything
// From here on: affix i moves to the longer stem under its replacement form.
712 ssNewSuffix = SuffixChanges.GetPiece(i);
714 pSuffix = *Suffixes ^= ssNewSuffix;
715 if ( ssNewSuffix.IsNULL() )
717 Word = PNewStem;
719 else
721 Word = pOldStem->GetPiece() + pThisSig->GetPiece(i) ;
// NOTE(review): the lookup result is used without a null check.
724 pWord = *Words ^= CStringSurrogate( Word );
725 pWord ->AppendToConfidence( CString ("Allomorphy 2") );
726 pWord ->SetSuffixPtr ( pSuffix );
727 val = pThisSig ->RemoveWord (pWord);
728 // Q_ASSERT (val);
730 // case like hostilit-y becoming hostility
731 if ( ssNewSuffix.IsNULL() )
733 pWord->DoNotParse();
736 if ( bNewStemAlreadyExisted )
738 pWord ->SetSuffixSignature ( pLargerModifiedSig );
739 pWord ->SetStemPtr ( pNewStem );
740 pLargerModifiedSig ->AttachToSuffixSig ( pNewStem );
742 else
744 if ( m_LogFile ) { *m_LogFile << "\t NewStem: "<< PNewStem.Display() << " did not exist."; }
745 pModifiedSig->AttachToSuffixSig(
746 pNewStem, CSignature::eDo_Not_Call_Words);
747 pWord->AttachSuffixSignature(pModifiedSig);
748 pWord->SetStemPtr(pNewStem);
752 }// cycle through the suffixes
// The old stem is dropped only when none of its affixes stayed behind.
755 if ( bStemShouldRemain == FALSE )
757 Stems_Suffixed ->RemoveMember( pOldStem->GetPiece() );
761 } // cycle of stems
766 // This is DEPRECATED! Not used in new code. The following version is used, the one underneath this.
767 void CMiniLexicon::MoveWordsStemSuffixBoundaryToRight(CSignature* pThisSig, QString Deletee, CParse* pSuffixCandidates )
769 CSS ssDeletee;
770 ssDeletee = Deletee;
771 MoveWordsStemSuffixBoundaryToRight(pThisSig, Deletee, pSuffixCandidates);
774 void CMiniLexicon::MoveWordsStemSuffixBoundaryToRight(CSignature* pThisSig,
775 CStringSurrogate& ssDeletee,
776 CParse* pSuffixCandidates //3/04
779 CParse PNewSig,
780 POldSig,
781 PMergedSig,
782 PNewStem,
783 PDeletee = ssDeletee;
785 CSignature* pModifiedSig = NULL;
786 CSignature* pOldSigBecameSig = NULL;
787 CSignature* pOlderSig = NULL;
789 CStringSurrogate ssSuffix,
790 ssNewSuffix;
791 CStem* pOldStem;
792 CStem* pNewStem;
793 CParse Word,
794 WhatSigWillBecome,
795 SuffixChanges;
796 CSuffix* pSuffix;
797 CStem * pWord;
798 bool bStemShouldRemain = FALSE;
799 bool bOldSigRemains = FALSE;
800 bool bNewStemAlreadyExisted = FALSE;
801 bool val;
802 //----------------------------------------------------------------//
806 // this actually changes the signature itself:
808 int* bStatus = new int [ pThisSig->Size() + 1];
811 PNewSig = CreateADeletingSignature(
812 pThisSig,
813 PDeletee,
814 SuffixChanges,
815 bStatus,
816 WhatSigWillBecome,
817 pSuffixCandidates
819 LogFile( pThisSig->Display(),
820 QString(" Creating new signature:"),
821 PNewSig.Display('-'),
822 QString(" Residue of old sig: "),
823 WhatSigWillBecome.Display() );
825 // TODO: fix this later. Eg local: NULL.e.ly, creates a signature just NULL with locale. Not useful.
826 if ( PNewSig.Size() == 1 ) {return; }
828 // TODO: fix this later. Problem is what's left over is just NULL: e.g. NULL.e.ed
829 if ( CSS( WhatSigWillBecome ) == CSS ( QString("NULL") ) ) { return; }
831 pModifiedSig = *m_pSignatures ^= PNewSig;
832 if (! pModifiedSig )
834 pModifiedSig = *m_pSignatures << &PNewSig ;
837 pModifiedSig->SetRemark ( pThisSig->GetRemark() + " + allomorphy" );
839 if ( WhatSigWillBecome.Size() > 0 )
841 bOldSigRemains = TRUE;
842 pOldSigBecameSig = *m_pSignatures << &WhatSigWillBecome;
843 pOldSigBecameSig ->SetRemark( QString("Allomorphy") );
846 //----------------------------------------------------------------//
847 // Loop through stems:
848 //----------------------------------------------------------------//
850 CStem* pStem;
851 LogFileHeader("New Stems", "Already existed?");
854 for (int stemno=0; stemno < pThisSig->GetNumberOfStems(); stemno++)
856 pStem = pThisSig->GetStem(stemno);
857 pOldStem = pStem;
858 QString temp2 = pStem->Display();
859 PNewStem = *pOldStem + ssDeletee;
860 PNewStem .SimplifyParseStructure();
861 QString temp4 = PNewStem.Display();
862 pNewStem = *m_pStems ^= PNewStem;
863 QString temp3 = PNewStem.Display();
865 // XXX. suppresses warning
866 // `pLargerModifiedSig' may be used uninitialized
867 // since g++ can’t figure out the control flow.
868 CSignature* pLargerModifiedSig = 0;
870 bNewStemAlreadyExisted = (pNewStem != 0);
871 if (bNewStemAlreadyExisted) {
872 // -- if the larger stem ("love") already existed
873 LogFile(pNewStem->Display(), "yes");
874 pOlderSig = pNewStem->GetSuffixSignature();
875 POldSig = *pNewStem->GetSuffixSignature();
876 POldSig.MergeParse(PNewSig, PMergedSig);
878 pLargerModifiedSig = *m_pSignatures << &PMergedSig;
879 pLargerModifiedSig->SetRemark(QString("Allomorphy1"));
880 pNewStem->SetSuffixList(&PMergedSig);
881 pNewStem->SetSuffixSignature( pLargerModifiedSig);
882 pNewStem->SetConfidence(QString("Allomorphy1"));
883 pThisSig->DetachStem(pOldStem,
884 CSignature::eDo_Not_Call_Words);
885 } else
887 LogFile(PNewStem.Display(), QString("no"));
888 pNewStem = *m_pStems << PNewStem;
889 pNewStem ->SetSuffixList ( pModifiedSig );
890 pNewStem ->SetSuffixSignature( pModifiedSig );
891 pNewStem ->SetConfidence ( QString("Allomorphy2") );
892 val = pThisSig ->RemoveStem ( pOldStem );
893 Q_ASSERT (val);
894 if ( bOldSigRemains )
896 pOldStem ->SetSuffixList ( &WhatSigWillBecome );
897 pOldStem ->SetSuffixSignature ( pOldSigBecameSig );
901 // go through each word associated with the OldStem, and change its suffix pointer.
902 for (int affixno = 1; affixno <= pThisSig->Size(); ++affixno) {
903 if ( bStatus[affixno] == FALSE )
905 bStemShouldRemain = TRUE;
906 ssSuffix = pThisSig->GetPiece(affixno);
907 if ( ssSuffix.IsNULL() )
909 Word = *pOldStem;
911 else
913 Word = CSS(pOldStem) + ssSuffix ;
916 pWord = *m_pWords ^= CStringSurrogate( Word );
917 pWord ->SetSuffixSignature ( pOldSigBecameSig );
918 pWord ->AppendToConfidence(QString ("Allomorphy 1") );
919 pOldSigBecameSig->AddWord( pWord );
920 if ( ! pOldSigBecameSig->StemListContains( pOldStem ) < 0 )
922 pOldSigBecameSig->AppendStemPtr(pOldStem);
924 pThisSig ->RemoveWord (pWord);
925 continue;
926 } // this means the suffix doesn't delete the deleting suffix or anything
928 ssNewSuffix = SuffixChanges.GetPiece(affixno);
930 pSuffix = *m_pSuffixes ^= ssNewSuffix;
931 if ( ssNewSuffix.IsNULL() )
933 Word = PNewStem;
935 else
937 Word = CSS(pOldStem) + pThisSig->GetPiece(affixno);
941 pWord = *m_pWords ^= CStringSurrogate( Word );
942 pWord ->AppendToConfidence( QString ("Allomorphy 2") );
943 pWord ->SetSuffixPtr ( pSuffix );
944 val = pThisSig ->RemoveWord (pWord);
945 Q_ASSERT (val);
947 // case like hostilit-y becoming hostility
948 if ( ssNewSuffix.IsNULL() )
950 pWord->DoNotParse();
953 if (bNewStemAlreadyExisted) {
954 pWord->SetSuffixSignature(pLargerModifiedSig);
955 pWord->SetStemPtr(pNewStem);
956 pLargerModifiedSig->AttachToSuffixSig(pNewStem);
957 } else {
958 // LogFile( PNewStem.Display(), QString( " did not exist.");
959 pModifiedSig->AttachToSuffixSig(pNewStem,
960 CSignature::eDo_Not_Call_Words);
961 pWord->AttachSuffixSignature(pModifiedSig);
962 pWord->SetStemPtr(pNewStem);
966 }// cycle through the suffixes
967 LogFileEndRow();
969 if ( bStemShouldRemain == FALSE )
971 m_pStems ->RemoveMember( pOldStem );
974 } // cycle of stems
975 LogFileEndTable();
980 Takes as input a signature, a Deletee (stem final material that will be deleted)
983 CParse CMiniLexicon::CreateADeletingSignature ( CSignature* pSig,
984 CSS Deletee,
985 CParse& ReplacingSuffixes,
986 int* bStatus,
987 CParse& WhatSigWillBecome,
988 CParse* pSuffixCandidatesThatMightDeleteDeletee) //3/04
992 CStringSurrogate ssSuffix, ssTruncatedSuffix;
995 CParse NewSig;
996 CSuffix* pSuffix,
997 *pTruncatedSuffix;
998 int DeleteeLength = Deletee.GetLength();
999 QString QNULL ("NULL");
1001 ReplacingSuffixes.ClearParse();
1002 WhatSigWillBecome.ClearParse();
1007 LogFile("Affix", "Status");
1008 for (int affixno = 1; affixno <= pSig->Size(); affixno++)
1009 { // Consider the suffixes in this signature. If one is the deletee, replace it by NULL.
1010 ssSuffix = pSig->GetPiece(affixno);
1011 if ( ssSuffix == Deletee )
1013 NewSig.Append (QNULL );
1014 ReplacingSuffixes.Append ( QNULL );
1015 bStatus[affixno] = TRUE;
1016 LogFile(ssSuffix.Display(), QString(" is our deletee"));
1018 else
1020 pSuffix = *m_pSuffixes ^= ssSuffix;
1021 ssTruncatedSuffix = ssSuffix.Mid( Deletee.GetLength() );
1022 pTruncatedSuffix = *m_pSuffixes ^= ssTruncatedSuffix;
1023 // 0: if the candidate doesn't delete the deleting suffix
1024 if ( ! pSuffixCandidatesThatMightDeleteDeletee->Contains( ssSuffix ) )
1026 ReplacingSuffixes.Append ( QString ("***") );
1027 WhatSigWillBecome.Append( ssSuffix );
1028 bStatus[affixno] = FALSE;
1029 LogFile ( ssSuffix.Display(), QString(" does not delete." ));
1031 // 1: this is the "ement" case
1032 else if (
1033 pTruncatedSuffix &&
1034 pSuffix->GetKey().Left( DeleteeLength ) == Deletee &&
1035 ( StemsWithBothSuffixes ( TheStringNULL, ssTruncatedSuffix ) >
1036 StemsWithBothSuffixes ( Deletee, ssSuffix )
1040 NewSig .Append( ssTruncatedSuffix );
1041 ReplacingSuffixes .Append( ssTruncatedSuffix );
1042 bStatus[affixno] = TRUE;
1043 LogFile (ssSuffix.Display(), QString(" the ement sort of case." ));
1045 else if (
1046 StemsWithBothSuffixes ( TheStringNULL, ssSuffix ) >
1047 StemsWithBothSuffixes ( Deletee , ssSuffix )
1049 // 2: this is the case of a suffix that tdeletes a stem-final Deletee
1052 pSuffix ->AddDeletee (Deletee);
1053 NewSig .Append( ssSuffix );
1054 ReplacingSuffixes .Append( ssSuffix );
1055 bStatus[affixno] = TRUE;
1056 LogFile (ssSuffix.Display(), QString( " is a deleter." ));
1058 else
1059 // 3: another case where we'll say, for now, the candidate doesn't delete the deleting suffix:
1061 ReplacingSuffixes.Append ( QString ("***") );
1062 WhatSigWillBecome.Append( ssSuffix );
1063 bStatus[affixno] = FALSE;
1064 LogFile (ssSuffix.Display(), QString( " maybe does not delete Deletee." ));
1068 LogFileEndTable();
1070 return NewSig;
1074 CParse CMiniLexicon::CreateADeletingSignature ( CSignature* pSig,
1075 CSS Deletee,
1076 CParse& ReplacingSuffixes,
1077 int* bStatus,
1078 CParse& WhatSigWillBecome,
1079 StringToString& Remapper) //3/04
1083 CStringSurrogate ssSuffix, ssTruncatedSuffix;
1085 CParse PSuffix,
1086 NewSig,
1087 Suffix;
1088 int DeleteeLength = Deletee.GetLength();
1089 QString QNULL ("NULL"),
1090 Rewrite;
1092 ReplacingSuffixes.ClearParse();
1093 WhatSigWillBecome.ClearParse();
1097 for (int n = 1; n <= pSig->Size(); n++)
1099 ssSuffix = pSig->GetPiece(n);
1100 if ( ssSuffix == Deletee )
1102 NewSig.Append (QNULL) ;
1103 ReplacingSuffixes.Append ( QNULL);
1104 bStatus[n] = TRUE;
1106 else
1107 { //if ( ! pSuffixCandidates->Contains( ssSuffix ) )
1109 QMap<QString,QString>::Iterator it;
1110 if ( Remapper.contains (ssSuffix.Display()) )
1111 //Remapper.Lookup( ssSuffix.SpellOut() ) )
1113 Rewrite =Remapper.find(ssSuffix.Display()).data();
1114 NewSig .Append( Rewrite);
1115 ReplacingSuffixes .Append( Rewrite);
1116 bStatus[n] = TRUE;
1118 else
1120 ReplacingSuffixes.Append ( QString ("***") );
1121 WhatSigWillBecome.Append( ssSuffix );
1122 bStatus[n] = FALSE;
1126 return NewSig;