1 // Implementation of CStem methods
2 // Copyright © 2009 The University of Chicago
5 // See also Stem_Phonology.cpp for phonology methods,
6 // Stem_EncodingLength.cpp for description length calculations,
7 // GUIclasses.cpp for methods pertaining to GUI output,
8 // and Word.cpp for methods concerning stems-qua-words.
10 #include <Q3TextStream>
12 #include "EarleyParser.h"
13 #include "Signature.h"
// Out-of-class definition of the static member CStem::m_Lexicon.
// No initializer is given here; the TODO below notes that a value still
// has to be assigned.
17 CLexicon
* CStem::m_Lexicon
; // assign value ! :TODO
19 // construction/destruction.
// Constructs a stem from a mini-lexicon pointer.
// The initializers visible here null both signature pointers and both affix
// pointers, allocate a fresh QList<CStem*> for m_WordPtrList, and zero the
// location, complexity and Boltzmann-score members. Afterwards both affix
// lists are alphabetized.
// NOTE(review): the base-class initializer, some member initializers and the
// braces are not visible in this view of the file -- confirm against the
// full source.
21 CStem::CStem(CMiniLexicon
* mini
)
25 m_SuffixList(), // initialized below
26 m_pSuffixSignature(NULL
), m_pPrefixSignature(NULL
),
27 m_PrefixList(), // initialized below
34 m_PrefixLoc(0), m_SuffixLoc(0),
35 m_Confidence(QString()),
38 m_strSuffix(), m_strPrefix(),
39 m_pSuffix(NULL
), m_pPrefix(NULL
),
40 m_LengthOfPointerToMe(0.0),
// The word-pointer list is heap-allocated per stem.
41 m_WordPtrList(new QList
<CStem
*>),
51 m_Phonology_Tier1_Skeleton(),
52 m_UnigramLogProb(0.0),
54 m_BigramComplexity(0.0), // average
55 m_UnigramComplexity(0.0), // average
56 m_PhonologicalContent(0.0),
57 m_HMM_LogProbability(0.0),
58 // first Boltzmann model.
59 m_Tier2_LocalMI_Score(0.0),
60 m_LocalMI_TotalBoltzmannScore(0.0),
62 // second Boltzmann model.
63 m_Tier2_DistantMI_Score(0.0),
64 m_DistantMI_TotalBoltzmannScore(0.0),
65 m_DistantMI_Plog(0.0),
66 // tier-1 phonology info for graphical display.
67 m_phonologies(), m_unigrams(), m_mis(),
72 m_donephonology(false)
// Constructor body: alphabetize both (still empty) affix lists.
74 m_SuffixList
.Alphabetize();
75 m_PrefixList
.Alphabetize();
// Constructs a stem from a string surrogate and a mini-lexicon pointer; both
// are forwarded to the CLParse base constructor.
// The visible initializers null the signature pointers, allocate a fresh
// QList<CStem*> for m_WordPtrList, default-construct the phonology tiers and
// zero the probability, complexity and Boltzmann-score members. Afterwards
// both affix lists are alphabetized.
// NOTE(review): some member initializers and the braces are not visible in
// this view of the file -- confirm against the full source.
78 CStem::CStem(const CStringSurrogate
& stem
, CMiniLexicon
* mini
)
79 : CLParse(stem
, mini
),
82 m_SuffixList (), // initialized below
83 m_pSuffixSignature (NULL
),
84 m_pPrefixSignature (NULL
),
85 m_PrefixList (), // initialized below
94 m_Confidence (QString()),
101 m_LengthOfPointerToMe(0.0),
// The word-pointer list is heap-allocated per stem.
102 m_WordPtrList(new QList
<CStem
*>),
107 m_CompoundCount (0.0),
110 m_Phonology_Tier1 (),
111 m_Phonology_Tier2 (),
112 m_Phonology_Tier1_Skeleton(),
113 m_UnigramLogProb (0.0),
114 m_BigramLogProb (0.0),
115 m_BigramComplexity (0.0), // average
116 m_UnigramComplexity(0.0), // average
117 m_PhonologicalContent(0.0),
118 m_HMM_LogProbability(0.0),
119 // first Boltzmann model.
120 m_Tier2_LocalMI_Score(0.0),
121 m_LocalMI_TotalBoltzmannScore(0.0),
123 // second Boltzmann model.
124 m_Tier2_DistantMI_Score(0.0),
125 m_DistantMI_TotalBoltzmannScore(0.0),
126 m_DistantMI_Plog(0.0),
127 // tier-1 phonology info for graphical display.
128 m_phonologies(), m_unigrams(), m_mis(),
129 m_countofunigrams(0),
133 m_donephonology(false)
// Constructor body: alphabetize both (still empty) affix lists.
135 m_SuffixList
.Alphabetize();
136 m_PrefixList
.Alphabetize();
// Copy constructor. The copy is deliberately partial; the members fall into
// three groups in the visible initializers:
//   * copied from x: counts, flags, locations, confidence, m_strStem, the
//     phonology tiers, log-probabilities/complexities, the two Tier2 MI
//     scores and m_donephonology;
//   * copied as raw pointers (both stems then refer to the same object):
//     m_pSuffixSignature, m_pPrefixSignature, m_pSuffix, m_pPrefix;
//   * NOT copied (default-constructed or zeroed, marked "XXX. copy?"):
//     m_BrokenForm, both affix lists, m_strPrefix, the neighbor lists,
//     m_MyEarleyParser, the HMM/Boltzmann totals and plogs, and the
//     graphical-display phonology data.
// m_WordPtrList is the one member that gets its own heap copy.
// NOTE(review): the base-class initializer and the braces are not visible in
// this view of the file -- confirm against the full source.
139 CStem::CStem(const CStem
& x
)
141 m_WordCount(x
.m_WordCount
),
142 m_BrokenForm(), // XXX. copy?
143 m_SuffixList(), // initialized below. XXX. copy?
144 m_pSuffixSignature (x
.m_pSuffixSignature
),
145 m_pPrefixSignature (x
.m_pPrefixSignature
),
146 m_PrefixList (), // initialized below. XXX. copy?
147 m_Regular (x
.m_Regular
),
148 m_SimpleFlag (x
.m_SimpleFlag
),
149 m_StemType (x
.m_StemType
),
150 m_StemLoc (x
.m_StemLoc
),
151 m_Stem2Loc (x
.m_Stem2Loc
),
152 m_NumberOfStems (x
.m_NumberOfStems
),
153 m_PrefixLoc (x
.m_PrefixLoc
),
154 m_SuffixLoc (x
.m_SuffixLoc
),
155 m_Confidence (x
.m_Confidence
),
157 m_strStem (x
.m_strStem
),
159 m_strPrefix(), // XXX. copy?
160 m_pSuffix(x
.m_pSuffix
),
161 m_pPrefix(x
.m_pPrefix
),
162 m_LengthOfPointerToMe(x
.m_LengthOfPointerToMe
),
// A new list object is allocated and filled from x's list, so the two stems
// do not share the list itself (the CStem* entries are the same pointers).
163 m_WordPtrList(new QList
<CStem
*>(*x
.m_WordPtrList
)),
164 m_LeftNeighbors(), m_RightNeighbors(), // XXX. copy?
166 m_MyEarleyParser(), // XXX. copy?
167 m_CompoundCount(x
.m_CompoundCount
),
168 m_Affixness(x
.m_Affixness
),
170 m_Phonology_Tier1(x
.m_Phonology_Tier1
),
171 m_Phonology_Tier2(x
.m_Phonology_Tier2
),
172 m_Phonology_Tier1_Skeleton(x
.m_Phonology_Tier1_Skeleton
),
173 m_UnigramLogProb(x
.m_UnigramLogProb
),
174 m_BigramLogProb(x
.m_BigramLogProb
),
175 m_BigramComplexity(x
.m_BigramComplexity
),
176 m_UnigramComplexity(x
.m_UnigramComplexity
),
177 m_PhonologicalContent(x
.m_PhonologicalContent
),
178 m_HMM_LogProbability(0.0), // XXX. copy?
179 // first Boltzmann model.
180 m_Tier2_LocalMI_Score(x
.m_Tier2_LocalMI_Score
),
181 m_LocalMI_TotalBoltzmannScore(0.0), // XXX. copy?
182 m_LocalMI_Plog(0.0), // XXX. copy?
183 // second Boltzmann model.
184 m_Tier2_DistantMI_Score(x
.m_Tier2_DistantMI_Score
),
185 m_DistantMI_TotalBoltzmannScore(0.0), // XXX. copy?
186 m_DistantMI_Plog(0.0), // XXX. copy?
187 // tier-1 phonology info for graphical display.
188 m_phonologies(), m_unigrams(), m_mis(), // XXX. copy?
189 m_countofunigrams(0), // XXX. copy?
190 m_countofmis(0), // XXX. copy?
191 m_maxpositive(0.0), // XXX. copy?
192 m_maxnegative(0.0), // XXX. copy?
193 m_donephonology(x
.m_donephonology
)
// Constructor body: alphabetize both affix lists (empty here, since they are
// not copied from x).
195 m_SuffixList
.Alphabetize();
196 m_PrefixList
.Alphabetize();
// Constructs a stem from an existing CLParse, which is forwarded to the
// CLParse base constructor.
// The visible initializers null the signature and affix pointers, allocate a
// fresh QList<CStem*> for m_WordPtrList and zero the location, probability,
// complexity and Boltzmann-score members. Afterwards both affix lists are
// alphabetized.
// NOTE(review): some member initializers and the braces are not visible in
// this view of the file -- confirm against the full source.
199 CStem::CStem(const CLParse
& text_in_corpus
)
200 : CLParse(text_in_corpus
),
203 m_SuffixList(), // initialized below
204 m_pSuffixSignature(NULL
), m_pPrefixSignature(NULL
),
205 m_PrefixList(), // initialized below
212 m_PrefixLoc(0), m_SuffixLoc(0),
213 m_Confidence(QString()),
216 m_strSuffix(), m_strPrefix(),
217 m_pSuffix(NULL
), m_pPrefix(NULL
),
218 m_LengthOfPointerToMe(0.0),
// The word-pointer list is heap-allocated per stem.
219 m_WordPtrList(new QList
<CStem
*>),
224 m_CompoundCount(0.0),
229 m_Phonology_Tier1_Skeleton(),
230 m_UnigramLogProb(0.0),
231 m_BigramLogProb(0.0),
232 m_BigramComplexity(0.0), // average
233 m_UnigramComplexity(0.0), // average
234 m_PhonologicalContent(0.0),
235 m_HMM_LogProbability(0.0),
236 // first Boltzmann model.
237 m_Tier2_LocalMI_Score(0.0),
238 m_LocalMI_TotalBoltzmannScore(0.0),
240 // second Boltzmann model.
241 m_Tier2_DistantMI_Score(0.0),
242 m_DistantMI_TotalBoltzmannScore(0.0),
243 m_DistantMI_Plog(0.0),
244 // tier-1 phonology info for graphical display.
245 m_phonologies(), m_unigrams(), m_mis(),
246 m_countofunigrams(0),
250 m_donephonology(false)
// Constructor body: alphabetize both (still empty) affix lists.
252 m_SuffixList
.Alphabetize();
253 m_PrefixList
.Alphabetize();
// Constructs a stem from a CParse and a mini-lexicon pointer; both are
// forwarded to the CLParse base constructor.
// The visible initializers null the signature and affix pointers, allocate a
// fresh QList<CStem*> for m_WordPtrList and zero the location, probability,
// complexity and Boltzmann-score members. Afterwards both affix lists are
// alphabetized.
// NOTE(review): some member initializers and the braces are not visible in
// this view of the file -- confirm against the full source.
256 CStem::CStem(const CParse
& text
, CMiniLexicon
* lex
)
257 : CLParse(text
, lex
),
260 m_SuffixList(), // initialized below
261 m_pSuffixSignature(NULL
), m_pPrefixSignature(NULL
),
262 m_PrefixList(), // initialized below
269 m_PrefixLoc(0), m_SuffixLoc(0),
270 m_Confidence(QString()),
273 m_strSuffix(), m_strPrefix(),
274 m_pSuffix(NULL
), m_pPrefix(NULL
),
275 m_LengthOfPointerToMe(0.0),
// The word-pointer list is heap-allocated per stem.
276 m_WordPtrList(new QList
<CStem
*>),
281 m_CompoundCount(0.0),
286 m_Phonology_Tier1_Skeleton(),
287 m_UnigramLogProb(0.0),
288 m_BigramLogProb(0.0),
289 m_BigramComplexity(0.0), // average
290 m_UnigramComplexity(0.0), // average
291 m_PhonologicalContent(0.0),
292 m_HMM_LogProbability(0.0),
293 // first Boltzmann model.
294 m_Tier2_LocalMI_Score(0.0),
295 m_LocalMI_TotalBoltzmannScore(0.0),
297 // second Boltzmann model.
298 m_Tier2_DistantMI_Score(0.0),
299 m_DistantMI_TotalBoltzmannScore(0.0),
300 m_DistantMI_Plog(0.0),
301 // tier-1 phonology info for graphical display.
302 m_phonologies(), m_unigrams(), m_mis(),
303 m_countofunigrams(0),
307 m_donephonology(false)
// Constructor body: alphabetize both (still empty) affix lists.
309 m_SuffixList
.Alphabetize();
310 m_PrefixList
.Alphabetize();
// Releases the two heap objects held through m_WordPtrList and
// m_MyEarleyParser.
// NOTE(review): the enclosing signature is not visible in this view;
// presumably this is the body of CStem::~CStem() -- confirm.
// NOTE(review): operator=, Copy and CopyStemInformation assign
// m_MyEarleyParser directly from another stem's GetMyEarleyParser(). If that
// getter returns the same pointer, two stems would delete one parser --
// confirm who owns it.
316 delete m_WordPtrList
;
317 delete m_MyEarleyParser
;
320 //-----------------------------------------------------------------
321 // Overloaded operators
322 //-----------------------------------------------------------------
// Assignment operator. Copies the fields listed below from RHS, mostly via
// RHS's getters, with the three phonology tiers read directly.
// It returns void, so assignments cannot be chained.
// In the visible statements the affix lists and m_WordPtrList are not
// assigned, and m_LengthOfPointerToMe is explicitly left out (commented).
// Pointer members (affixes, signatures, stem pointer, Earley parser) are
// assigned as pointers, not cloned.
// NOTE(review): some statements and the braces are not visible in this view
// of the file -- confirm against the full source.
324 void CStem::operator= (const CStem
& RHS
)
329 m_Confidence
= RHS
.GetConfidence();
330 m_NumberOfStems
= RHS
.GetNumberOfStems();
331 m_pPrefix
= RHS
.GetPrefixPtr();
332 m_pPrefixSignature
= RHS
.GetPrefixSignature();
333 m_PrefixLoc
= RHS
.GetPrefixLoc();
334 m_pStem
= RHS
.GetStemPtr();
335 m_pSuffix
= RHS
.GetSuffixPtr();
336 m_pSuffixSignature
= RHS
.GetSuffixSignature();
337 m_Regular
= RHS
.GetRegular();
338 m_SimpleFlag
= RHS
.GetSimpleFlag();
339 m_Stem2Loc
= RHS
.GetStem2Loc();
340 m_StemLoc
= RHS
.GetStemLoc();
341 m_StemType
= RHS
.GetStemType();
342 m_SuffixLoc
= RHS
.GetSuffixLoc();
343 m_WordCount
= RHS
.GetWordCount();
// The phonology tiers are copied member-to-member rather than via getters.
344 m_Phonology_Tier1
= RHS
.m_Phonology_Tier1
;
345 m_Phonology_Tier2
= RHS
.m_Phonology_Tier2
;
346 m_Phonology_Tier1_Skeleton
= RHS
.m_Phonology_Tier1_Skeleton
;
347 m_CompoundCount
= RHS
.GetCompoundCount();
348 m_Affixness
= RHS
.GetAffixness();
// NOTE(review): the previous value of m_MyEarleyParser is overwritten without
// being released in the visible code, and the new value comes straight from
// RHS -- confirm ownership.
349 m_MyEarleyParser
= RHS
.GetMyEarleyParser();
350 // m_LengthOfPointerToMe = RHS.GetLengthOfPointerToMe();
352 // m_SuffixList.SetAlphabetical();
353 // m_PrefixList.SetAlphabetical();
// Copies fields from RHS into this stem through RHS's getters.
// Unlike operator= in this file, the visible statements here do copy
// m_LengthOfPointerToMe and do not touch the three phonology tiers.
// RHS is taken by non-const reference. Pointer members are assigned as
// pointers, not cloned.
// NOTE(review): some statements and the braces are not visible in this view
// of the file -- confirm against the full source.
358 void CStem::Copy (CStem
& RHS
)
363 m_Confidence
= RHS
.GetConfidence();
364 m_NumberOfStems
= RHS
.GetNumberOfStems();
365 m_pPrefix
= RHS
.GetPrefixPtr();
366 m_pPrefixSignature
= RHS
.GetPrefixSignature();
367 m_PrefixLoc
= RHS
.GetPrefixLoc();
368 m_pStem
= RHS
.GetStemPtr();
369 m_pSuffix
= RHS
.GetSuffixPtr();
370 m_pSuffixSignature
= RHS
.GetSuffixSignature();
371 m_Regular
= RHS
.GetRegular();
372 m_SimpleFlag
= RHS
.GetSimpleFlag();
373 m_Stem2Loc
= RHS
.GetStem2Loc();
374 m_StemLoc
= RHS
.GetStemLoc();
375 m_StemType
= RHS
.GetStemType();
376 m_SuffixLoc
= RHS
.GetSuffixLoc();
377 m_WordCount
= RHS
.GetWordCount();
// NOTE(review): the previous value of m_MyEarleyParser is overwritten without
// being released in the visible code -- confirm ownership.
378 m_MyEarleyParser
= RHS
.GetMyEarleyParser();
379 // m_SuffixList.SetAlphabetical();
380 // m_PrefixList.SetAlphabetical();
382 m_CompoundCount
= RHS
.GetCompoundCount();
383 m_Affixness
= RHS
.GetAffixness();
384 m_LengthOfPointerToMe
= RHS
.GetLengthOfPointerToMe();
389 //-----------------------------------------------------------------
391 //-----------------------------------------------------------------
394 // Copy utility for stems: copies stem information from RHS into this stem.
397 // RHS - pointer to the stem to be copied; it is dereferenced without a null check
// The affix lists and signatures are only overwritten when RHS's getter
// returns a non-null value, so existing values survive otherwise.
// RHS's words are appended to m_WordPtrList; no statement clearing the list
// first is visible here.
// NOTE(review): some statements (e.g. the declaration of `word`) and the
// braces are not visible in this view of the file -- confirm against the
// full source.
399 void CStem::CopyStemInformation(CStem
* RHS
)
401 m_Confidence
= RHS
->GetConfidence();
402 SetCorpusCount(RHS
->GetCorpusCount());
403 m_NumberOfStems
= RHS
->GetNumberOfStems();
404 m_pPrefix
= RHS
->GetPrefixPtr();
// Take RHS's prefix list only if it has one.
406 if ( RHS
->GetPrefixList() )
408 m_PrefixList
= RHS
->GetPrefixList();
411 m_PrefixLoc
= RHS
->GetPrefixLoc();
// Take RHS's prefix signature only if it has one.
413 if ( RHS
->GetPrefixSignature() )
415 m_pPrefixSignature
= RHS
->GetPrefixSignature();
418 m_pStem
= RHS
->GetStemPtr();
419 m_pSuffix
= RHS
->GetSuffixPtr();
420 m_Regular
= RHS
->GetRegular();
421 m_SimpleFlag
= RHS
->GetSimpleFlag();
422 m_Stem2Loc
= RHS
->GetStem2Loc();
423 m_StemLoc
= RHS
->GetStemLoc();
424 m_StemType
= RHS
->GetStemType();
425 m_SuffixLoc
= RHS
->GetSuffixLoc();
// Take RHS's suffix list only if it has one.
427 if ( RHS
->GetSuffixList() )
429 m_SuffixList
= RHS
->GetSuffixList();
// Take RHS's suffix signature only if it has one.
432 if ( RHS
->GetSuffixSignature() )
434 m_pSuffixSignature
= RHS
->GetSuffixSignature();
437 m_WordCount
= RHS
->GetWordCount();
// Append every word pointer of RHS, in order, to this stem's word list.
440 for (int wordno
= 0; wordno
< RHS
->GetWordPtrList()->size(); wordno
++)
441 { word
= RHS
->GetWordPtrList()->at(wordno
);
442 m_WordPtrList
->append( word
);
445 m_CompoundCount
= RHS
->GetCompoundCount();
446 m_Affixness
= RHS
->GetAffixness();
447 m_LengthOfPointerToMe
= RHS
->GetLengthOfPointerToMe();
// NOTE(review): the previous value of m_MyEarleyParser is overwritten without
// being released in the visible code -- confirm ownership.
448 m_MyEarleyParser
= RHS
->GetMyEarleyParser();
450 //-----------------------------------------------------------------------------------//
451 // Add the prefix 'NULL' to the list of prefixes, unless the list already contains it
452 void CStem::AddNULLPrefix()
453 //-----------------------------------------------------------------------------------//
456 QString Null
= "NULL";
457 if ( ! m_PrefixList
.ContainsNULL() )
// The surrogate is built over the QString's character data, from offset 0
// for the full length of "NULL".
459 m_PrefixList
.Append (CStringSurrogate(Null
.unicode(),0,Null
.length()));
463 //-----------------------------------------------------------------------------------//
464 // Add the suffix 'NULL' to the list of suffixes, unless the list already contains it
465 void CStem::AddNULLSuffix()
466 //-----------------------------------------------------------------------------------//
469 QString Null
= "NULL";
470 if ( ! m_SuffixList
.ContainsNULL() )
// The surrogate is built over the QString's character data, from offset 0
// for the full length of "NULL".
472 m_SuffixList
.Append (CStringSurrogate(Null
.unicode(),0,Null
.length()));
476 //-----------------------------------------------------------------------------------//
477 // Add a word to the word list, unless that pointer is already in the list
// pWord - pointer to the word; compared by pointer value via indexOf()
// NOTE(review): the return statements of this bool function are not visible
// in this view of the file -- confirm what is returned in each case.
478 bool CStem::AddWord (CStem
* pWord
)
479 //-----------------------------------------------------------------------------------//
// indexOf() yields a negative index when pWord is not present.
481 if ( m_WordPtrList
->indexOf(pWord
) < 0 )
483 m_WordPtrList
->append (pWord
);
490 // Add a prefix to the prefix list, unless the list already contains it
493 // pPrefix - pointer to the prefix to add; its key is what gets appended
496 void CStem::AddPrefix (CPrefix
* pPrefix
)
498 if (! ContainsPrefix (pPrefix
)) {
499 // if(!m_PrefixList.Alphabetical()) m_PrefixList.Alphabetize();
500 m_PrefixList
.Append (pPrefix
->GetKey());
505 // Add a suffix to the suffix list, unless the list already contains it
508 // pSuffix - pointer to the suffix to add; its key is what gets appended
511 void CStem::AddSuffix(CSuffix
* pSuffix
)
513 if ( !ContainsSuffix (pSuffix
) ) {
514 // if(!m_SuffixList.Alphabetical()) m_SuffixList.Alphabetize();
515 m_SuffixList
.Append (pSuffix
->GetKey());
522 // Add a suffix to the suffix list, unless the list already contains it
525 // key - surrogate string of the suffix to add
528 void CStem::AddSuffix(const CStringSurrogate
& key
)
530 if ( !m_SuffixList
.Contains (key
) ) {
531 // if(!m_SuffixList.Alphabetical()) m_SuffixList.Alphabetize();
532 m_SuffixList
.Append (key
);
537 // Copy a list of suffixes into the suffix list
540 // pParse - the list of new suffixes; dereferenced without a null check
542 void CStem::CopySuffixList(CParse
* pParse
)
// Pieces are visited with indices 1 through Size() inclusive, and each one is
// handed to AddSuffix.
544 for (int i
= 1; i
<= (int)pParse
->Size(); i
++)
546 AddSuffix ( pParse
->GetPiece(i
) );
551 // Find out if the prefix list contains a specific prefix
555 // Prefix - the prefix in question; looked up by its key
558 // bool - true if the prefix is in our list
// NOTE(review): the return statements are not visible in this view of the
// file; only the membership test is.
560 bool CStem::ContainsPrefix(CPrefix
* Prefix
) const
562 if ( m_PrefixList
.Contains (Prefix
->GetKey()) ) {
// Find out if the prefix list contains the given prefix string.
// Prefix - surrogate string of the prefix to look for
// NOTE(review): the return statements are not visible in this view of the
// file; only the membership test is.
569 bool CStem::ContainsPrefix(const CStringSurrogate
& Prefix
) const
571 if ( m_PrefixList
.Contains (Prefix
) ) {
// Returns the string used for sorting this stem: the result of calling
// Display() on the list returned by GetSuffixList().
578 QString
CStem::GetSortingString ()
580 QString sortString
= GetSuffixList()->Display(); return sortString
;
582 // Add a prefix to the prefix list, unless the list already contains it
585 // Prefix - prefix surrogate to be added
587 void CStem::AddPrefix(const CStringSurrogate
& Prefix
)
589 if ( !ContainsPrefix (Prefix
) )
591 m_PrefixList
.Append (Prefix
);
596 // Find out if the suffix list contains a specific suffix
600 // Suffix - the suffix to look for; looked up by its key
603 // bool - true if the suffix is in the list
// NOTE(review): the return statements are not visible in this view of the
// file; only the membership test is.
605 bool CStem::ContainsSuffix(CSuffix
* Suffix
) const
607 if (m_SuffixList
.Contains (Suffix
->GetKey() ) ){
615 // Increment the word count
618 // n - amount to increment, default = 1
// NOTE(review): the statement that applies n to m_WordCount is not visible in
// this view of the file; only the two sanity assertions are.
620 void CStem::IncrementWordCount (int n
)
// Sanity bounds: the count must be positive and below one million.
623 Q_ASSERT (m_WordCount
> 0);
624 Q_ASSERT (m_WordCount
< 1000000);
// Get this stem's prefix.
631 // Prefix - the parse to put the prefix in (output parameter)
// Two sources are visible: the stored prefix string m_strPrefix, used when
// its key is non-empty, and the piece at index m_PrefixLoc of this parse.
// NOTE(review): the `else` joining the two assignments is not visible in this
// view; presumably the piece is only used when m_strPrefix is empty.
633 void CStem::GetPrefix ( CParse
& Prefix
) const
635 if (m_strPrefix
.GetKeyLength() > 0) {
636 Prefix
= m_strPrefix
;
639 Prefix
= GetPiece( m_PrefixLoc
);
// Get this stem's suffix.
646 // Output - the parse to put the suffix in (output parameter)
// Two sources are visible: the stored suffix string m_strSuffix, used when
// its key is non-empty, and the piece at index m_SuffixLoc of this parse.
// NOTE(review): the `else` joining the two assignments is not visible in this
// view; presumably the piece is only used when m_strSuffix is empty.
648 void CStem::GetSuffix(CParse
& Output
) const
650 if (m_strSuffix
.GetKeyLength() > 0) {
651 Output
= m_strSuffix
;
654 Output
= GetPiece( m_SuffixLoc
);
// Get this stem's stem part.
661 // Output - the parse to put the stem in (output parameter)
// NOTE(review): the statement inside the m_strStem branch is not visible in
// this view of the file; presumably it assigns m_strStem to Output, as the
// sibling getters GetPrefix/GetSuffix do with their stored strings.
663 void CStem::GetStem(CParse
& Output
) const
665 if (m_strStem
.GetKeyLength() > 0) {
// Otherwise-visible source: the piece at index m_StemLoc of this parse.
669 Output
= GetPiece( m_StemLoc
);
675 // Display the type of this stem as a human-readable label
678 // QString - the label for this stem's type
// NOTE(review): the switch expression and several case labels are not
// visible in this view of the file, so some of the returns below cannot be
// matched to a stem type from here.
680 QString
CStem::DisplayStemType() const
// The first visible branch yields an empty label (the old "Normal" text is
// kept only in the trailing comment).
685 { return ""; } // return "Normal":
686 case BIWORD_COMPOUND
:
687 { return "2 word compound"; }
688 case MULTIPLE_COMPOUND
:
689 { return "Multiple-word compound"; }
690 case POSSIBLE_COMPOUND
:
691 { return "Possible compound"; };
697 { return "Ends in hyphen"; }
699 { return "Compound"; }
702 case STEM_PLUS_SUFFIX
:
703 { return "Stem & Suffix"; }
705 { return "Polyword piece"; }
712 // Merge the prefix and stem
// Acts only when both m_StemLoc and m_PrefixLoc are non-zero; it then calls
// MergePieces with the prefix location.
// NOTE(review): any follow-up statements (e.g. resetting the locations) are
// not visible in this view of the file.
714 void CStem::ClearPrefixStemSplit()
716 if ( m_StemLoc
&& m_PrefixLoc
)
718 MergePieces (m_PrefixLoc
);
728 // Merge the root and suffix
// Acts only when m_StemLoc is non-zero and HasASuffix() is true; it then
// calls MergePieces with the stem location.
// NOTE(review): any follow-up statements (e.g. resetting the locations) are
// not visible in this view of the file.
730 void CStem::ClearRootSuffixSplit()
732 if ( m_StemLoc
&& HasASuffix() )
734 MergePieces (m_StemLoc
);
742 // TODO : define this function
743 // Get the sorting quantity
746 // float - the sorting quantity
// NOTE(review): the body is not visible in this view of the file, so what is
// currently returned cannot be stated from here.
748 float CStem::GetSortingQuantity() const
754 /// used in allomorphy code.
/// Currently a stub: it always throws a function-local `not_implemented`
/// object. Because that type is declared inside the function, callers cannot
/// name it in a catch clause.
/// Lexicon - unused; the real implementation is the commented-out code below.
755 void CStem::RepairSuffixList(const CMiniLexicon
* Lexicon
)
757 struct not_implemented
{ };
758 throw not_implemented();
// Never reached (it follows the throw); it only marks the parameter as used.
759 static_cast<void>(Lexicon
);
761 // QString NewSuffix;
762 // CSuffix* pNewSuffix;
764 // for (int i = 1; i <= m_SuffixList.Size(); ++i) {
765 // TODO: Get John's help to fix, I don't understand.
766 // CStringSurrogate ssSuffix = m_SuffixList.GetPiece(i);
767 // CSuffix* pOldSuffix = *Lexicon->GetSuffixes() ^= ssSuffix;
769 // SuffixStringTranslation.GetPiece(ssSuffix.SpellOut(), NewSuffix);
770 // pNewSuffix = *Lexicon->GetSuffixes() ^=
774 // Detach a specific suffix from the list
777 // pSuffix - the suffix to detach; dereferenced without a null check
// The link is removed from both sides: the suffix's key leaves this stem's
// suffix list, and the suffix is told to drop this stem's pointer and key.
779 void CStem::DetachSuffix(CSuffix
* pSuffix
)
782 m_SuffixList
.Remove ( pSuffix
->GetKey() );
783 pSuffix
->RemoveFromStemPtrList ( this );
784 pSuffix
->RemoveStemString ( GetKey() );
789 // Detach a specific prefix from the list
792 // pPrefix - the prefix to detach; dereferenced without a null check
// The link is removed from both sides: the prefix's key leaves this stem's
// prefix list, and the prefix is told to drop this stem's pointer and key.
794 void CStem::DetachPrefix(CPrefix
* pPrefix
)
797 m_PrefixList
.Remove ( pPrefix
->GetKey() );
798 pPrefix
->RemoveFromStemPtrList ( this );
799 pPrefix
->RemoveStemString ( GetKey() );
803 // Remove a word from the word list
806 // pWord - pointer to the word to be removed; matched by pointer value
// NOTE(review): QList::remove(const T&) looks like the Qt3-support spelling
// (removeAll in plain Qt 4) -- confirm it is available in this build.
808 void CStem::RemoveWordFromWordPtrList(CStem
* pWord
)
811 m_WordPtrList
->remove(pWord
);
815 // Replace the old suffix signature and return it
818 // pNewSig - the new signature
821 // CSignature* - the old signature
// This stem is detached from the old signature before the pointer is
// swapped.
// NOTE(review): no null check on pOldSig is visible before DetachStem in this
// view of the file, and the return statement is not visible either --
// confirm against the full source.
823 CSignature
* CStem::ChangeSuffixSignature(CSignature
* pNewSig
)
825 CSignature
* pOldSig
= m_pSuffixSignature
;
828 pOldSig
->DetachStem(this, CSignature::eCall_Words
);
830 m_pSuffixSignature
= pNewSig
;
834 void CStem::OutputStem(Q3TextStream
& outf
, int index
,
835 QMap
<QString
, QString
>* filter
)
838 // "# Index | Stem | Confidence | Corpus Count | # of Words | Affixes | Words"
852 confidence
= GetConfidence();
853 if( confidence
== "" ) confidence
= "NONE";
854 outf
<< confidence
.replace( " ", "_" );
859 outf
<< GetCorpusCount();
863 if( GetSuffixSignature() )
866 outf
<< m_WordPtrList
->size();
870 outf
<< GetSuffixSignature()->Display(' ', filter
);
873 else if( GetPrefixSignature() )
876 outf
<< GetPrefixSignature()->Size();
880 outf
<< GetPrefixSignature()->Display(' ', filter
);