1 // Common type for stems to which to attach affixes and words to analyze
2 // Copyright © 2009 The University of Chicago
11 #include "SparseVector.h"
13 #include "StemListViewItem.h"
14 template<class V
> class QList
;
16 /// CStem objects are the key to signature discovery.
18 /// A CStem represents a morpheme to which an affix might be attached
19 /// on the left or right. The CStem itself may be further analyzed,
20 /// so that in the representation as stem + affix each word has an
21 /// implied parse tree.
23 /// A single instance represents all uses of a stem, and its main
24 /// purpose is to remember the list of affixes it appears with so
25 /// they can be considered as a potential signature. In the
26 /// prefix/suffix-based morphology code, first the affix and stem
27 /// collections are built together and then the signature collection is
28 /// built using data stored with the stems.
30 /// Once a signature is built, CStem instances maintain the information
31 /// a signature needs to remember about each stem it appears with. In
32 /// particular, the corpus count of a stem is kept here.
34 /// Compound discovery code uses CStem instances in a similar way:
35 /// the various parses of a word, the likelihood that a parse is of the
36 /// form stem + affix rather than stem + stem, and other relevant data
37 /// are maintained in CStem instances before compound objects are built,
38 /// and afterwards the compound objects defer to the underlying CStem
39 /// for some relevant counts.
41 /// Lastly, phonological information in Linguistica is stored per-word,
42 /// in the hope that it can help improve the discovery of
43 /// affix boundaries and compounds. In particular, the representation
44 /// length (information content) of a stem or word disregarding
45 /// morphology can be used in calculating its description length
46 /// for building more complex models.
48 /// Sometimes a CStem is used in ways that emphasize its role as
49 /// something to be analyzed (i.e., something to be split: an entire
50 /// word or a stem from a MiniLexicon’s word collection), while at
51 /// other times, a CStem is primarily a constituent for other stems or
52 /// compounds. We say in the former case that the CStem is “playing
53 /// the role of a word”, and in the latter, “it plays the role of a
54 /// stem”, even though in both cases the representation is the same.
55 /// This distinction comes up when displaying a word collection or
56 /// stem collection through the UI: only the properties relevant to
57 /// the role considered are displayed.
58 class CStem
: public CLParse
{
60 static class CLexicon
* m_Lexicon
;
68 MULTIPLE_COMPOUND
= 32,
69 POSSIBLE_COMPOUND
= 64,
73 POLYWORD_PIECE
= 1024,
75 enum eAddBoundarySymbols
{
79 enum ePhonologySplitType
{
86 CParse m_SuffixList
; //sister affixes, not daughter
87 class CSignature
* m_pSuffixSignature
;
88 class CSignature
* m_pPrefixSignature
;
89 CParse m_PrefixList
; //sister affixes, not daughter
91 bool m_SimpleFlag
; //if TRUE, then it's not further decomposable.
103 class CSuffix
* m_pSuffix
;
104 class CPrefix
* m_pPrefix
;
105 mutable double m_LengthOfPointerToMe
; ///< Based on corpus counts, and the Stem collection set.
107 QList
<CStem
*>* m_WordPtrList
;
108 CSparseVector m_LeftNeighbors
;
109 CSparseVector m_RightNeighbors
;
112 class CEarleyParser
* m_MyEarleyParser
;
113 double m_CompoundCount
;
117 CParse m_Phonology_Tier1
;
118 CParse m_Phonology_Tier2
;
119 CParse m_Phonology_Tier1_Skeleton
;
121 double m_UnigramLogProb
;
122 double m_BigramLogProb
;
124 double m_BigramComplexity
; // average log prob
125 double m_UnigramComplexity
; // average log prob
126 mutable double m_PhonologicalContent
; // value depends on what we have computed so far.
127 double m_HMM_LogProbability
;
129 // First Boltzmann model: only MI between tier 2 neighbors:
130 double m_Tier2_LocalMI_Score
; //Only the MI on tier 2
131 double m_LocalMI_TotalBoltzmannScore
; // Total score, including tier 1 bigram score
132 double m_LocalMI_Plog
;
134 // Second Boltzmann model: MI between more distant tier 2 elements
135 double m_Tier2_DistantMI_Score
;
136 double m_DistantMI_TotalBoltzmannScore
;
137 double m_DistantMI_Plog
;
140 // Some Tier one Phonology Info for Graphica display
141 QMap
<int, QString
> m_phonologies
;
142 QMap
<int, double> m_unigrams
;
143 QMap
<int, double> m_mis
;
144 int m_countofunigrams
;
146 double m_maxpositive
;
147 double m_maxnegative
;
148 bool m_donephonology
;
151 // construction/destruction.
153 CStem(class CMiniLexicon
* mini
= 0);
154 CStem(const CStem
& x
);
155 CStem(const class CStringSurrogate
&, class CMiniLexicon
* mini
= 0);
156 CStem(const CLParse
&);
157 CStem(const CParse
&, class CMiniLexicon
* mini
= 0);
162 void operator=(const CStem
&);
165 // description length.
166 double GetLengthOfPointerToMe() const;
167 double GetLengthOfPointerToMe_2(); ///< deprecated
168 void SetLengthOfPointerToMe(double L
) { m_LengthOfPointerToMe
= L
; }
169 double CalculatePhonologicalInformationContent(class CLexicon
*) const;
170 double GetPhonologicalInformationContent(class CLexicon
* = 0) const;
171 float CalculateDL() const;
174 CParse
* GetPhonology_Tier1();
175 CParse
* GetPhonology_Tier2();
176 CParse
* GetPhonology_Tier1_Skeleton();
177 double GetTier2_LocalMI_Score() { return m_Tier2_LocalMI_Score
; }
178 double GetLocalMI_TotalBoltzmannScore();
179 double GetLocalMI_Plog() { return m_LocalMI_Plog
; }
180 double GetTier2_DistantMI_Score() { return m_Tier2_DistantMI_Score
; }
181 double GetDistantMI_TotalBoltzmannScore();
182 double GetDistantMI_Plog() { return m_DistantMI_Plog
; }
183 double GetHMM_LogProbability() { return m_HMM_LogProbability
; }
184 void ComputeProbabilities(class CWordCollection
* words
);
185 void ComputeBoltzmannProbabilities(double Z
, double ZStar
);
186 void GetPhonogyTier1InfoForGraph(class CWordCollection
* words
);
187 QString
GetProbabilityInformation();
188 void SplitPhonologyToTiers(enum ePhonologySplitType type
,
189 CParse
& PhonesToMove
);
190 void CreateCVTemplate(CParse
* Vowels
);
191 void CreatePhonologyFromOrthography(enum eAddBoundarySymbols
= BOUNDARIES
);
195 bool ContainsPrefix(class CPrefix
*) const;
196 bool ContainsPrefix(const class CStringSurrogate
&) const;
197 bool ContainsSuffix(class CSuffix
*) const;
199 CParse
DisplayBrokenForm(); // for MT, etc.
200 QString
DisplayStemType() const;
201 // TODO: get CRule int FindRule (CStem*, CRule&) const;
205 double GetAffixness() const { return m_Affixness
; }
206 double GetCompoundCount() const { return m_CompoundCount
; }
207 QString
GetConfidence() const { return m_Confidence
; }
208 int GetCorpusCount() const { return linguistica::corpus_count::GetCorpusCount(); }
209 class CEarleyParser
* GetMyEarleyParser() const { return m_MyEarleyParser
;}
210 int GetNumberOfPrefixes() const { return m_PrefixList
.Size(); }
211 int GetNumberOfStems() const;
212 int GetNumberOfSuffixes() const { return m_SuffixList
.Size(); }
213 void GetPrefix(CParse
&) const;
214 class CStringSurrogate
GetPrefix() const;
215 CParse
* GetPrefixList() { return &m_PrefixList
; }
216 int GetPrefixLoc() const { return m_PrefixLoc
; }
217 class CPrefix
* GetPrefixPtr() const { return m_pPrefix
; }
218 class CSignature
* GetPrefixSignature() const { return m_pPrefixSignature
; }
219 int GetRegular() const { return m_Regular
; }
220 bool GetSimpleFlag() const { return m_SimpleFlag
; }
221 float GetSortingQuantity() const; // TODO
222 QString
GetSortingString();
223 void GetStem(CParse
&) const;
224 class CStringSurrogate
GetStem();
225 int GetStem2Loc() const { return m_Stem2Loc
; }
226 int GetStemLoc() const { return m_StemLoc
; }
227 CStem
* GetStemPtr() const { return m_pStem
; }
228 enum type
GetStemType() const { return m_StemType
; }
229 void GetSuffix(CParse
&) const;
230 class CStringSurrogate
GetSuffix() const;
231 CParse
* GetSuffixList() { return &m_SuffixList
; }
232 int GetSuffixLoc() const { return m_SuffixLoc
; }
233 class CSuffix
* GetSuffixPtr() const { return m_pSuffix
; }
234 class CSignature
* GetSuffixSignature() const { return m_pSuffixSignature
; }
235 int GetWordCount() const { return m_WordCount
; }
239 double GetUnigramLogProb() { return m_UnigramLogProb
; }
240 double GetBigramLogProb() { return m_BigramLogProb
; }
241 CParse
* GetTier1() { return &m_Phonology_Tier1
; }
242 const CParse
* GetTier1() const { return &m_Phonology_Tier1
; }
243 CParse
* GetTier2() { return &m_Phonology_Tier2
; }
244 const CParse
* GetTier2() const { return &m_Phonology_Tier2
; }
245 CParse
* GetTier1_Skeleton() { return &m_Phonology_Tier1_Skeleton
; }
246 const CParse
* GetTier1_Skeleton() const { return &m_Phonology_Tier1_Skeleton
; }
248 double GetUnigramComplexity() { return m_UnigramComplexity
; }
249 double GetBigramComplexity() { return m_BigramComplexity
; }
251 QList
<CStem
*>* GetWordPtrList() { return m_WordPtrList
; }
252 CStem
* GetWord(int wordno
) const { return m_WordPtrList
->at(wordno
);}
253 int GetNumberOfWords() const { return m_WordPtrList
->count(); }
254 enum type
GetWordType() const { return m_StemType
; }
256 bool HasAPrefix() const;
257 bool HasASuffix() const;
259 bool IsValid() const;
260 int SF(int) const; // SuccessorFrequency;
261 void StemListDisplay(class Q3ListView
* dest
, QMap
<QString
, QString
>* filter
= 0, int char_count
= 27);
262 void WordListDisplay(class Q3ListView
* dest
,
263 QMap
<QString
, QString
>* filter
= 0,
264 enum CWordListViewItem::display_mode
=
265 CWordListViewItem::MiniLexicon_MorphologyStuffFirst
,
266 int char_count
= 27);
268 // typical filter: m_pMyMini->GetOutFilter()
269 void OutputStem(class Q3TextStream
& outf
, int index
,
270 QMap
<QString
, QString
>* filter
);
271 void OutputWord(class Q3TextStream
& outf
, int index
,
272 QMap
<QString
, QString
>* filter
);
276 void AddNULLPrefix();
277 void AddNULLSuffix();
278 void AddPrefix(class CPrefix
*);
279 void AddPrefix(const class CStringSurrogate
&);
280 void AddSuffix(class CSuffix
*);
281 void AddSuffix(const class CStringSurrogate
&);
282 bool AddWord(CStem
*);
283 void AppendToConfidence(const QString string
) { m_Confidence
+= string
; }
284 void AttachPrefixSignature(class CSignature
*);
285 void AttachSuffixSignature(class CSignature
*);
286 void AttachWordAndSuffixalStem(CStem
*);
287 void AttachWordAndPrefixalStem(CStem
*);
288 void AttachWordStemAndPrefix(CStem
*, class CPrefix
*);
289 void AttachWordStemAndSuffix(CStem
*, class CSuffix
*);
291 class CSignature
* ChangeSuffixSignature(class CSignature
* pNewSig
);
292 void ClearPointers(); // to Stem, Suffix, Signature;
293 void ClearPrefixStemSplit();
294 void ClearRootSuffixSplit();
295 void CopyStemInformation(CStem
*);
296 void CopySuffixList(CParse
*);
298 void DeleteFactorization();
299 void DeletePrefix(); // Arabic morphology
300 void DetachPrefix(class CPrefix
*);
301 void DetachSuffix(class CSuffix
*);
303 void IncrementSuffixLocs();
304 void IncrementCompoundCount(double d
= 1.0) { m_CompoundCount
+= d
; }
305 void IncrementWordCount(int n
= 1);
308 void RepairSuffixList(const class CMiniLexicon
*);
309 void RemoveWordFromWordPtrList(CStem
*);
311 void SetAffixness(double d
) { m_Affixness
= d
; }
312 void SetCompoundCount(double d
) { m_CompoundCount
= d
; }
313 void SetConfidence(const QString conf
) { m_Confidence
= conf
; }
314 static void SetLexicon(CLexicon
* Lex
) { m_Lexicon
= Lex
; }
315 void SetNumberOfStems(int n
) { m_NumberOfStems
= n
; }
316 void SetPrefixLoc(int n
) { m_PrefixLoc
= n
; }
317 void SetPrefixPtr(class CPrefix
* pPre
) { m_pPrefix
= pPre
; }
318 void SetPrefixSignature(class CSignature
* pSig
) { m_pPrefixSignature
= pSig
; }
319 void SetStem2Loc(int n
) { m_Stem2Loc
= n
; }
320 void SetStemLoc(int n
) { m_StemLoc
= n
; }
321 void SetStemPtr(CStem
* pStm
) { m_pStem
= pStm
; }
322 void SetStemType(enum type e
) { m_StemType
= e
; }
323 void SetSuffixList(CParse
* pParse
) { m_SuffixList
= pParse
; }
324 void SetSuffixLoc(int n
) { m_SuffixLoc
= n
; }
325 void SetSuffixPtr(class CSuffix
* pSuf
) { m_pSuffix
= pSuf
; }
326 void SetSuffixSignature(class CSignature
* pSig
) { m_pSuffixSignature
= pSig
; }
327 void SetWordCount(int n
) { m_WordCount
= n
; }
328 void SetWordType(enum type WT
) { m_StemType
= WT
; }
329 void SetMyEarleyParser(class CEarleyParser
* parser
) { m_MyEarleyParser
= parser
; }
330 void SetPhonology_Tier1(CParse
*);
331 void ShiftStemSuffixBoundary(int);
332 void ShiftPrefixStemBoundary(int);
333 void SetHMM_LogProbability(double logprob
) { m_HMM_LogProbability
= logprob
;}
336 inline CParse
* CStem::GetPhonology_Tier1() { return &m_Phonology_Tier1
; }
337 inline CParse
* CStem::GetPhonology_Tier2() { return &m_Phonology_Tier2
; }
338 inline CParse
* CStem::GetPhonology_Tier1_Skeleton()
339 { return &m_Phonology_Tier1_Skeleton
; }
340 inline double CStem::GetLocalMI_TotalBoltzmannScore()
341 { return m_LocalMI_TotalBoltzmannScore
; }
342 inline double CStem::GetDistantMI_TotalBoltzmannScore()
343 { return m_DistantMI_TotalBoltzmannScore
; }