CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Stem.h
blobce3eebc7ac6014d4337cdc5da19e2d055495470c
1 // Common type for stems to which to attach affixes and words to analyze
2 // Copyright © 2009 The University of Chicago
3 #ifndef STEM_H
4 #define STEM_H
6 class CStem;
8 #include "LParse.h"
9 #include <QString>
10 #include <QMap>
11 #include "SparseVector.h"
12 #include "Parse.h"
13 #include "StemListViewItem.h"
14 template<class V> class QList;
16 /// CStem objects are the key to signature discovery.
17 ///
18 /// A CStem represents a morpheme to which an affix might be attached
19 /// on the left or right. The CStem itself may be further analyzed,
20 /// so that in the representation as stem + affix each word has an
21 /// implied parse tree.
22 ///
23 /// A single instance represents all uses of a stem, and its main
24 /// purpose is to remember the list of affixes it appears with so
25 /// they can be considered as a potential signature. In the
26 /// prefix/suffix-based morphology code, first the affix and stem
27 /// collections are built together and then the signature collection is
28 /// built using data stored with the stems.
29 ///
30 /// Once a signature is built, CStem instances maintain the information
31 /// a signature needs to remember about each stem it appears with. In
32 /// particular, the corpus count of a stem is kept here.
33 ///
34 /// Compound discovery code uses CStem instances in a similar way:
35 /// the various parses of a word, the likelihood that a parse is of the
36 /// form stem + affix rather than stem + stem, and other relevant data
37 /// are maintained in CStem instances before compound objects are built,
38 /// and afterwards the compound objects defer to the underlying CStem
39 /// for some relevant counts.
40 ///
41 /// Lastly, phonological information in Linguistica is stored per-word,
42 /// in the hope that it can help improve the discovery of
43 /// affix boundaries and compounds. In particular, the representation
44 /// length (information content) of a stem or word disregarding
45 /// morphology can be used in calculating its description length
46 /// for building more complex models.
47 ///
48 /// Sometimes a CStem is used in ways that emphasize its role as
49 /// something to be analyzed (i.e., something to be split: an entire
50 /// word or a stem from a MiniLexicon’s word collection), while at
51 /// other times, a CStem is primarily a constituent for other stems or
52 /// compounds. We say in the former case that the CStem is “playing
53 /// the role of a word”, and in the latter, “it plays the role of a
54 /// stem”, even though in both cases the representation is the same.
55 /// This distinction comes up when displaying a word collection or
56 /// stem collection through the UI: only the properties relevant to
57 /// the role considered are displayed.
// NOTE(review): in this listing every line carries a leading number, and the
// numbering skips 74, 78, 82 and 334. The closing braces of the three enums
// and of the class itself are not present here -- confirm against the
// original header before compiling.
58 class CStem : public CLParse {
59 protected:
60 static class CLexicon* m_Lexicon; ///< Static: one pointer shared by every CStem; assigned through SetLexicon().
61 public:
// Classification of a stem or word. Each enumerator is a distinct power of
// two. A value of this type is stored in m_StemType and is handed back by
// both GetStemType() and GetWordType().
62 enum type {
63 NORMAL = 1,
64 STEM_PLUS_SUFFIX = 2,
65 STEM_NORMAL = 4,
66 STEM_COMPOUND = 8,
67 BIWORD_COMPOUND = 16,
68 MULTIPLE_COMPOUND = 32,
69 POSSIBLE_COMPOUND = 64,
70 NUMBER = 128,
71 UNKNOWN = 256,
72 ENDS_IN_HYPHEN = 512,
73 POLYWORD_PIECE = 1024,
// Parameter type of CreatePhonologyFromOrthography(), whose default
// argument is BOUNDARIES.
75 enum eAddBoundarySymbols {
76 NO_BOUNDARIES,
77 BOUNDARIES,
// Parameter type of SplitPhonologyToTiers().
79 enum ePhonologySplitType {
80 Split_LeaveSlot,
81 Split_LeaveCopy,
83 protected:
84 int m_WordCount; ///< Read by GetWordCount(), overwritten by SetWordCount().
85 CParse* m_BrokenForm; // NOTE(review): presumably backs DisplayBrokenForm() -- confirm in Stem.cpp.
86 CParse m_SuffixList; //sister affixes, not daughter
87 class CSignature* m_pSuffixSignature; ///< Returned by GetSuffixSignature(), set by SetSuffixSignature().
88 class CSignature* m_pPrefixSignature; ///< Returned by GetPrefixSignature(), set by SetPrefixSignature().
89 CParse m_PrefixList; //sister affixes, not daughter
90 int m_Regular; ///< Returned by GetRegular().
91 bool m_SimpleFlag; //if TRUE, then it's not further decomposable.
92 enum type m_StemType; ///< Single field behind Get/SetStemType() and Get/SetWordType().
93 int m_StemLoc; ///< Returned by GetStemLoc(), set by SetStemLoc().
94 int m_Stem2Loc; ///< Returned by GetStem2Loc(), set by SetStem2Loc().
95 int m_NumberOfStems; ///< Set by SetNumberOfStems().
96 int m_PrefixLoc; ///< Returned by GetPrefixLoc(), set by SetPrefixLoc().
97 int m_SuffixLoc; ///< Returned by GetSuffixLoc(), set by SetSuffixLoc().
98 QString m_Confidence; ///< Replaced by SetConfidence(), extended by AppendToConfidence().
99 CStem* m_pStem; ///< Returned by GetStemPtr(), set by SetStemPtr().
100 CParse m_strStem;
101 CParse m_strSuffix;
102 CParse m_strPrefix;
103 class CSuffix* m_pSuffix; ///< Returned by GetSuffixPtr(), set by SetSuffixPtr().
104 class CPrefix* m_pPrefix; ///< Returned by GetPrefixPtr(), set by SetPrefixPtr().
105 mutable double m_LengthOfPointerToMe; ///< Based on corpus counts, and the Stem collection set.
107 QList<CStem*>* m_WordPtrList; ///< Dereferenced without a null check by GetWord() and GetNumberOfWords().
108 CSparseVector m_LeftNeighbors;
109 CSparseVector m_RightNeighbors;
111 // compounding.
112 class CEarleyParser* m_MyEarleyParser; ///< Returned by GetMyEarleyParser(), set by SetMyEarleyParser().
113 double m_CompoundCount; ///< Set by SetCompoundCount(), increased by IncrementCompoundCount().
114 double m_Affixness; ///< Returned by GetAffixness(), set by SetAffixness().
116 // phonology.
117 CParse m_Phonology_Tier1; ///< Address returned by GetTier1() and GetPhonology_Tier1().
118 CParse m_Phonology_Tier2; ///< Address returned by GetTier2() and GetPhonology_Tier2().
119 CParse m_Phonology_Tier1_Skeleton; ///< Address returned by GetTier1_Skeleton() and GetPhonology_Tier1_Skeleton().
121 double m_UnigramLogProb;
122 double m_BigramLogProb;
124 double m_BigramComplexity; // average log prob
125 double m_UnigramComplexity; // average log prob
126 mutable double m_PhonologicalContent; // value depends on what we have computed so far.
127 double m_HMM_LogProbability; ///< Returned by GetHMM_LogProbability(), set by SetHMM_LogProbability().
129 // First Boltzmann model: only MI between tier 2 neighbors:
130 double m_Tier2_LocalMI_Score; //Only the MI on tier 2
131 double m_LocalMI_TotalBoltzmannScore; // Total score, including tier 1 bigram score
132 double m_LocalMI_Plog;
134 // Second Boltzmann model: MI between more distant tier 2 elements
135 double m_Tier2_DistantMI_Score;
136 double m_DistantMI_TotalBoltzmannScore;
137 double m_DistantMI_Plog;
139 public:
140 // Some Tier one Phonology Info for Graphica display
// These are public data members; no accessor for them is declared in this class.
141 QMap<int, QString> m_phonologies;
142 QMap<int, double> m_unigrams;
143 QMap<int, double> m_mis;
144 int m_countofunigrams;
145 int m_countofmis;
146 double m_maxpositive;
147 double m_maxnegative;
148 bool m_donephonology;
150 public:
151 // construction/destruction.
// None of these constructors is declared explicit, so every one that can be
// called with a single argument also acts as an implicit conversion to CStem.
153 CStem(class CMiniLexicon* mini = 0);
154 CStem(const CStem& x);
155 CStem(const class CStringSurrogate&, class CMiniLexicon* mini = 0);
156 CStem(const CLParse&);
157 CStem(const CParse&, class CMiniLexicon* mini = 0);
158 virtual ~CStem();
160 // copy assignment.
// operator= returns void rather than CStem&, so assignments cannot be chained.
// Copy() takes its argument by non-const reference.
162 void operator=(const CStem &);
163 void Copy(CStem&);
165 // description length.
166 double GetLengthOfPointerToMe() const;
167 double GetLengthOfPointerToMe_2(); ///< deprecated
168 void SetLengthOfPointerToMe(double L) { m_LengthOfPointerToMe = L; }
169 double CalculatePhonologicalInformationContent(class CLexicon*) const;
170 double GetPhonologicalInformationContent(class CLexicon* = 0) const;
171 float CalculateDL() const;
173 // phonology.
// The three GetPhonology_* accessors and the two *_TotalBoltzmannScore
// getters are only declared here; their inline definitions follow the class.
174 CParse* GetPhonology_Tier1();
175 CParse* GetPhonology_Tier2();
176 CParse* GetPhonology_Tier1_Skeleton();
177 double GetTier2_LocalMI_Score() { return m_Tier2_LocalMI_Score; }
178 double GetLocalMI_TotalBoltzmannScore();
179 double GetLocalMI_Plog() { return m_LocalMI_Plog; }
180 double GetTier2_DistantMI_Score() { return m_Tier2_DistantMI_Score; }
181 double GetDistantMI_TotalBoltzmannScore();
182 double GetDistantMI_Plog() { return m_DistantMI_Plog; }
183 double GetHMM_LogProbability() { return m_HMM_LogProbability; }
184 void ComputeProbabilities(class CWordCollection* words);
185 void ComputeBoltzmannProbabilities(double Z, double ZStar);
186 void GetPhonogyTier1InfoForGraph(class CWordCollection* words);
187 QString GetProbabilityInformation();
188 void SplitPhonologyToTiers(enum ePhonologySplitType type,
189 CParse& PhonesToMove);
190 void CreateCVTemplate(CParse* Vowels);
191 void CreatePhonologyFromOrthography(enum eAddBoundarySymbols = BOUNDARIES);
194 bool IsAnalyzed();
195 bool ContainsPrefix(class CPrefix*) const;
196 bool ContainsPrefix(const class CStringSurrogate&) const;
197 bool ContainsSuffix(class CSuffix*) const;
199 CParse DisplayBrokenForm(); // for MT, etc.
200 QString DisplayStemType() const;
201 // TODO: get CRule int FindRule (CStem*, CRule&) const;
203 // accessors
205 double GetAffixness() const { return m_Affixness; }
206 double GetCompoundCount() const { return m_CompoundCount; }
207 QString GetConfidence() const { return m_Confidence; }
// Forwards to linguistica::corpus_count::GetCorpusCount(); the count itself
// is not stored in a member declared in this class.
208 int GetCorpusCount() const { return linguistica::corpus_count::GetCorpusCount(); }
209 class CEarleyParser* GetMyEarleyParser() const { return m_MyEarleyParser;}
210 int GetNumberOfPrefixes() const { return m_PrefixList.Size(); }
211 int GetNumberOfStems() const;
212 int GetNumberOfSuffixes() const { return m_SuffixList.Size(); }
213 void GetPrefix(CParse&) const;
214 class CStringSurrogate GetPrefix() const;
215 CParse* GetPrefixList() { return &m_PrefixList; }
216 int GetPrefixLoc() const { return m_PrefixLoc; }
217 class CPrefix* GetPrefixPtr() const { return m_pPrefix; }
218 class CSignature* GetPrefixSignature() const { return m_pPrefixSignature; }
219 int GetRegular() const { return m_Regular; }
220 bool GetSimpleFlag() const { return m_SimpleFlag; }
221 float GetSortingQuantity() const; // TODO
222 QString GetSortingString();
223 void GetStem(CParse&) const;
224 class CStringSurrogate GetStem();
225 int GetStem2Loc() const { return m_Stem2Loc; }
226 int GetStemLoc() const { return m_StemLoc; }
227 CStem* GetStemPtr() const { return m_pStem; }
228 enum type GetStemType() const { return m_StemType; }
229 void GetSuffix(CParse&) const;
230 class CStringSurrogate GetSuffix() const;
231 CParse* GetSuffixList() { return &m_SuffixList; }
232 int GetSuffixLoc() const { return m_SuffixLoc; }
233 class CSuffix* GetSuffixPtr() const { return m_pSuffix; }
234 class CSignature* GetSuffixSignature() const { return m_pSuffixSignature; }
235 int GetWordCount() const { return m_WordCount; }
238 //Phonology
239 double GetUnigramLogProb() { return m_UnigramLogProb; }
240 double GetBigramLogProb() { return m_BigramLogProb; }
// Each tier accessor comes as a const / non-const pair returning the address
// of the corresponding member.
241 CParse* GetTier1() { return &m_Phonology_Tier1; }
242 const CParse* GetTier1() const { return &m_Phonology_Tier1; }
243 CParse* GetTier2() { return &m_Phonology_Tier2; }
244 const CParse* GetTier2() const { return &m_Phonology_Tier2; }
245 CParse* GetTier1_Skeleton() { return &m_Phonology_Tier1_Skeleton; }
246 const CParse* GetTier1_Skeleton() const { return &m_Phonology_Tier1_Skeleton; }
248 double GetUnigramComplexity() { return m_UnigramComplexity; }
249 double GetBigramComplexity() { return m_BigramComplexity; }
// GetWord() and GetNumberOfWords() dereference m_WordPtrList directly, with
// no null check.
251 QList<CStem*>* GetWordPtrList() { return m_WordPtrList; }
252 CStem* GetWord(int wordno) const { return m_WordPtrList->at(wordno);}
253 int GetNumberOfWords() const { return m_WordPtrList->count(); }
254 enum type GetWordType() const { return m_StemType; } // same field as GetStemType()
256 bool HasAPrefix() const;
257 bool HasASuffix() const;
259 bool IsValid() const;
260 int SF(int) const; // SuccessorFrequency;
261 void StemListDisplay(class Q3ListView* dest, QMap<QString, QString>* filter = 0, int char_count = 27);
262 void WordListDisplay(class Q3ListView* dest,
263 QMap<QString, QString>* filter = 0,
264 enum CWordListViewItem::display_mode =
265 CWordListViewItem::MiniLexicon_MorphologyStuffFirst,
266 int char_count = 27);
268 // typical filter: m_pMyMini->GetOutFilter()
269 void OutputStem(class Q3TextStream& outf, int index,
270 QMap<QString, QString>* filter);
271 void OutputWord(class Q3TextStream& outf, int index,
272 QMap<QString, QString>* filter);
274 // mutators.
276 void AddNULLPrefix();
277 void AddNULLSuffix();
278 void AddPrefix(class CPrefix*);
279 void AddPrefix(const class CStringSurrogate&);
280 void AddSuffix(class CSuffix*);
281 void AddSuffix(const class CStringSurrogate&);
282 bool AddWord(CStem*);
283 void AppendToConfidence(const QString string) { m_Confidence += string; } // QString taken by value
284 void AttachPrefixSignature(class CSignature*);
285 void AttachSuffixSignature(class CSignature*);
286 void AttachWordAndSuffixalStem(CStem*);
287 void AttachWordAndPrefixalStem(CStem*);
288 void AttachWordStemAndPrefix(CStem*, class CPrefix*);
289 void AttachWordStemAndSuffix(CStem*, class CSuffix*);
291 class CSignature* ChangeSuffixSignature(class CSignature* pNewSig);
292 void ClearPointers(); // to Stem, Suffix, Signature;
293 void ClearPrefixStemSplit();
294 void ClearRootSuffixSplit();
295 void CopyStemInformation(CStem*);
296 void CopySuffixList(CParse*);
298 void DeleteFactorization();
299 void DeletePrefix(); // Arabic morphology
300 void DetachPrefix(class CPrefix*);
301 void DetachSuffix(class CSuffix*);
303 void IncrementSuffixLocs();
304 void IncrementCompoundCount(double d = 1.0) { m_CompoundCount += d; }
305 void IncrementWordCount(int n = 1);
308 void RepairSuffixList(const class CMiniLexicon*);
309 void RemoveWordFromWordPtrList(CStem*);
311 void SetAffixness(double d) { m_Affixness = d; }
312 void SetCompoundCount(double d) { m_CompoundCount = d; }
313 void SetConfidence(const QString conf) { m_Confidence = conf; } // QString taken by value
314 static void SetLexicon(CLexicon* Lex) { m_Lexicon = Lex; } // static: changes the pointer seen by every CStem
315 void SetNumberOfStems(int n) { m_NumberOfStems = n; }
316 void SetPrefixLoc(int n) { m_PrefixLoc = n; }
317 void SetPrefixPtr(class CPrefix* pPre) { m_pPrefix = pPre; }
318 void SetPrefixSignature(class CSignature* pSig) { m_pPrefixSignature = pSig; }
319 void SetStem2Loc(int n) { m_Stem2Loc = n; }
320 void SetStemLoc(int n) { m_StemLoc = n; }
321 void SetStemPtr(CStem* pStm) { m_pStem = pStm; }
322 void SetStemType(enum type e) { m_StemType = e; }
// NOTE(review): assigns a CParse* to the CParse member m_SuffixList. What
// that does (copy of the pointee or otherwise) depends on an assignment
// operator declared outside this file -- confirm in Parse.h.
323 void SetSuffixList(CParse* pParse) { m_SuffixList = pParse; }
324 void SetSuffixLoc(int n) { m_SuffixLoc = n; }
325 void SetSuffixPtr(class CSuffix* pSuf) { m_pSuffix = pSuf; }
326 void SetSuffixSignature(class CSignature* pSig) { m_pSuffixSignature = pSig; }
327 void SetWordCount(int n) { m_WordCount = n; }
328 void SetWordType(enum type WT) { m_StemType = WT; } // writes the same field as SetStemType()
329 void SetMyEarleyParser(class CEarleyParser* parser) { m_MyEarleyParser = parser; }
330 void SetPhonology_Tier1(CParse*);
331 void ShiftStemSuffixBoundary(int);
332 void ShiftPrefixStemBoundary(int);
333 void SetHMM_LogProbability(double logprob) { m_HMM_LogProbability = logprob;}
/// Gives mutable access to the tier-1 phonology member by returning its address.
336 inline CParse* CStem::GetPhonology_Tier1() { CParse* const tier1 = &this->m_Phonology_Tier1; return tier1; }
/// Gives mutable access to the tier-2 phonology member by returning its address.
337 inline CParse* CStem::GetPhonology_Tier2() { CParse* const tier2 = &this->m_Phonology_Tier2; return tier2; }
/// Gives mutable access to the tier-1 skeleton member by returning its address.
338 inline CParse* CStem::GetPhonology_Tier1_Skeleton()
339 { CParse* const skeleton = &this->m_Phonology_Tier1_Skeleton; return skeleton; }
/// Returns the stored m_LocalMI_TotalBoltzmannScore value unchanged.
340 inline double CStem::GetLocalMI_TotalBoltzmannScore()
341 { const double localTotal = this->m_LocalMI_TotalBoltzmannScore; return localTotal; }
/// Returns the stored m_DistantMI_TotalBoltzmannScore value unchanged.
342 inline double CStem::GetDistantMI_TotalBoltzmannScore()
343 { const double distantTotal = this->m_DistantMI_TotalBoltzmannScore; return distantTotal; }
345 #endif // STEM_H