CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Signature.h
bloba7b4aa3dba7a58146983509bd89a5b3ae1ab99af
1 // Signature type for suffix/signature-based morphology discovery
2 // Copyright © 2009 The University of Chicago
3 #ifndef SIGNATURE_H
4 #define SIGNATURE_H
6 class CSignatureListViewItem;
7 class CSignature;
9 // See the CMiniLexicon class in MiniLexicon.h for an overview of
10 // suffix/signature-based discovery of morphology.
12 // for CSignatureListViewItem:
13 #include <Q3ListViewItem>
14 #include <QString>
15 template<class K, class V> class QMap;
17 // for CSignature:
18 #include "LParse.h"
19 #include <QString>
20 #include <QChar>
21 #include "Parse.h"
22 #include "AffixLocation.h"
23 template<class K, class V> class QMap;
24 template<class T> class QList;
26 /// A list view item for signatures.
27 /**
28 Contains all the necessary variables needed when displaying a
29 signature and its data in a list view.
31 class CSignatureListViewItem : public Q3ListViewItem {
32 CSignature* m_signature;
33 QMap<QString, QString>* m_filter;
34 QString m_label;
35 class Q3ListView* m_parentlist;
36 int m_mini;
37 public:
38 // construction/destruction.
40 explicit CSignatureListViewItem(class Q3ListView* parent = 0,
41 QString sig_text = QString(), int mini_index = -1,
42 CSignature* sig = 0, QMap<QString, QString>* filter = 0);
43 CSignatureListViewItem(Q3ListViewItem* parent,
44 QString sig_text = QString(), int mini_index = -1,
45 CSignature* sig = 0, QMap<QString, QString>* filter = 0);
47 // copy.
49 CSignatureListViewItem(const CSignatureListViewItem& x)
50 : Q3ListViewItem(x),
51 m_signature(x.m_signature),
52 m_filter(x.m_filter),
53 m_label(x.m_label),
54 m_parentlist(x.m_parentlist),
55 m_mini(x.m_mini) { }
56 CSignatureListViewItem& operator=(const CSignatureListViewItem& x)
58 Q3ListViewItem::operator=(x);
59 m_signature = x.m_signature;
60 m_filter = x.m_filter;
61 m_label = x.m_label;
62 m_parentlist = x.m_parentlist;
63 m_mini = x.m_mini;
64 return *this;
67 // Qt3 list view item interface.
69 virtual QString text(int column_index) const;
70 int compare(Q3ListViewItem* other, int column, bool ascending) const;
72 // underlying signature object.
74 CSignature* GetSignature() { return m_signature; }
75 void SetSignature(CSignature* pSig) { m_signature = pSig; }
77 /// enclosing widget
78 Q3ListView* GetParent() { return m_parentlist; }
81 /// A string implementation of signatures.
82 /**
83 Signatures are simple statements of morphological patterns, which aid both
84 in quantifying the Minimum Description Length (MDL) account and in constructively
85 building a satisfactory morphological grammar (for MDL offers no guidance in the
86 task of seeking the optimal analysis).
88 class CSignature: public CLParse {
89 public:
90 enum detachment_parameter {
91 eDo_Not_Call_Words,
92 eCall_Words,
94 protected:
95 enum eAffixLocation m_AffixLocation;
96 QList<class CStem*>* m_StemPtrList;
97 QList<class CStem*>* m_WordPtrList;
98 QList<class CSuffix*>* m_SuffixPtrList;
99 QList<class CPrefix*>* m_PrefixPtrList;
101 // XXX. we may want to do this with a bit-base
102 // vector someday to speed it up.
103 /// points to the indexes of the affixes
104 // class CSparseIntVector* m_Vector;
106 /// Count-related members
107 QVector<double> m_WordCounts;
108 QVector<double> m_StemCounts;
109 QVector<double> m_AffixCounts;
110 QVector<double> m_StemFrequencies;
111 QVector<double> m_AffixFrequencies;
112 QVector<double> m_WordFrequencies;
113 int m_TotalCount;
114 QString m_Remark;
116 // float* m_Frequencies;
119 /// the number of letters saved by having this signature
120 /// = ( Number of stems - 1 ) * ( Number of suffixes - 1);
121 /// Or, if there is a more robust signature which is
122 /// contained in this one.
123 /// It really measures how good our knowledge of these stems is, qua stems.
124 mutable double m_Robustness;
125 CSignature* m_Mentor;
126 QList<CSignature*>* m_MentorList;
127 CSignature* m_MyGeneralizer;
128 /// without the <e> stuff prefixed
129 CParse m_SimplifiedForm;
130 class CSignatureCollection* m_SignatureCollection;
132 /// These affixes are "close" to this signature, in the sense that
133 /// stems associated with this signature may also be marked to take
134 /// that affix.
135 CParse m_SatelliteAffixes;
137 // description length.
139 double m_DLofMyCorpus;
140 double m_DLofMyStemPointers;
141 double m_DLofMyAffixPointers;
142 double m_LengthOfPointerToMe;
143 public:
144 // construction/destruction.
146 CSignature(class CMiniLexicon* mini);
147 CSignature(enum eAffixLocation prefix_or_suffix,
148 class CMiniLexicon* mini);
149 CSignature(const CParse& affixes, class CMiniLexicon* mini);
150 /// single-affix signature
151 CSignature(const class CStringSurrogate& affix,
152 class CMiniLexicon* mini);
153 /// deprecated
154 CSignature(const CParse* affixes_ptr, class CMiniLexicon* mini);
155 ~CSignature();
157 // copy/assignment.
159 CSignature(const CSignature& x);
160 CSignature& operator=(const CSignature& x)
161 { *this = &x; return *this; }
162 /// deprecated
163 void operator=(const CSignature* x);
164 /// clear
165 void Suicide();
166 void ConsumeParse(CParse* sigs);
168 // convert to string.
170 /// includes deletees in affixes
171 QString Express(bool bDisplayDeletees = true);
172 // includes deletees in affixes
173 CSignature& Express(CSignature& Express, bool bDisplayDeletees = true);
174 QString Display(QChar sep, QMap<QString, QString>* filter = 0) const;
175 QString Display(QMap<QString, QString>* filter) const;
176 QString Display() const;
178 // counts.
180 /// number of occurences in corpus (corpus count).
181 /// output is a vector of integers whose length is the number of stems
182 /// times the number of suffixes.
183 int GetTotalCount() const;
184 int GetNumberOfWords() const;
185 double GetWordCount(int wordno) const;
186 double GetWordCount(int stemno, int affixno) const;
187 void SetWordCount(int stemno, int affixno, double value);
188 void CalculateWordCounts();
189 double GetStemCount(int stemno) const;
190 double GetAffixCount(int affixno) const;
191 double GetStemFrequency (int stemno) const;
192 double GetAffixFrequency(int affixno) const;
193 double GetWordFrequency(int stemno, int affixno) const;
194 int GetNumberOfAffixes() const;
195 int GetNumberOfStems() const;
196 void CalculateFrequencies(class CMiniLexicon* mini);
197 double GetCorpusCount() const;
198 void SetNewWordCounts (int size = 0);
199 void SetNewStemCount (int size = 0);
200 void SetNewAffixFrequencies(int size = 0);
201 void SetNewStemFrequencies(int size = 0);
203 // context (enclosing mini-lexicon).
205 /// affix location (prefix or suffix)
206 enum eAffixLocation GetAffixLocation() const;
207 void SetAffixLocation(enum eAffixLocation prefix_or_suffix) { m_AffixLocation = prefix_or_suffix; }
208 CSignatureCollection* GetSignatureCollection() { return m_SignatureCollection; }
209 void SetSignatureCollection(CSignatureCollection* list) { m_SignatureCollection = list; }
210 class CMiniLexicon* GetLexicon() const { return m_pMyMini; }
211 void SetLexicon(class CMiniLexicon* lex) { m_pMyMini = lex; }
213 /// human-readable string describing this signature’s discovery.
214 QString GetRemark() const;
215 void SetRemark(QString origin);
217 CSignature* GetGeneralizer() const { return m_MyGeneralizer; }
219 // contained affixes.
221 /// indices of affixes in the affix collection that holds them.
224 QList<CSuffix*>* GetSuffixPtrList() const;
225 QList<CPrefix*>* GetPrefixPtrList() const;
226 void SetNewStemPtrList ( QList< CStem* >& );
227 void CreateNewStemPtrList (int size);
228 void CreateNewSuffixPtrList (int size);
229 void CreateNewPrefixPtrList (int size);
230 CSuffix* GetSuffix (int n) const;
231 CPrefix* GetPrefix (int n) const;
232 void AppendSuffixPtr (CSuffix*);
233 void AppendPrefixPtr (CPrefix*);
234 void AppendStemPtr (CStem*) const;
235 bool StemListContains(CStem* );
236 void RemoveFromWordList(CStem* pWord);
237 void RecalculateStemAndWordPointers(); //jan 2010
239 // stems appearing with this sig.
241 CParse GetStems();
242 CStem* GetStem(int stemno) const;
243 QList<CStem*>* GetStemPtrList() const;
244 void ClearStemPtrList();
246 // words using this sig.
247 CParse GetWords();
248 CStem* GetWord(int stem, int suffix) const;
249 void AppendWordPointer(CStem* const);
251 // description length.
253 double GetDLofMyStemPointers();
254 double GetDLofMyAffixPointers();
255 virtual double ComputeDLofModel(int char_count = 26);
256 double FindCost(class CMiniLexicon* mini);
257 double ComputeDLofMyCorpus();
258 double GetLengthOfPointerToMe();
259 void SetLengthOfPointerToMe(double);
260 double GetSumOfDLofInternalPointers();
262 // part-of-speech discovery.
264 CSignature* GetMentor();
265 void SetMentor(CSignature* sig);
266 QList<CSignature*>* GetMentorList();
268 /// deprecated: index of a known suffix in the suffix list.
269 int GetNumber(CSuffix* suffix);
271 /// (#affixes - 1)(total length of stems) +
272 /// (#stems - 1)(total length of affixes)
273 double GetRobustness() const;
274 void SetRobustness(double value);
275 /// deprecated: synonym for GetRobustness
276 float GetSortingQuantity() const;
278 // insert.
280 /// register a stem appearing with this signature.
281 void operator<<(class CStem* stem);
282 /// Attach to Suffix Sig.
284 * This is the function to use to take a stem from one
285 * signature to a new one, and deal with all of the bookkeeping
286 * that goes with that. First it detaches the Stem from its
287 * old signature by using DetachStem() (see below);
288 * then it adds this stem to the New Signature;
289 * then it shifts all of the stem's words from the old signature
290 * to the new signature.
292 void AttachToSuffixSig(CStem* stem, bool bLookAtPreviousSig = true);
293 void AttachToPrefixSig(CStem* stem, bool bLookAtPreviousSig = true);
294 void AttachStemFromDifferentPrefixSignature(CStem* s) { AttachToPrefixSig(s); }
295 void AttachStemFromDifferentSuffixSignature(CStem* s) { AttachToSuffixSig(s); }
296 void AddWord(CStem* word);
297 /// add a satellite affix
298 /// (i.e. remember one which often occurs on stems that take
299 /// this signature)
300 void AppendSatelliteAffix(CParse& Affix);
301 void TakeAllStems(CSignature* other);
303 // remove.
305 void DetachWord(CStem* word, enum detachment_parameter);
306 /// remove a stem from a signature's Stem Ptr List, and if the
307 /// Detachment Parameter is true, also remove all of the stem's
308 /// WORDS from the signature's Word Ptr List.
309 /// This is called by AttachToSuffixSig().
310 void DetachStem(CStem* stem, enum detachment_parameter words_too);
311 bool RemoveStem(CStem* stem);
312 bool RemoveWord(CStem* word);
314 // output to GUI.
316 void BorrowedSigsDisplay(class Q3ListView* widget,
317 QMap<QString, QString>* filter = 0);
318 void ListDisplay(class Q3ListView* widget,
319 QMap<QString, QString>* filter = 0,
320 bool express_deletees = true);
322 /// output to file
323 void OutputSignatureXfst ( class QTextStream& outf, int count );
324 void OutputSignature(class QTextStream& out);
326 /// no empty affixes (but "NULL" as an affix is fine)
327 bool IsValid();
329 /// XXX. no-op
330 void FindCorpusCount();
332 /// XXX. unused
333 void SetMyGeneralizer(CSignature* sig);
335 // allomorphy and cut shifting helpers.
337 /// delete Letter from the beginning of each suffix, and
338 /// add it to each stem.
339 /// add a <Letter> to suffixes
340 /// Much like shifting boundary of stem/suffix split to the right.
341 void RemoveLetter(CStringSurrogate& Letter,
342 class CMiniLexicon* mini, CSignature* out);
343 /// add Letter to the beginning of each suffix
344 void AddLetter(const QString& Letter);
345 /// whether adding a letter at the start of each suffix still yield
346 /// known suffixes
347 bool EachSuffixCanHaveThisLetterPrefixedToIt(const QString& Letter);
348 void ShiftStemSuffixCutToTheLeft(int distance, const QString& Piece);
349 void ShiftStemSuffixCutToTheLeft(int distance);
350 bool Generalizes(CSignature* other);
352 CParse CreateADeletingSignature(CParse& Deleter,
353 class CMiniLexicon* mini);
355 /// This will look at words claimed by this signature,
356 /// and actively make cuts in them to match the affixes declared by the signature.
357 void CutMyWordsAsIDeclare();
359 /// entropy of final ngrams of stems
360 double ComputeFinalNgramEntropyOfStems(int n);
362 // real workers.
364 int CheckOut(class CMiniLexicon* mini);
365 void IterateThroughStems(int NumberOfLettersShifted,
366 class CMiniLexicon* Lexicon, CLParse* pPiece,
367 double& TotalDecreaseInDLDueToShorterStems,
368 double LogTotalNumberOfAnalyzedWords,
369 double& StemPointersToThisSig,
370 double& SavingsBecauseStemAlreadyExisted,
371 bool analyzingSuffixex);
373 friend class QTextStream& operator<<(class QTextStream& out,
374 CSignature* sigp);
377 #endif // SIGNATURE_H