CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / MiniLexicon.h
blob d5f92782f68ac6d658221eaa8e8b8cf949c88a6d
1 // Sub-stem morphology model and driver
2 // Copyright © 2009 The University of Chicago
3 #ifndef MINILEXICON_H
4 #define MINILEXICON_H
6 class CMiniLexicon;
8 #include <QMap>
9 #include "generaldefinitions.h"
10 #include "AffixLocation.h"
11 class QString;
12 class CStringSurrogate;
14 /// All analyses at the sub-stem level (non-compound) are done in the
15 /// lexicon’s mini-lexica.
///
/// A mini-lexicon holds pointers to its own word, stem, prefix, suffix,
/// signature and part-of-speech collections, plus a back-pointer to the
/// parent CLexicon and its index in that parent. Copying is disabled
/// (copy constructor and assignment operator are declared private).
// NOTE(review): the integer at the start of each line looks like a line
// number carried over from the web view this text was captured from; it
// is not part of the C++ source — confirm against the repository copy.
16 class CMiniLexicon {
17 public:
/// Selector accepted by AddToWordCollection(CWordCollection*, which_words).
// NOTE(review): presumably chooses words by analysis status; confirm in
// the implementation file.
18 enum which_words {
19 WW_All,
20 WW_AnalyzedOnly,
21 WW_UnanalyzedOnly,
// NOTE(review): no closing `};` for this enum is visible here (web line 22
// is absent); it was probably lost in capture — verify.
23 private:
24 /// The parent of this mini-lexicon
25 class CLexicon* m_pLexicon;
27 /// The index of this mini in its parent's mini-lexicon list.
28 int m_Index;
30 /// A flag designating the affix type (prefix|suffix). This is also
31 /// noted by the signatures and their collection.
32 enum eAffixLocation m_AffixLocation;
34 /// This mini-lexicon's local copy of words will be the basis for all
35 /// analyses within the mini-lexicon. The suffixes, prefixes, stems and
36 /// signatures are all discovered locally in this mini-lexicon. The
37 /// mini-lexicon has the responsibility to update the global collection.
38 class CWordCollection* m_pWords;
39 class CSuffixCollection* m_pSuffixes;
40 class CPrefixCollection* m_pPrefixes;
41 class CStemCollection* m_pStems;
42 class CSignatureCollection* m_pSignatures;
/// Part-of-speech collection; exposed by GetPOS() and maintained by
/// FindMajorSignatures().
43 class LxPoSCollection* m_pPOS;
/// Description-length object exposed by GetDescriptionLength().
44 class CDescriptionLength* m_DescriptionLength;
// NOTE(review): the two doubles below are presumably cached results of
// the unanalyzed-word calculations declared further down — confirm.
45 double m_CorpusCountOfUnanalyzedWords;
46 double m_PhonologicalInformationOfUnanalyzedWords;
/// GUI-side word collection; see GetGUIWords().
48 class GUIWordCollection* m_GUIWords;
50 QMap<QString, class CDatum> m_DataMap; // for deMarcken JG
52 // class FSA* m_pFSA;
53 public:
54 // construction/destruction.
/// Construct a mini-lexicon.
/// @param parent        parent lexicon (defaults to null)
/// @param index         position in the parent's mini-lexicon list
///                      (defaults to -1)
/// @param affixLocation affix type (defaults to STEM_FINAL)
56 CMiniLexicon(CLexicon* parent = 0, int index = -1,
57 enum eAffixLocation affixLocation = STEM_FINAL);
58 ~CMiniLexicon();
60 // disable copy.
// Declared private so that outside code cannot copy or assign a
// mini-lexicon.
61 private:
62 CMiniLexicon(const CMiniLexicon& x);
63 CMiniLexicon& operator=(const CMiniLexicon& x);
64 public:
66 // assign.
68 void ClearAll();
70 // location in parent lexicon.
/// Returns m_Index, this mini-lexicon's index in its parent.
72 int GetIndex() { return m_Index; }
/// Returns m_pLexicon, the parent lexicon.
73 CLexicon* GetLexicon() { return m_pLexicon; }
74 class CCorpusWord* FindAWord(class CStem* stem, class CSuffix* suffix);
75 /// total number of (not necessarily unique) words in corpus
76 int GetCorpusCount();
77 int GetNumberOfCharacterTypes();
78 int GetMiniCount();
79 int GetMiniSize();
80 CMiniLexicon* GetMiniLexicon(int index);
82 // parameters.
/// Returns m_AffixLocation, the current affix type.
84 enum eAffixLocation GetAffixLocation() { return m_AffixLocation; }
85 /// Set affix type.
86 /// If affix type has changed from initial to final or vice-versa,
87 /// this entails forgetting any previously discovered affixes.
88 /// If this further entails forgetting some discovered stems,
89 /// do nothing; result is false.
90 /// Result is true on success.
91 bool SetAffixLocation(enum eAffixLocation prefix_or_suffix);
// NOTE(review): presumably looks up an integer setting by key and falls
// back to default_value when absent — confirm in the implementation.
92 int GetIntParameter(QString key, int default_value);
94 // analyzed and unanalyzed words.
/// Returns m_pWords, this mini-lexicon's word collection.
96 CWordCollection* GetWords() { return m_pWords; }
97 void AddToWordCollection(CStemCollection* words);
/// Overload taking a word collection; `which` defaults to
/// WW_UnanalyzedOnly.
98 void AddToWordCollection(CWordCollection* words,
99 enum which_words which = WW_UnanalyzedOnly);
100 int GetNumberOfUnanalyzedWords();
101 int GetCorpusCountOfUnanalyzedWords();
// NOTE(review): unanalyzed_count is a non-const reference, so it looks
// like an output parameter — confirm.
102 int GetNumberOfAnalyzedWords(int& unanalyzed_count);
103 CStem* GetWordFromStemSuffix(CStem*, CSuffix*);
104 CStem* GetWordFromStemPrefix(CStem*, class CPrefix*);
106 // stem/signature/suffix model of morphology.
// The four accessors below return the corresponding member pointers
// (m_pPrefixes, m_pSignatures, m_pStems, m_pSuffixes) unchanged.
108 CPrefixCollection* GetPrefixes() { return m_pPrefixes; }
109 CSignatureCollection* GetSignatures() { return m_pSignatures; }
110 CStemCollection* GetStems() { return m_pStems; }
111 CSuffixCollection* GetSuffixes() { return m_pSuffixes; }
112 int GetNumberOfStems();
113 int GetNumberOfSuffixes();
115 // linguistic analysis (stem/signature/suffix discovery).
117 CPrefixCollection* FindPrefixes();
118 /// If affix type is not suffix, do nothing.
119 /// Otherwise, apply various heuristics to fill this mini-lexicon’s
120 /// suffix collection. Result points to the suffix collection.
121 CSuffixCollection* FindSuffixes();
122 void FindSingletonSignatures();
123 /// Analyze unanalyzed words using known stems and known affixes.
124 /// XXX. If a word can be analyzed in two ways as stem + suffix,
125 /// this function does not assess their relative merits.
126 /// Instead, it just picks the version with the longer suffix.
127 /// The situation is even worse for prefixes: the version with
128 /// the stem earliest in alphabetical order is used.
129 void ExtendKnownStemsToKnownAffixes();
130 /// Rebuild signature list through knowledge of stems and their
131 /// associated affixes.
132 void FindAllSignatures();
133 void FromStemsFindAffixes();
134 /// Find new signatures using unanalyzed words and known affixes.
135 /// More precisely:
137 /// For each word, if it ends with a known suffix (resp starts with
138 /// a known prefix), consider the corresponding stem text.
140 /// If adding that new stem (and the corresponding signature from
141 /// analyzing other words with it) decreases description length,
142 /// record the new stem, signature, and analyzed words.
143 void LooseFit();
144 void RebuildAffixesStemsAndSignaturesFromWordSplits(
145 CStringSurrogate& remark);
// All collection arguments below default to null.
// NOTE(review): a null argument presumably means "use this
// mini-lexicon's own collection" — confirm in the implementation.
146 void TakeSplitWords_ProduceStemsAndSigs(CStringSurrogate& Remark,
147 CWordCollection* words = 0, CStemCollection* stems = 0,
148 CPrefixCollection* prefixes = 0,
149 CSuffixCollection* suffixes = 0);
150 /// successor-frequency algorithm (bootstrapping step)
151 void TakeSignaturesFindStems(CSignatureCollection* Sigs = 0);
152 void CheckSignatures();
154 // description length.
/// Returns m_DescriptionLength.
156 CDescriptionLength* GetDescriptionLength() { return m_DescriptionLength; }
157 double CalculateDescriptionLength();
158 double CalculateSumOfPointersToMyUnanalyzedWords(enum eMDL_STYLE style);
159 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
160 class CDLHistory* GetDLHistory();
161 double CalculateCompressedLengthOfUnanalyzedWords();
163 // part-of-speech discovery (does not affect description length).
/// Returns m_pPOS.
165 LxPoSCollection* GetPOS() { return m_pPOS; }
166 /// Build or update m_pPOS to reflect top discovered signatures
167 void FindMajorSignatures();
169 // output.
171 //FSA* GetFSA() { return m_pLexicon->m_pFSA; }
172 bool LogFileOn();
173 class QTextStream* GetLogFile();
174 QMap<QString, QString>* GetOutFilter();
176 // output to GUI.
178 /// set text for “command line” pane
179 void AddToScreen(QString info);
180 /// handle for UI operations
181 class LinguisticaMainWindow* GetDocument();
182 GUIWordCollection* GetGUIWords();
184 // CDatum* LookUp(CStringSurrogate&);
186 // allomorphy.
188 void FindAllomorphy();
189 void RelateStems();
190 void ShiftFinalLetterToStem(class CStem* pStem, QString& FinalLetter);
// The two int& parameters are non-const references.
// NOTE(review): they look like outputs for the two counts named in the
// function — confirm.
191 void HowManyStemsWithThisSuffixEndInThisLetter(CStringSurrogate& Suffix,
192 CStringSurrogate& Letter, int& TotalStemsWithSuffix,
193 int& HowManyEndWithThisLetter);
// Two overloads: the first suffix is given either as a CStringSurrogate
// or as a QString.
194 int StemsWithBothSuffixes(
195 CStringSurrogate& suf1, CStringSurrogate& suf2);
196 int StemsWithBothSuffixes(QString suf1, CStringSurrogate& suf2);
// Two overloads: Letters is given either as a CStringSurrogate& or as a
// QString.
197 void MoveWordsStemSuffixBoundaryToRight(class CSignature* pSigToChange,
198 CStringSurrogate& Letters, class CParse* pSuffixCandidates);
199 void MoveWordsStemSuffixBoundaryToRight(class CSignature* pSigToChange,
200 QString Letters, class CParse* pSuffixCandidates);
// Two overloads that differ only in the last parameter: a
// QMap<QString, QString>& Remapper or a CParse* pSuffixCandidates.
201 class CParse CreateADeletingSignature(class CSignature* pSig,
202 CStringSurrogate Deletee, class CParse& ReplacingSuffixes,
203 int* bStatus, class CParse& WhatSigWillBecome,
204 QMap<QString, QString>& Remapper);
205 class CParse CreateADeletingSignature(class CSignature* pSig,
206 CStringSurrogate Deletee, class CParse& ReplacingSuffixes,
207 int* bStatus, class CParse& WhatSigWillBecome,
208 class CParse* pSuffixCandidates);
210 // LogFile functions
// Overload families for writing titles, table/row delimiters, headers
// and rows of QString/int/double values to the log file. Formatting is
// defined in the implementation file, which is not visible here.
211 void LogFileSmallTitle(QString);
212 void LogFileSmallTitle(QString, QString);
213 void LogFileSmallTitle(QString, QString, QString);
214 void LogFileLargeTitle(QString);
215 void LogFileStartTable();
216 void LogFileEndTable();
217 void LogFileStartRow();
218 void LogFileStartRow(QString);
219 void LogFileEndRow();
220 void LogFile (QString);
221 void LogFile (double);
222 void LogFile1SimpleString(QString); // no row start or end
223 void LogFileSimpleString(QString); // no row start or end
224 void LogFileSimpleInteger(int); // no row start or end
225 void LogFileSimpleDouble(double); // no row start or end
226 void LogFile (QString, QString);
227 void LogFile (QString, QString, QString);
228 void LogFile (QString, QString, QString, QString);
229 void LogFile( QString, QString, QString, QString, QString);
230 void LogFile( QString, QString, QString, QString, QString, QString);
231 void LogFile (QString, int);
232 void LogFile(int, QString);
233 void LogFile(int, double, QString);
234 void LogFile (QString, double);
235 void LogFile(QString, int, double);
236 void LogFile(QString, int, int, double, double, double);
237 void LogFileHeader(QString);
238 void LogFileHeader ( QString, QString);
239 void LogFileHeader (QString, QString, QString);
240 void LogFileHeader( QString, QString, QString, QString, QString, QString);
// NOTE(review): no closing `};` for the class is visible before the
// include-guard #endif (web lines 241-243 are absent); it was probably
// lost in capture — verify.
244 #endif // MINILEXICON_H