CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Lexicon.h
blob0ff1e5772c84f1abcb3b98a22edc544f066a8938
1 // driver and model for linguistica analyses
2 // Copyright © 2009 The University of Chicago
3 #ifndef LEXICON_H
4 #define LEXICON_H
6 class CLexicon;
8 #include <QStringList>
9 #include <Q3PtrVector>
10 #include <QList>
11 #include <Q3Dict>
12 #include <QMap>
13 #include "generaldefinitions.h"
14 class QString;
15 class CStringSurrogate;
16 namespace linguistica { namespace ui {
17 struct status_user_agent;
18 } }
20 // compatibility typedefs.
// Short aliases for the Qt container instantiations used with the
// morphology classes.  Each `class X` inside a template argument is an
// elaborated type specifier, so CStem, CPrefix, CSignature and CSuffix
// are named here without their definitions being visible in this header.
// Note that StringToStem maps to CStem objects by value, whereas
// StringToPtrCStem maps to CStem pointers.
22 typedef QMap<QString, class CStem> StringToStem;
23 typedef QMap<QString, int> StringToInt;
24 typedef QMap<QString, QString> StringToString;
25 typedef QList<class CPrefix*> PrefixSet;
26 typedef QList<class CSignature*> SignatureSet;
27 typedef QList<class CSuffix*> SuffixSet;
28 typedef QMap<QString, class CStem*> StringToPtrCStem;
29 typedef QList<class CStem*> StemSet;
31 /// The lexicon contains all the morphological collections.
32 /// These include words, stems, affixes, signatures, and other structures
33 /// used by different Linguistica functions.
34 class CLexicon {
35 protected:
// NOTE(review): the pointer members below are raw pointers.  This header
// does not show which of them the lexicon allocates or frees; check the
// constructor, destructor and ClearAll() definitions before changing
// ownership.
36 /// parent driver object (and UI)
37 class LinguisticaMainWindow* m_pDoc;
38 /// defaults for user-specified parameters (see CLPreferences class)
39 QMap<QString, QString> m_ParamDefaults;
41 /// master corpus line list. (raw lines from the input file,
42 /// except hyphenated lines are joined if the user so requests.)
43 QStringList m_Corpus;
44 /// master corpus word list (see class CCorpusWord).
45 class CCorpusWordCollection* m_pCorpusWords;
46 /// maps strings in QStringList::split(' ', m_Corpus[i])
47 /// to corpus words in the master corpus word list
48 Q3Dict<class CCorpusWord> m_CorpusMap;
50 /// sub-stem morphology model (see class CMiniLexicon).
51 Q3PtrVector<class CMiniLexicon>* m_pMiniLexica;
52 /// mini-lexicon selected by and displayed to the user,
53 /// the only mini-lexicon that changes.
54 /// -1 indicates no mini-lexicon is active; otherwise,
55 /// 0 <= m_ActiveMini < m_pMiniLexica->size()
56 int m_ActiveMini;
57 /// mini-lexicon “words” with changes to propagate to the lexicon;
58 /// see UpdateWord(), DoWordUpdates()
59 QList<class CStem*> m_WordUpdates;
60 // stems, suffixes, prefixes, and signatures owned by mini-lexica.
// Each dictionary maps a string key to a list of pointers; the lists are
// retrieved through GetPrefixSet(), GetStemSet(), etc. below.
// NOTE(review): presumably keyed by the item's text -- confirm in the
// Insert*/Remove* definitions.
61 Q3Dict< QList<class CPrefix*> > m_AllPrefixes;
62 Q3Dict< QList<class CSignature*> > m_AllPrefixSigs;
63 Q3Dict< QList<class CStem*> > m_AllStems;
64 Q3Dict< QList<class CSuffix*> > m_AllSuffixes;
65 Q3Dict< QList<class CSignature*> > m_AllSuffixSigs;
66 Q3Dict< QList<class CStem*> > m_AllWords;
68 // compound discovery happens globally (not inside the mini-lexica),
69 // using information from all mini-lexica.
70 class CCompoundCollection* m_pCompounds; ///< discovered compounds
71 class CLinkerCollection* m_pLinkers; ///< compound interfixes
72 /// compounds with changes to propagate to other lexicon members;
73 /// see UpdateCompound(), DoWordUpdates()
74 QStringList m_CompoundUpdates;
76 // scrubbing filters.
77 /// map grapheme sequences to phoneme sequences
78 QMap<QString, QString>* m_pInFilter;
79 /// map phoneme sequences to grapheme sequences
80 QMap<QString, QString>* m_pOutFilter;
82 // description length.
83 /// number of distinct phonemes encountered
84 int m_NumberOfCharacterTypes;
// NOTE(review): the member name suggests a count of tokens, while the
// comment below says distinct words -- confirm which one is stored.
85 /// number of distinct words encountered
86 int m_tokenCount;
87 /// log of description length changes and their reasons.
88 class CDLHistory* m_pDLHistory;
89 /// “all-purpose” (i.e., unused) description length accounting object
90 class CDescriptionLength* m_DescriptionLength;
92 // string edit distance-based analyses.
93 /// working copy of word list for string edit distance-based analysis
94 class CWordCollection* m_pSEDWords;
95 public:
96 // syllabification.
// These four are public data members (no accessors are declared for them).
// NOTE(review): presumably the training/test corpora and "has been read"
// flags for the DCN syllabification code -- confirm at the points of use.
97 class corpussyl* DCNsylTrainCorpus;
98 class corpussyl* DCNsylTestCorpus;
99 bool isDCNtrainRead;
100 bool isDCNtestRead;
101 protected:
102 // hidden Markov model.
103 /// “all-purpose” (i.e., unused) HMM
104 class StateEmitHMM* m_HMM;
/// finite-state automaton pointer, read and replaced through
/// GetFSA() / SetFSA() below
105 class FSA* m_pFSA;
107 public:
108 // construction/destruction.
109 private:
110 // disable default-construction.
// (declared private so that code outside the class and its friends cannot
// default-construct a CLexicon)
111 CLexicon();
112 public:
/// construct a lexicon for the given main window.
/// NOTE(review): the argument is presumably stored in m_pDoc -- confirm in
/// the definition.  The constructor is not marked explicit.
113 CLexicon(LinguisticaMainWindow*);
114 ~CLexicon();
115 private:
116 // disable copies and copy-assignment.
// (declared private; a CLexicon cannot be copied or assigned from outside
// the class and its friends)
117 CLexicon(const CLexicon& x);
118 CLexicon& operator=(const CLexicon& x);
119 public:
121 // clear.
122 void ClearAll();
124 // input from file.
// NOTE(review): the meaning of the int results of the Read* functions is
// not documented in this header; see their definitions.
126 /// requires: filename is a valid filename (in particular, nonempty)
127 int ReadCorpus(const QString filename, int NumberOfWords = 5000);
128 int ReadProjectFile(QString filename);
// NOTE(review): the default NumberOfWords here is -1 rather than 5000;
// presumably -1 means "no limit" -- confirm in the definition.
129 int ReadDX1File (const QString filename, int NumberOfWords = -1);
130 /// requires: filename is a valid filename (in particular, nonempty)
131 int RereadCorpus(const QString filename, int NumberOfWords = 5000);
133 // input from stream.
134 /// write number of occurrences of each word in lines to out
135 int Tokenize(QStringList& lines, QMap<QString, int>& out);
137 // output to file.
138 void MakeBrokenCorpus(QString filename);
139 void OutputStats(QString filename);
141 // compound discovery and stem-level morphology discovery.
// UpdateCompound()/UpdateWord() go with the m_CompoundUpdates and
// m_WordUpdates queues declared above; DoWordUpdates() is the function
// both of those members' comments refer to for propagating the changes.
142 void UpdateCompound(QString);
143 void UpdateWord(CStem*);
144 void DoWordUpdates();
145 void FindPrefixes(bool AutoLayer = false);
146 void FindSuffixes(bool AutoLayer = false);
148 // compound discovery. (Lexicon_Compounds.cpp)
// The list arguments default to null pointers and the linker to an empty
// (null) QString.
149 void FromStemsFindCompounds(QList<CStem*>* compounds = 0,
150 QList<CStem*>* components = 0,
151 QString linker = QString());
152 void CalculateCoefficientsOfAffixness();
153 void FromAffixnessUpdateSigsAndCompounds();
154 void FromStemsFindFlatCompounds(QList<CStem*>* compounds = 0,
155 QList<CStem*>* components = 0,
156 QString linker = QString(),
157 int maxNumberOfRoots = 5);
159 /// driver and UI.
160 LinguisticaMainWindow* GetDocument() { return m_pDoc; }
161 linguistica::ui::status_user_agent& status_display();
162 int GetIntParameter(QString key, int default_value);
163 void AddToScreen(QString text);
164 void ClearScreen();
165 class QTextStream* GetLogFileStream();
166 bool LogFileOn();
167 QString GetStringParameter(QString key);
168 /// defaults for user settings.
169 QMap<QString, QString>* GetDefaultParams() { return &m_ParamDefaults; }
171 // basic accessors.
// The inline accessors in this class return the stored pointer or the
// address of the member itself, never a copy.  None of them is declared
// const, so they cannot be called through a const CLexicon.
172 QStringList* GetCorpus() { return &m_Corpus; }
173 CCorpusWordCollection* GetWords() { return m_pCorpusWords; }
174 Q3Dict<CCorpusWord>* GetCorpusMap() { return &m_CorpusMap; }
175 CCorpusWord* GetCorpusWord(const CStringSurrogate& word_text);
177 // convenience accessors.
178 CCorpusWord* FindAWord(CStem*, CSuffix*);
180 // mini-lexica.
181 int GetActiveMiniIndex() { return m_ActiveMini; }
// Both functions below dereference m_pMiniLexica without a null check.
// Per the Qt3Support documentation, Q3PtrVector::count() is the number of
// items stored, while size() is the number of vector positions.
182 int GetMiniCount() { return m_pMiniLexica->count(); }
183 int GetMiniSize() { return m_pMiniLexica->size(); }
184 /// points to the mini-lexicon with index n
185 /// result is a null pointer if n == -1, for convenience
186 /// in expressions such as GetMiniLexicon(GetActiveMiniIndex())
187 /// Requires: n == -1 or n is a valid mini-lexicon index
188 CMiniLexicon* GetMiniLexicon(int n);
189 void ClearMiniLexicon(int n);
190 void DeleteMiniLexicon(int n);
// NOTE(review): presumably returns the index of the new mini-lexicon --
// confirm in the definition.
191 int NewMiniLexicon();
192 void SetActiveMiniIndex(int);
194 // mini-lexicon contents.
// Per-key lookups; the result types match the element types of the
// m_All* dictionaries declared above.
// NOTE(review): behavior for a key that is not present is not visible
// here (Q3Dict lookups normally give a null pointer) -- confirm.
195 QList<CPrefix*>* GetPrefixSet(const CStringSurrogate& text);
196 QList<CSignature*>* GetPrefixSigSet(const CStringSurrogate& text);
197 QList<CStem*>* GetStemSet(const CStringSurrogate& text);
198 QList<CSuffix*>* GetSuffixSet(const CStringSurrogate& text);
199 QList<CSignature*>* GetSuffixSigSet(const CStringSurrogate& text);
200 QList<CStem*>* GetWordSet(const CStringSurrogate& text);
// Whole-dictionary accessors: each returns the address of the
// corresponding m_All* member.
202 Q3Dict<QList<CPrefix*> >* GetAllPrefixes() { return &m_AllPrefixes; }
203 Q3Dict<QList<CSignature*> >* GetAllPrefixSigs()
204 { return &m_AllPrefixSigs; }
205 Q3Dict<QList<CStem*> >* GetAllStems() { return &m_AllStems; }
206 Q3Dict<QList<CSuffix*> >* GetAllSuffixes() { return &m_AllSuffixes; }
207 Q3Dict<QList<CSignature*> >* GetAllSuffixSigs()
208 { return &m_AllSuffixSigs; }
209 Q3Dict<QList<CStem*> >* GetAllWords() { return &m_AllWords; }
// NOTE(review): the meaning of the bool results of Insert*/Remove* is not
// documented in this header; see their definitions.
211 bool InsertPrefix(CPrefix*);
212 bool InsertPrefixSig(CSignature*);
213 bool InsertStem(CStem*);
214 bool InsertSuffix(CSuffix*);
215 bool InsertSuffixSig(CSignature*);
216 bool InsertWord(CStem*);
218 bool RemovePrefix(CPrefix*);
219 bool RemovePrefixSig(CSignature*);
220 bool RemoveStem(CStem*);
221 bool RemoveSuffix(CSuffix*);
222 bool RemoveSuffixSig(CSignature*);
223 bool RemoveWord(CStem*);
225 // scrubbing rules.
// These return the stored filter pointers as they are; see SetFilters().
226 QMap<QString, QString>* GetInFilter() { return m_pInFilter; }
227 QMap<QString, QString>* GetOutFilter() { return m_pOutFilter; }
229 // discovered compounds.
230 CCompoundCollection* GetCompounds() { return m_pCompounds; }
231 CLinkerCollection* GetLinkers() { return m_pLinkers; }
233 // string edit distance-based analysis.
234 CWordCollection* GetSEDWords() { return m_pSEDWords; }
236 // description length. (Lexicon.cpp, DescriptionLength.cpp)
237 int GetNumberOfCharacterTypes() { return m_NumberOfCharacterTypes; }
238 int GetTokenCount() { return m_tokenCount; }
239 CDLHistory* GetDLHistory() { return m_pDLHistory; }
240 CDescriptionLength* GetDescriptionLength() { return m_DescriptionLength; }
241 int GetCorpusCount();
242 double CalculateTotalPhonologicalInformationContentOfStems();
243 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
244 double GetPhonologicalInformationContentOfSuffixes();
245 FSA* GetFSA() { return m_pFSA; }
// SetFSA() only overwrites the stored pointer; it does nothing with the
// previously stored FSA.
246 void SetFSA(FSA* pFSA) { m_pFSA = pFSA; }
248 // output to GUI.
// Each function takes the Q3ListView widget to work with.
// Note that CorpusWordListDisplay() has no default for its filter
// argument, unlike CompoundListDisplay() and LinkerListDisplay().
249 void CompoundListDisplay(class Q3ListView* widget,
250 QMap<QString, QString>* filter = 0, QChar separator = QChar());
251 void CompoundComponentListDisplay(class Q3ListView* widget);
252 void CorpusWordListDisplay(class Q3ListView* widget,
253 QMap<QString, QString>* filter,
254 bool analyzed_words_only = true);
255 void LinkerListDisplay(class Q3ListView* widget,
256 QMap<QString, QString>* filter = 0);
257 void PrefixListDisplay(class Q3ListView* widget);
258 void SignatureListDisplay(class Q3ListView* widget,
259 enum eDocumentType affix_loc);
260 void StemListDisplay(class Q3ListView* widget);
261 void SuffixListDisplay(class Q3ListView* widget);
262 void WordListDisplay(class Q3ListView* widget, bool analyzed_only); // used only when there is no MiniLexicon
264 // scrubbing filters.
265 void SetFilters(QStringList* phonemes);
267 // “all-purpose” hidden Markov model.
// NOTE(review): presumably these work with the m_HMM member -- confirm in
// the definitions.
268 StateEmitHMM* GetHMM();
269 StateEmitHMM* CreateNewHMM();
// The main window is a friend, so it can reach the protected and private
// members of the lexicon directly.
271 friend class LinguisticaMainWindow;
274 #endif // LEXICON_H