HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Lexicon.h
blob38efab31fe118e694a327dbab815534450a4d9e0
1 // driver and model for linguistica analyses
2 // Copyright © 2009 The University of Chicago
3 #ifndef LEXICON_H
4 #define LEXICON_H
6 class CLexicon;
8 #include <QStringList>
9 #include <Q3PtrVector>
10 #include <QList>
11 #include <Q3Dict>
12 #include <QMap>
13 #include "generaldefinitions.h"
14 class QString;
15 class CStringSurrogate;
16 namespace linguistica { namespace ui {
17 struct status_user_agent;
18 } }
20 // compatibility typedefs: short names for QMap and QList instantiations.
22 typedef QMap<QString, class CStem> StringToStem; // values are CStem objects held by value
23 typedef QMap<QString, int> StringToInt;
24 typedef QMap<QString, QString> StringToString;
25 typedef QList<class CPrefix*> PrefixSet;
26 typedef QList<class CSignature*> SignatureSet;
27 typedef QList<class CSuffix*> SuffixSet;
28 typedef QMap<QString, class CStem*> StringToPtrCStem; // values are CStem pointers (contrast StringToStem)
29 typedef QList<class CStem*> StemSet;
31 /// The lexicon contains all the morphological collections.
32 /// These include words, stems, affixes, signatures, and other structures
33 /// used by different Linguistica functions.
34 class CLexicon {
35 protected:
36 /// parent driver object (and UI)
37 class LinguisticaMainWindow* m_pDoc;
38 /// defaults for user-specified parameters (see CLPreferences class)
39 QMap<QString, QString> m_ParamDefaults;
41 /// master corpus line list. (raw lines from the input file,
42 /// except hyphenated lines are joined if the user so requests.)
43 QStringList m_Corpus;
44 /// master corpus word list (see class CCorpusWord).
45 class CCorpusWordCollection* m_pCorpusWords;
46 /// maps strings in QStringList::split(' ', m_Corpus[i])
47 /// to corpus words in the master corpus word list
48 Q3Dict<class CCorpusWord> m_CorpusMap;
50 /// sub-stem morphology model (see class CMiniLexicon).
51 Q3PtrVector<class CMiniLexicon>* m_pMiniLexica;
52 /// mini-lexicon selected by and displayed to the user,
53 /// the only mini-lexicon that changes.
54 /// -1 indicates no mini-lexicon is active; otherwise,
55 /// 0 <= m_ActiveMini < m_pMiniLexica->size()
56 int m_ActiveMini;
57 /// mini-lexicon “words” with changes to propagate to the lexicon
58 /// see UpdateWord(), DoWordUpdates()
59 QList<class CStem*> m_WordUpdates;
60 // stems, suffixes, prefixes, and signatures owned by mini-lexica.
61 Q3Dict< QList<class CPrefix*> > m_AllPrefixes;
62 Q3Dict< QList<class CSignature*> > m_AllPrefixSigs;
63 Q3Dict< QList<class CStem*> > m_AllStems;
64 Q3Dict< QList<class CSuffix*> > m_AllSuffixes;
65 Q3Dict< QList<class CSignature*> > m_AllSuffixSigs;
66 Q3Dict< QList<class CStem*> > m_AllWords;
68 // compound discovery happens globally (not inside the mini-lexica),
69 // using information from all mini-lexica.
70 class CCompoundCollection* m_pCompounds; ///< discovered compounds
71 class CLinkerCollection* m_pLinkers; ///< compound interfixes
72 /// compounds with changes to propagate to other lexicon members
73 /// see UpdateCompound(), DoWordUpdates()
74 QStringList m_CompoundUpdates;
76 // scrubbing filters.
77 /// map grapheme sequences to phoneme sequences
78 QMap<QString, QString>* m_pInFilter;
79 /// map phoneme sequences to grapheme sequences
80 QMap<QString, QString>* m_pOutFilter;
82 // description length.
83 /// number of distinct phonemes encountered
84 int m_NumberOfCharacterTypes;
85 /// number of distinct words encountered (NOTE(review): the name suggests word tokens, not distinct types; confirm)
86 int m_tokenCount;
87 /// log of description length changes and their reasons.
88 class CDLHistory* m_pDLHistory;
89 /// “all-purpose” (i.e., unused) description length accounting object
90 class CDescriptionLength* m_DescriptionLength;
92 // string edit distance-based analyses.
93 /// working copy of word list for string edit distance-based analysis
94 class CWordCollection* m_pSEDWords;
95 public:
96 // syllabification.
97 class corpussyl* DCNsylTrainCorpus;
98 class corpussyl* DCNsylTestCorpus;
99 bool isDCNtrainRead;
100 bool isDCNtestRead;
101 protected:
102 // hidden Markov model.
103 /// “all-purpose” (i.e., unused) HMM
104 class StateEmitHMM* m_HMM;
105 public:
106 // construction/destruction.
107 private:
108 // disable default-construction (declared private here).
109 CLexicon();
110 public:
111 CLexicon(LinguisticaMainWindow*);
112 ~CLexicon();
113 private:
114 // disable copies and copy-assignment (declared private here).
115 CLexicon(const CLexicon& x);
116 CLexicon& operator=(const CLexicon& x);
117 public:
119 // clear.
120 void ClearAll();
122 // input from file.
124 /// requires: filename is a valid filename (in particular, nonempty)
125 int ReadCorpus(const QString filename, int NumberOfWords = 5000);
126 int ReadProjectFile(QString filename);
127 int ReadDX1File (const QString filename, int NumberOfWords = -1);
128 /// requires: filename is a valid filename (in particular, nonempty)
129 int RereadCorpus(const QString filename, int NumberOfWords = 5000);
131 // input from stream.
132 /// write number of occurrences of each word in lines to out
133 int Tokenize(QStringList& lines, QMap<QString, int>& out);
135 // output to file.
136 void MakeBrokenCorpus(QString filename);
137 void OutputStats(QString filename);
139 // compound discovery and stem-level morphology discovery.
140 void UpdateCompound(QString);
141 void UpdateWord(CStem*);
142 void DoWordUpdates();
143 void FindPrefixes(bool AutoLayer = false);
144 void FindSuffixes(bool AutoLayer = false);
146 // compound discovery. (Lexicon_Compounds.cpp)
147 void FromStemsFindCompounds(QList<CStem*>* compounds = 0,
148 QList<CStem*>* components = 0,
149 QString linker = QString());
150 void CalculateCoefficientsOfAffixness();
151 void FromAffixnessUpdateSigsAndCompounds();
152 void FromStemsFindFlatCompounds(QList<CStem*>* compounds = 0,
153 QList<CStem*>* components = 0,
154 QString linker = QString(),
155 int maxNumberOfRoots = 5);
157 // driver and UI.
158 LinguisticaMainWindow* GetDocument() { return m_pDoc; }
159 linguistica::ui::status_user_agent& status_display();
160 int GetIntParameter(QString key, int default_value);
161 void AddToScreen(QString text);
162 void ClearScreen();
163 class QTextStream* GetLogFileStream();
164 bool LogFileOn();
165 QString GetStringParameter(QString key);
166 /// defaults for user settings (pointer to the m_ParamDefaults member).
167 QMap<QString, QString>* GetDefaultParams() { return &m_ParamDefaults; }
169 // basic accessors.
170 QStringList* GetCorpus() { return &m_Corpus; }
171 CCorpusWordCollection* GetWords() { return m_pCorpusWords; }
172 Q3Dict<CCorpusWord>* GetCorpusMap() { return &m_CorpusMap; }
173 CCorpusWord* GetCorpusWord(const CStringSurrogate& word_text);
175 // convenience accessors.
176 CCorpusWord* FindAWord(CStem*, CSuffix*);
178 // mini-lexica.
179 int GetActiveMiniIndex() { return m_ActiveMini; }
180 int GetMiniCount() { return m_pMiniLexica->count(); } ///< Q3PtrVector::count(): presumably the number of occupied slots; confirm against Qt docs
181 int GetMiniSize() { return m_pMiniLexica->size(); } ///< Q3PtrVector::size(): presumably the number of slots, occupied or not; confirm against Qt docs
182 /// points to the mini-lexicon with index n
183 /// result is a null pointer if n == -1, for convenience
184 /// in expressions such as GetMiniLexicon(GetActiveMiniIndex())
185 /// requires: n == -1 or n is a valid mini-lexicon index
186 CMiniLexicon* GetMiniLexicon(int n);
187 void ClearMiniLexicon(int n);
188 void DeleteMiniLexicon(int n);
189 int NewMiniLexicon();
190 void SetActiveMiniIndex(int);
192 // mini-lexicon contents.
193 QList<CPrefix*>* GetPrefixSet(const CStringSurrogate& text);
194 QList<CSignature*>* GetPrefixSigSet(const CStringSurrogate& text);
195 QList<CStem*>* GetStemSet(const CStringSurrogate& text);
196 QList<CSuffix*>* GetSuffixSet(const CStringSurrogate& text);
197 QList<CSignature*>* GetSuffixSigSet(const CStringSurrogate& text);
198 QList<CStem*>* GetWordSet(const CStringSurrogate& text);
200 Q3Dict<QList<CPrefix*> >* GetAllPrefixes() { return &m_AllPrefixes; }
201 Q3Dict<QList<CSignature*> >* GetAllPrefixSigs()
202 { return &m_AllPrefixSigs; }
203 Q3Dict<QList<CStem*> >* GetAllStems() { return &m_AllStems; }
204 Q3Dict<QList<CSuffix*> >* GetAllSuffixes() { return &m_AllSuffixes; }
205 Q3Dict<QList<CSignature*> >* GetAllSuffixSigs()
206 { return &m_AllSuffixSigs; }
207 Q3Dict<QList<CStem*> >* GetAllWords() { return &m_AllWords; }
209 bool InsertPrefix(CPrefix*);
210 bool InsertPrefixSig(CSignature*);
211 bool InsertStem(CStem*);
212 bool InsertSuffix(CSuffix*);
213 bool InsertSuffixSig(CSignature*);
214 bool InsertWord(CStem*);
216 bool RemovePrefix(CPrefix*);
217 bool RemovePrefixSig(CSignature*);
218 bool RemoveStem(CStem*);
219 bool RemoveSuffix(CSuffix*);
220 bool RemoveSuffixSig(CSignature*);
221 bool RemoveWord(CStem*);
223 // scrubbing rules.
224 QMap<QString, QString>* GetInFilter() { return m_pInFilter; }
225 QMap<QString, QString>* GetOutFilter() { return m_pOutFilter; }
227 // discovered compounds.
228 CCompoundCollection* GetCompounds() { return m_pCompounds; }
229 CLinkerCollection* GetLinkers() { return m_pLinkers; }
231 // string edit distance-based analysis.
232 CWordCollection* GetSEDWords() { return m_pSEDWords; }
234 // description length. (Lexicon.cpp, DescriptionLength.cpp)
235 int GetNumberOfCharacterTypes() { return m_NumberOfCharacterTypes; }
236 int GetTokenCount() { return m_tokenCount; }
237 CDLHistory* GetDLHistory() { return m_pDLHistory; }
238 CDescriptionLength* GetDescriptionLength() { return m_DescriptionLength; }
239 int GetCorpusCount();
240 double CalculateTotalPhonologicalInformationContentOfStems();
241 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
242 double GetPhonologicalInformationContentOfSuffixes();
244 // output to GUI.
245 void CompoundListDisplay(class Q3ListView* widget,
246 QMap<QString, QString>* filter = 0, QChar separator = QChar());
247 void CompoundComponentListDisplay(class Q3ListView* widget);
248 void CorpusWordListDisplay(class Q3ListView* widget,
249 QMap<QString, QString>* filter,
250 bool analyzed_words_only = true);
251 void LinkerListDisplay(class Q3ListView* widget,
252 QMap<QString, QString>* filter = 0);
253 void PrefixListDisplay(class Q3ListView* widget);
254 void SignatureListDisplay(class Q3ListView* widget,
255 enum eDocumentType affix_loc);
256 void StemListDisplay(class Q3ListView* widget);
257 void SuffixListDisplay(class Q3ListView* widget);
258 void WordListDisplay(class Q3ListView* widget, bool analyzed_only); // used only when there is no MiniLexicon
260 // scrubbing filters.
261 void SetFilters(QStringList* phonemes);
263 // “all-purpose” hidden Markov model.
264 StateEmitHMM* GetHMM();
265 StateEmitHMM* CreateNewHMM();
267 friend class LinguisticaMainWindow;
270 #endif // LEXICON_H