1 // driver and model for linguistica analyses
2 // Copyright © 2009 The University of Chicago
13 #include "generaldefinitions.h"
15 class CStringSurrogate
;
16 namespace linguistica
{ namespace ui
{
17 struct status_user_agent
;
20 // compatibility typedefs.
22 typedef QMap
<QString
, class CStem
> StringToStem
;
23 typedef QMap
<QString
, int> StringToInt
;
24 typedef QMap
<QString
, QString
> StringToString
;
25 typedef QList
<class CPrefix
*> PrefixSet
;
26 typedef QList
<class CSignature
*> SignatureSet
;
27 typedef QList
<class CSuffix
*> SuffixSet
;
28 typedef QMap
<QString
, class CStem
*> StringToPtrCStem
;
29 typedef QList
<class CStem
*> StemSet
;
31 /// The lexicon contains all the morphological collections.
32 /// These include words, stems, affixes, signatures, and other structures
33 /// used by different Linguistica functions.
36 /// parent driver object (and UI)
37 class LinguisticaMainWindow
* m_pDoc
;
38 /// defaults for user-specified parameters (see CLPreferences class)
39 QMap
<QString
, QString
> m_ParamDefaults
;
41 /// master corpus line list. (raw lines from the input file,
42 /// except hyphenated lines are joined if the user so requests.)
44 /// master corpus word list (see class CCorpusWord).
45 class CCorpusWordCollection
* m_pCorpusWords
;
46 /// maps strings in QStringList::split(' ', m_Corpus[i])
47 /// to corpus words in the master corpus word list
48 Q3Dict
<class CCorpusWord
> m_CorpusMap
;
50 /// sub-stem morphology model (see class CMiniLexicon).
51 Q3PtrVector
<class CMiniLexicon
>* m_pMiniLexica
;
52 /// mini-lexicon selected by and displayed to the user,
53 /// the only mini-lexicon that changes.
54 /// -1 indicates no mini-lexicon is active; otherwise,
55 /// 0 <= m_ActiveMini < m_pMiniLexica->size()
57 /// mini-lexicon “words” with changes to propagate to the lexicon
58 /// see UpdateWord(), DoWordUpdates()
59 QList
<class CStem
*> m_WordUpdates
;
60 // stems, suffixes, prefixes, and signatures owned by mini-lexica.
61 Q3Dict
< QList
<class CPrefix
*> > m_AllPrefixes
;
62 Q3Dict
< QList
<class CSignature
*> > m_AllPrefixSigs
;
63 Q3Dict
< QList
<class CStem
*> > m_AllStems
;
64 Q3Dict
< QList
<class CSuffix
*> > m_AllSuffixes
;
65 Q3Dict
< QList
<class CSignature
*> > m_AllSuffixSigs
;
66 Q3Dict
< QList
<class CStem
*> > m_AllWords
;
68 // compound discovery happens globally (not inside the mini-lexica),
69 // using information from all mini-lexica.
70 class CCompoundCollection
* m_pCompounds
; ///< discovered compounds
71 class CLinkerCollection
* m_pLinkers
; ///< compound interfixes
72 /// compounds with changes to propagate to other lexicon members
73 /// see UpdateCompound(), DoWordUpdates()
74 QStringList m_CompoundUpdates
;
77 /// map grapheme sequences to phoneme sequences
78 QMap
<QString
, QString
>* m_pInFilter
;
79 /// map phone sequences to grapheme sequences
80 QMap
<QString
, QString
>* m_pOutFilter
;
// Description length.

/// Count of the distinct phonemes encountered.
int m_NumberOfCharacterTypes;
85 /// number of distinct words encountered
/// Record of description length changes together with their reasons.
class CDLHistory* m_pDLHistory;

/// “All-purpose” (i.e., unused) object for description length
/// accounting.
class CDescriptionLength* m_DescriptionLength;
// Analyses based on string edit distance.

/// Working copy of the word list used by the string edit
/// distance-based analysis.
class CWordCollection* m_pSEDWords;

// NOTE(review): the names suggest a training corpus and a test corpus
// of type corpussyl; their documentation is not visible here — confirm.
class corpussyl* DCNsylTrainCorpus;
class corpussyl* DCNsylTestCorpus;

// Hidden Markov model.

/// “All-purpose” (i.e., unused) HMM.
class StateEmitHMM* m_HMM;
108 // construction/destruction.
110 // disable default-construction.
113 CLexicon(LinguisticaMainWindow
*);
116 // disable copies and copy-assignment.
117 CLexicon(const CLexicon
& x
);
118 CLexicon
& operator=(const CLexicon
& x
);
126 /// requires: filename is a valid filename (in particular, nonempty)
127 int ReadCorpus(const QString filename
, int NumberOfWords
= 5000);
128 int ReadProjectFile(QString filename
);
129 int ReadDX1File (const QString filename
, int NumberOfWords
= -1);
130 /// requires: filename is a valid filename (in particular, nonempty)
131 int RereadCorpus(const QString filename
, int NumberOfWords
= 5000);
133 // input from stream.
134 /// write number of occurences of each word in lines to out
135 int Tokenize(QStringList
& lines
, QMap
<QString
, int>& out
);
138 void MakeBrokenCorpus(QString filename
);
139 void OutputStats(QString filename
);
141 // compound discovery and stem-level morphology discovery.
142 void UpdateCompound(QString
);
143 void UpdateWord(CStem
*);
144 void DoWordUpdates();
145 void FindPrefixes(bool AutoLayer
= false);
146 void FindSuffixes(bool AutoLayer
= false);
148 // compound discovery. (Lexicon_Compounds.cpp)
149 void FromStemsFindCompounds(QList
<CStem
*>* compounds
= 0,
150 QList
<CStem
*>* components
= 0,
151 QString linker
= QString());
152 void CalculateCoefficientsOfAffixness();
153 void FromAffixnessUpdateSigsAndCompounds();
154 void FromStemsFindFlatCompounds(QList
<CStem
*>* compounds
= 0,
155 QList
<CStem
*>* components
= 0,
156 QString linker
= QString(),
157 int maxNumberOfRoots
= 5);
160 LinguisticaMainWindow
* GetDocument() { return m_pDoc
; }
161 linguistica::ui::status_user_agent
& status_display();
162 int GetIntParameter(QString key
, int default_value
);
163 void AddToScreen(QString text
);
165 class QTextStream
* GetLogFileStream();
167 QString
GetStringParameter(QString key
);
168 /// defaults for user settings.
169 QMap
<QString
, QString
>* GetDefaultParams() { return &m_ParamDefaults
; }
172 QStringList
* GetCorpus() { return &m_Corpus
; }
173 CCorpusWordCollection
* GetWords() { return m_pCorpusWords
; }
174 Q3Dict
<CCorpusWord
>* GetCorpusMap() { return &m_CorpusMap
; }
175 CCorpusWord
* GetCorpusWord(const CStringSurrogate
& word_text
);
177 // convenience accessors.
178 CCorpusWord
* FindAWord(CStem
*, CSuffix
*);
181 int GetActiveMiniIndex() { return m_ActiveMini
; }
182 int GetMiniCount() { return m_pMiniLexica
->count(); }
183 int GetMiniSize() { return m_pMiniLexica
->size(); }
184 /// points to the mini-lexicon with index n
185 /// result is a null pointer if n == -1, for convenience
186 /// in expressions such as GetMiniLexicon(GetActiveMiniIndex())
187 /// Requires: n == -1 or n is a valid mini-lexicon index
188 CMiniLexicon
* GetMiniLexicon(int n
);
189 void ClearMiniLexicon(int n
);
190 void DeleteMiniLexicon(int n
);
191 int NewMiniLexicon();
192 void SetActiveMiniIndex(int);
194 // mini-lexicon contents.
195 QList
<CPrefix
*>* GetPrefixSet(const CStringSurrogate
& text
);
196 QList
<CSignature
*>* GetPrefixSigSet(const CStringSurrogate
& text
);
197 QList
<CStem
*>* GetStemSet(const CStringSurrogate
& text
);
198 QList
<CSuffix
*>* GetSuffixSet(const CStringSurrogate
& text
);
199 QList
<CSignature
*>* GetSuffixSigSet(const CStringSurrogate
& text
);
200 QList
<CStem
*>* GetWordSet(const CStringSurrogate
& text
);
202 Q3Dict
<QList
<CPrefix
*> >* GetAllPrefixes() { return &m_AllPrefixes
; }
203 Q3Dict
<QList
<CSignature
*> >* GetAllPrefixSigs()
204 { return &m_AllPrefixSigs
; }
205 Q3Dict
<QList
<CStem
*> >* GetAllStems() { return &m_AllStems
; }
206 Q3Dict
<QList
<CSuffix
*> >* GetAllSuffixes() { return &m_AllSuffixes
; }
207 Q3Dict
<QList
<CSignature
*> >* GetAllSuffixSigs()
208 { return &m_AllSuffixSigs
; }
209 Q3Dict
<QList
<CStem
*> >* GetAllWords() { return &m_AllWords
; }
211 bool InsertPrefix(CPrefix
*);
212 bool InsertPrefixSig(CSignature
*);
213 bool InsertStem(CStem
*);
214 bool InsertSuffix(CSuffix
*);
215 bool InsertSuffixSig(CSignature
*);
216 bool InsertWord(CStem
*);
218 bool RemovePrefix(CPrefix
*);
219 bool RemovePrefixSig(CSignature
*);
220 bool RemoveStem(CStem
*);
221 bool RemoveSuffix(CSuffix
*);
222 bool RemoveSuffixSig(CSignature
*);
223 bool RemoveWord(CStem
*);
226 QMap
<QString
, QString
>* GetInFilter() { return m_pInFilter
; }
227 QMap
<QString
, QString
>* GetOutFilter() { return m_pOutFilter
; }
229 // discovered compounds.
230 CCompoundCollection
* GetCompounds() { return m_pCompounds
; }
231 CLinkerCollection
* GetLinkers() { return m_pLinkers
; }
233 // string edit distance-based analysis.
234 CWordCollection
* GetSEDWords() { return m_pSEDWords
; }
236 // description length. (Lexicon.cpp, DescriptionLength.cpp)
237 int GetNumberOfCharacterTypes() { return m_NumberOfCharacterTypes
; }
238 int GetTokenCount() { return m_tokenCount
; }
239 CDLHistory
* GetDLHistory() { return m_pDLHistory
; }
240 CDescriptionLength
* GetDescriptionLength() { return m_DescriptionLength
; }
241 int GetCorpusCount();
242 double CalculateTotalPhonologicalInformationContentOfStems();
243 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
244 double GetPhonologicalInformationContentOfSuffixes();
245 FSA
* GetFSA() { return m_pFSA
; }
246 void SetFSA(FSA
* pFSA
) { m_pFSA
= pFSA
; }
249 void CompoundListDisplay(class Q3ListView
* widget
,
250 QMap
<QString
, QString
>* filter
= 0, QChar separator
= QChar());
251 void CompoundComponentListDisplay(class Q3ListView
* widget
);
252 void CorpusWordListDisplay(class Q3ListView
* widget
,
253 QMap
<QString
, QString
>* filter
,
254 bool analyzed_words_only
= true);
255 void LinkerListDisplay(class Q3ListView
* widget
,
256 QMap
<QString
, QString
>* filter
= 0);
257 void PrefixListDisplay(class Q3ListView
* widget
);
258 void SignatureListDisplay(class Q3ListView
* widget
,
259 enum eDocumentType affix_loc
);
260 void StemListDisplay(class Q3ListView
* widget
);
261 void SuffixListDisplay(class Q3ListView
* widget
);
262 void WordListDisplay(class Q3ListView
* widget
, bool analyzed_only
); // used only when there is no MiniLexicon
264 // scrubbing filters.
265 void SetFilters(QStringList
* phonemes
);
267 // “all-purpose” hidden Markov model.
268 StateEmitHMM
* GetHMM();
269 StateEmitHMM
* CreateNewHMM();
271 friend class LinguisticaMainWindow
;