1 // driver and model for linguistica analyses
2 // Copyright © 2009 The University of Chicago
13 #include "generaldefinitions.h"
// Forward declarations for types this header refers to only through
// pointers or references.
class CStringSurrogate;

namespace linguistica {
namespace ui {
	struct status_user_agent;
} // namespace ui
} // namespace linguistica
20 // compatibility typedefs.
22 typedef QMap
<QString
, class CStem
> StringToStem
;
23 typedef QMap
<QString
, int> StringToInt
;
24 typedef QMap
<QString
, QString
> StringToString
;
25 typedef QList
<class CPrefix
*> PrefixSet
;
26 typedef QList
<class CSignature
*> SignatureSet
;
27 typedef QList
<class CSuffix
*> SuffixSet
;
28 typedef QMap
<QString
, class CStem
*> StringToPtrCStem
;
29 typedef QList
<class CStem
*> StemSet
;
31 /// The lexicon contains all the morphological collections.
32 /// These include words, stems, affixes, signatures, and other structures
33 /// used by different Linguistica functions.
36 /// parent driver object (and UI)
37 class LinguisticaMainWindow
* m_pDoc
;
38 /// defaults for user-specified parameters (see CLPreferences class)
39 QMap
<QString
, QString
> m_ParamDefaults
;
41 /// master corpus line list. (raw lines from the input file,
42 /// except hyphenated lines are joined if the user so requests.)
44 /// master corpus word list (see class CCorpusWord).
45 class CCorpusWordCollection
* m_pCorpusWords
;
46 /// maps strings in QStringList::split(' ', m_Corpus[i])
47 /// to corpus words in the master corpus word list
48 Q3Dict
<class CCorpusWord
> m_CorpusMap
;
50 /// sub-stem morphology model (see class CMiniLexicon).
51 Q3PtrVector
<class CMiniLexicon
>* m_pMiniLexica
;
52 /// mini-lexicon selected by and displayed to the user,
53 /// the only mini-lexicon that changes.
54 /// -1 indicates no mini-lexicon is active; otherwise,
55 /// 0 <= m_ActiveMini < m_pMiniLexica->size()
57 /// mini-lexicon “words” with changes to propagate to the lexicon
58 /// see UpdateWord(), DoWordUpdates()
59 QList
<class CStem
*> m_WordUpdates
;
60 // stems, suffixes, prefixes, and signatures owned by mini-lexica.
61 Q3Dict
< QList
<class CPrefix
*> > m_AllPrefixes
;
62 Q3Dict
< QList
<class CSignature
*> > m_AllPrefixSigs
;
63 Q3Dict
< QList
<class CStem
*> > m_AllStems
;
64 Q3Dict
< QList
<class CSuffix
*> > m_AllSuffixes
;
65 Q3Dict
< QList
<class CSignature
*> > m_AllSuffixSigs
;
66 Q3Dict
< QList
<class CStem
*> > m_AllWords
;
68 // compound discovery happens globally (not inside the mini-lexica),
69 // using information from all mini-lexica.
70 class CCompoundCollection
* m_pCompounds
; ///< discovered compounds
71 class CLinkerCollection
* m_pLinkers
; ///< compound interfixes
72 /// compounds with changes to propagate to other lexicon members
73 /// see UpdateCompound(), DoWordUpdates()
74 QStringList m_CompoundUpdates
;
77 /// map grapheme sequences to phoneme sequences
78 QMap
<QString
, QString
>* m_pInFilter
;
79 /// map phone sequences to grapheme sequences
80 QMap
<QString
, QString
>* m_pOutFilter
;
// description length.

/// number of distinct phonemes encountered
int m_NumberOfCharacterTypes;

/// number of distinct words encountered
/// Read through GetTokenCount().
int m_tokenCount;

/// log of description length changes and their reasons.
class CDLHistory* m_pDLHistory;

/// “all-purpose” (i.e., unused) description length accounting object
class CDescriptionLength* m_DescriptionLength;
// string edit distance-based analyses.

/// working copy of word list for string edit distance-based analysis
class CWordCollection* m_pSEDWords;

// train and test corpora of class corpussyl.
// NOTE(review): the names suggest DCN syllabification data — confirm
// against the code that fills these in.
class corpussyl* DCNsylTrainCorpus;
class corpussyl* DCNsylTestCorpus;

// hidden Markov model.

/// “all-purpose” (i.e., unused) HMM
class StateEmitHMM* m_HMM;
106 // construction/destruction.
108 // disable default-construction.
111 CLexicon(LinguisticaMainWindow
*);
114 // disable copies and copy-assignment.
115 CLexicon(const CLexicon
& x
);
116 CLexicon
& operator=(const CLexicon
& x
);
124 /// requires: filename is a valid filename (in particular, nonempty)
125 int ReadCorpus(const QString filename
, int NumberOfWords
= 5000);
126 int ReadProjectFile(QString filename
);
127 int ReadDX1File (const QString filename
, int NumberOfWords
= -1);
128 /// requires: filename is a valid filename (in particular, nonempty)
129 int RereadCorpus(const QString filename
, int NumberOfWords
= 5000);
131 // input from stream.
132 /// write number of occurences of each word in lines to out
133 int Tokenize(QStringList
& lines
, QMap
<QString
, int>& out
);
136 void MakeBrokenCorpus(QString filename
);
137 void OutputStats(QString filename
);
139 // compound discovery and stem-level morphology discovery.
140 void UpdateCompound(QString
);
141 void UpdateWord(CStem
*);
142 void DoWordUpdates();
143 void FindPrefixes(bool AutoLayer
= false);
144 void FindSuffixes(bool AutoLayer
= false);
146 // compound discovery. (Lexicon_Compounds.cpp)
147 void FromStemsFindCompounds(QList
<CStem
*>* compounds
= 0,
148 QList
<CStem
*>* components
= 0,
149 QString linker
= QString());
150 void CalculateCoefficientsOfAffixness();
151 void FromAffixnessUpdateSigsAndCompounds();
152 void FromStemsFindFlatCompounds(QList
<CStem
*>* compounds
= 0,
153 QList
<CStem
*>* components
= 0,
154 QString linker
= QString(),
155 int maxNumberOfRoots
= 5);
158 LinguisticaMainWindow
* GetDocument() { return m_pDoc
; }
159 linguistica::ui::status_user_agent
& status_display();
160 int GetIntParameter(QString key
, int default_value
);
161 void AddToScreen(QString text
);
163 class QTextStream
* GetLogFileStream();
165 QString
GetStringParameter(QString key
);
166 /// defaults for user settings.
167 QMap
<QString
, QString
>* GetDefaultParams() { return &m_ParamDefaults
; }
170 QStringList
* GetCorpus() { return &m_Corpus
; }
171 CCorpusWordCollection
* GetWords() { return m_pCorpusWords
; }
172 Q3Dict
<CCorpusWord
>* GetCorpusMap() { return &m_CorpusMap
; }
173 CCorpusWord
* GetCorpusWord(const CStringSurrogate
& word_text
);
175 // convenience accessors.
176 CCorpusWord
* FindAWord(CStem
*, CSuffix
*);
179 int GetActiveMiniIndex() { return m_ActiveMini
; }
180 int GetMiniCount() { return m_pMiniLexica
->count(); }
181 int GetMiniSize() { return m_pMiniLexica
->size(); }
182 /// points to the mini-lexicon with index n
183 /// result is a null pointer if n == -1, for convenience
184 /// in expressions such as GetMiniLexicon(GetActiveMiniIndex())
185 /// Requires: n == -1 or n is a valid mini-lexicon index
186 CMiniLexicon
* GetMiniLexicon(int n
);
187 void ClearMiniLexicon(int n
);
188 void DeleteMiniLexicon(int n
);
189 int NewMiniLexicon();
190 void SetActiveMiniIndex(int);
192 // mini-lexicon contents.
193 QList
<CPrefix
*>* GetPrefixSet(const CStringSurrogate
& text
);
194 QList
<CSignature
*>* GetPrefixSigSet(const CStringSurrogate
& text
);
195 QList
<CStem
*>* GetStemSet(const CStringSurrogate
& text
);
196 QList
<CSuffix
*>* GetSuffixSet(const CStringSurrogate
& text
);
197 QList
<CSignature
*>* GetSuffixSigSet(const CStringSurrogate
& text
);
198 QList
<CStem
*>* GetWordSet(const CStringSurrogate
& text
);
200 Q3Dict
<QList
<CPrefix
*> >* GetAllPrefixes() { return &m_AllPrefixes
; }
201 Q3Dict
<QList
<CSignature
*> >* GetAllPrefixSigs()
202 { return &m_AllPrefixSigs
; }
203 Q3Dict
<QList
<CStem
*> >* GetAllStems() { return &m_AllStems
; }
204 Q3Dict
<QList
<CSuffix
*> >* GetAllSuffixes() { return &m_AllSuffixes
; }
205 Q3Dict
<QList
<CSignature
*> >* GetAllSuffixSigs()
206 { return &m_AllSuffixSigs
; }
207 Q3Dict
<QList
<CStem
*> >* GetAllWords() { return &m_AllWords
; }
209 bool InsertPrefix(CPrefix
*);
210 bool InsertPrefixSig(CSignature
*);
211 bool InsertStem(CStem
*);
212 bool InsertSuffix(CSuffix
*);
213 bool InsertSuffixSig(CSignature
*);
214 bool InsertWord(CStem
*);
216 bool RemovePrefix(CPrefix
*);
217 bool RemovePrefixSig(CSignature
*);
218 bool RemoveStem(CStem
*);
219 bool RemoveSuffix(CSuffix
*);
220 bool RemoveSuffixSig(CSignature
*);
221 bool RemoveWord(CStem
*);
224 QMap
<QString
, QString
>* GetInFilter() { return m_pInFilter
; }
225 QMap
<QString
, QString
>* GetOutFilter() { return m_pOutFilter
; }
227 // discovered compounds.
228 CCompoundCollection
* GetCompounds() { return m_pCompounds
; }
229 CLinkerCollection
* GetLinkers() { return m_pLinkers
; }
231 // string edit distance-based analysis.
232 CWordCollection
* GetSEDWords() { return m_pSEDWords
; }
234 // description length. (Lexicon.cpp, DescriptionLength.cpp)
235 int GetNumberOfCharacterTypes() { return m_NumberOfCharacterTypes
; }
236 int GetTokenCount() { return m_tokenCount
; }
237 CDLHistory
* GetDLHistory() { return m_pDLHistory
; }
238 CDescriptionLength
* GetDescriptionLength() { return m_DescriptionLength
; }
239 int GetCorpusCount();
240 double CalculateTotalPhonologicalInformationContentOfStems();
241 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
242 double GetPhonologicalInformationContentOfSuffixes();
245 void CompoundListDisplay(class Q3ListView
* widget
,
246 QMap
<QString
, QString
>* filter
= 0, QChar separator
= QChar());
247 void CompoundComponentListDisplay(class Q3ListView
* widget
);
248 void CorpusWordListDisplay(class Q3ListView
* widget
,
249 QMap
<QString
, QString
>* filter
,
250 bool analyzed_words_only
= true);
251 void LinkerListDisplay(class Q3ListView
* widget
,
252 QMap
<QString
, QString
>* filter
= 0);
253 void PrefixListDisplay(class Q3ListView
* widget
);
254 void SignatureListDisplay(class Q3ListView
* widget
,
255 enum eDocumentType affix_loc
);
256 void StemListDisplay(class Q3ListView
* widget
);
257 void SuffixListDisplay(class Q3ListView
* widget
);
258 void WordListDisplay(class Q3ListView
* widget
, bool analyzed_only
); // used only when there is no MiniLexicon
260 // scrubbing filters.
261 void SetFilters(QStringList
* phonemes
);
263 // “all-purpose” hidden Markov model.
264 StateEmitHMM
* GetHMM();
265 StateEmitHMM
* CreateNewHMM();
267 friend class LinguisticaMainWindow
;