// Sub-stem morphology model and driver
// Copyright © 2009 The University of Chicago
9 #include "generaldefinitions.h"
10 #include "AffixLocation.h"
// Forward declaration: in the visible part of this header the type is
// only named in member-function signatures.
class CStringSurrogate;
/// All analyses at the sub-stem level (non-compound) are done in the
/// lexicon’s mini-lexica.
/// Parent lexicon of this mini-lexicon; GetLexicon() returns this pointer.
class CLexicon* m_pLexicon;
27 /// The index of this mini in its parent's mini-lexicon list.
30 /// A flag designating the affix type (prefix|suffix). This is also
31 /// noted by the signatures and their collection.
32 enum eAffixLocation m_AffixLocation
;
/// Working collections local to this mini-lexicon.
///
/// The mini-lexicon keeps its own copy of the words, and every analysis
/// inside the mini-lexicon is based on that copy. Suffixes, prefixes, stems
/// and signatures are all discovered locally. Updating the global
/// collection is the mini-lexicon's responsibility.
class CWordCollection* m_pWords;
class CSuffixCollection* m_pSuffixes;
class CPrefixCollection* m_pPrefixes;
class CStemCollection* m_pStems;
class CSignatureCollection* m_pSignatures;
/// Part-of-speech collection; GetPOS() returns this pointer.
class LxPoSCollection* m_pPOS;

/// Description-length object; GetDescriptionLength() returns this pointer.
class CDescriptionLength* m_DescriptionLength;
/// Figures about the words that remain unanalyzed.
/// NOTE(review): judging by the names these hold a corpus count and a
/// phonological-information total; where they are computed and updated is
/// not visible here - confirm against the implementation.
double m_CorpusCountOfUnanalyzedWords;
double m_PhonologicalInformationOfUnanalyzedWords;
48 class GUIWordCollection
* m_GUIWords
;
50 QMap
<QString
, class CDatum
> m_DataMap
; // for deMarcken JG
54 // construction/destruction.
56 CMiniLexicon(CLexicon
* parent
= 0, int index
= -1,
57 enum eAffixLocation affixLocation
= STEM_FINAL
);
62 CMiniLexicon(const CMiniLexicon
& x
);
63 CMiniLexicon
& operator=(const CMiniLexicon
& x
);
70 // location in parent lexicon.
72 int GetIndex() { return m_Index
; }
73 CLexicon
* GetLexicon() { return m_pLexicon
; }
/// Look up a corpus word given a stem and a suffix.
/// NOTE(review): the matching rule and the result when no word is found
/// are defined in the implementation, which is not visible here.
class CCorpusWord* FindAWord(
    class CStem* stem,
    class CSuffix* suffix);
// total number of (not necessarily unique) words in corpus
// NOTE(review): the declaration this comment documents is not visible in
// this excerpt; it does not describe GetNumberOfCharacterTypes().

/// @return a count of character types.
/// NOTE(review): which collection is counted is not visible here.
int GetNumberOfCharacterTypes();
80 CMiniLexicon
* GetMiniLexicon(int index
);
84 enum eAffixLocation
GetAffixLocation() { return m_AffixLocation
; }
86 /// If affix type has changed from initial to final or vice-versa,
87 /// this entails forgetting any previously discovered affixes.
88 /// If this further entails forgetting some discovered stems,
89 /// do nothing; result is false.
90 /// Result is true on success.
91 bool SetAffixLocation(enum eAffixLocation prefix_or_suffix
);
92 int GetIntParameter(QString key
, int default_value
);
94 // analyzed and unanalyzed words.
96 CWordCollection
* GetWords() { return m_pWords
; }
97 void AddToWordCollection(CStemCollection
* words
);
98 void AddToWordCollection(CWordCollection
* words
,
99 enum which_words which
= WW_UnanalyzedOnly
);
/// Counters over analyzed and unanalyzed words.
/// NOTE(review): exactly what is counted is defined in the implementation.
int GetNumberOfUnanalyzedWords();

/// NOTE(review): returns int although m_CorpusCountOfUnanalyzedWords is a
/// double; whether the two are related is not visible here.
int GetCorpusCountOfUnanalyzedWords();

/// @param unanalyzed_count  non-const reference, presumably an output
///                          receiving the unanalyzed count - confirm.
int GetNumberOfAnalyzedWords(int& unanalyzed_count);
103 CStem
* GetWordFromStemSuffix(CStem
*, CSuffix
*);
104 CStem
* GetWordFromStemPrefix(CStem
*, class CPrefix
*);
106 // stem/signature/suffix model of morphology.
108 CPrefixCollection
* GetPrefixes() { return m_pPrefixes
; }
109 CSignatureCollection
* GetSignatures() { return m_pSignatures
; }
110 CStemCollection
* GetStems() { return m_pStems
; }
111 CSuffixCollection
* GetSuffixes() { return m_pSuffixes
; }
/// NOTE(review): presumably the size of the local stem collection - confirm.
int GetNumberOfStems();

/// NOTE(review): presumably the size of the local suffix collection - confirm.
int GetNumberOfSuffixes();
115 // linguistic analysis (stem/signature/suffix discovery).
117 CPrefixCollection
* FindPrefixes();
118 /// If affix type is not suffix, do nothing.
119 /// Otherwise, apply various heuristics to fill this mini-lexicon’s
120 /// suffix collection. Result points to the suffix collection.
121 CSuffixCollection
* FindSuffixes();
122 void FindSingletonSignatures();
/// Use the known stems and known affixes to analyze words that are still
/// unanalyzed.
/// XXX. When a word has two possible stem + suffix analyses, their
/// relative merits are not assessed: the analysis with the longer suffix
/// simply wins. Prefixes fare worse: the analysis whose stem comes
/// earliest in alphabetical order is used.
void ExtendKnownStemsToKnownAffixes();

/// Rebuild the signature list from what is known about the stems and the
/// affixes associated with them.
void FindAllSignatures();

/// NOTE(review): behaviour is defined in the implementation, not visible here.
void FromStemsFindAffixes();
134 /// Find new signatures using unanalyzed words and known affixes.
137 /// For each word, if it ends with a known suffix (resp starts with
138 /// a known prefix), consider the corresponding stem text.
140 /// If adding that new stem (and the corresponding signature from
141 /// analyzing other words with it) decreases description length,
142 /// record the new stem, signature, and analyzed words.
144 void RebuildAffixesStemsAndSignaturesFromWordSplits(
145 CStringSurrogate
& remark
);
146 void TakeSplitWords_ProduceStemsAndSigs(CStringSurrogate
& Remark
,
147 CWordCollection
* words
= 0, CStemCollection
* stems
= 0,
148 CPrefixCollection
* prefixes
= 0,
149 CSuffixCollection
* suffixes
= 0);
150 /// successor-frequency algorithm (bootstrapping step)
151 void TakeSignaturesFindStems(CSignatureCollection
* Sigs
= 0);
152 void CheckSignatures();
154 // description length.
156 CDescriptionLength
* GetDescriptionLength() { return m_DescriptionLength
; }
157 double CalculateDescriptionLength();
158 double CalculateSumOfPointersToMyUnanalyzedWords(enum eMDL_STYLE style
);
159 double CalculateUnanalyzedWordsTotalPhonologicalInformationContent();
160 class CDLHistory
* GetDLHistory();
161 double CalculateCompressedLengthOfUnanalyzedWords();
163 // part-of-speech discovery (does not affect description length).
165 LxPoSCollection
* GetPOS() { return m_pPOS
; }
166 /// Build or update m_pPOS to reflect top discovered signatures
167 void FindMajorSignatures();
171 //FSA* GetFSA() { return m_pLexicon->m_pFSA; }
173 class QTextStream
* GetLogFile();
174 QMap
<QString
, QString
>* GetOutFilter();
178 /// set text for “command line” pane
179 void AddToScreen(QString info
);
180 /// handle for UI operations
181 class LinguisticaMainWindow
* GetDocument();
182 GUIWordCollection
* GetGUIWords();
184 // CDatum* LookUp(CStringSurrogate&);
188 void FindAllomorphy();
190 void ShiftFinalLetterToStem(class CStem
* pStem
, QString
& FinalLetter
);
191 void HowManyStemsWithThisSuffixEndInThisLetter(CStringSurrogate
& Suffix
,
192 CStringSurrogate
& Letter
, int& TotalStemsWithSuffix
,
193 int& HowManyEndWithThisLetter
);
194 int StemsWithBothSuffixes(
195 CStringSurrogate
& suf1
, CStringSurrogate
& suf2
);
196 int StemsWithBothSuffixes(QString suf1
, CStringSurrogate
& suf2
);
197 void MoveWordsStemSuffixBoundaryToRight(class CSignature
* pSigToChange
,
198 CStringSurrogate
& Letters
, class CParse
* pSuffixCandidates
);
199 void MoveWordsStemSuffixBoundaryToRight(class CSignature
* pSigToChange
,
200 QString Letters
, class CParse
* pSuffixCandidates
);
201 class CParse
CreateADeletingSignature(class CSignature
* pSig
,
202 CStringSurrogate Deletee
, class CParse
& ReplacingSuffixes
,
203 int* bStatus
, class CParse
& WhatSigWillBecome
,
204 QMap
<QString
, QString
>& Remapper
);
205 class CParse
CreateADeletingSignature(class CSignature
* pSig
,
206 CStringSurrogate Deletee
, class CParse
& ReplacingSuffixes
,
207 int* bStatus
, class CParse
& WhatSigWillBecome
,
208 class CParse
* pSuffixCandidates
);
211 void LogFileSmallTitle(QString
);
212 void LogFileSmallTitle(QString
, QString
);
213 void LogFileSmallTitle(QString
, QString
, QString
);
214 void LogFileLargeTitle(QString
);
215 void LogFileStartTable();
216 void LogFileEndTable();
217 void LogFileStartRow();
218 void LogFileStartRow(QString
);
219 void LogFileEndRow();
220 void LogFile (QString
);
221 void LogFile (double);
222 void LogFile1SimpleString(QString
); // no row start or end
223 void LogFileSimpleString(QString
); // no row start or end
224 void LogFileSimpleInteger(int); // no row start or end
225 void LogFileSimpleDouble(double); // no row start or end
226 void LogFile (QString
, QString
);
227 void LogFile (QString
, QString
, QString
);
228 void LogFile (QString
, QString
, QString
, QString
);
229 void LogFile( QString
, QString
, QString
, QString
, QString
);
230 void LogFile( QString
, QString
, QString
, QString
, QString
, QString
);
231 void LogFile (QString
, int);
232 void LogFile(int, QString
);
233 void LogFile(int, double, QString
);
234 void LogFile (QString
, double);
235 void LogFile(QString
, int, double);
236 void LogFile(QString
, int, int, double, double, double);
237 void LogFileHeader(QString
);
238 void LogFileHeader ( QString
, QString
);
239 void LogFileHeader (QString
, QString
, QString
);
240 void LogFileHeader( QString
, QString
, QString
, QString
, QString
, QString
);
244 #endif // MINILEXICON_H