1 // Signature type for suffix/signature-based morphology discovery
2 // Copyright © 2009 The University of Chicago
6 class CSignatureListViewItem
;
9 // See the CMiniLexicon class in MiniLexicon.h for an overview of
10 // suffix/signature-based discovery of morphology.
12 // for CSignatureListViewItem:
13 #include <Q3ListViewItem>
15 template<class K
, class V
> class QMap
;
22 #include "AffixLocation.h"
23 template<class K
, class V
> class QMap
;
24 template<class T
> class QList
;
26 /// A list view item for signatures.
28 /// Contains all the necessary variables needed when displaying a
29 /// signature and its data in a list view.
31 class CSignatureListViewItem
: public Q3ListViewItem
{
32 CSignature
* m_signature
;
33 QMap
<QString
, QString
>* m_filter
;
35 class Q3ListView
* m_parentlist
;
38 // construction/destruction.
40 explicit CSignatureListViewItem(class Q3ListView
* parent
= 0,
41 QString sig_text
= QString(), int mini_index
= -1,
42 CSignature
* sig
= 0, QMap
<QString
, QString
>* filter
= 0);
43 CSignatureListViewItem(Q3ListViewItem
* parent
,
44 QString sig_text
= QString(), int mini_index
= -1,
45 CSignature
* sig
= 0, QMap
<QString
, QString
>* filter
= 0);
49 CSignatureListViewItem(const CSignatureListViewItem
& x
)
51 m_signature(x
.m_signature
),
54 m_parentlist(x
.m_parentlist
),
56 CSignatureListViewItem
& operator=(const CSignatureListViewItem
& x
)
58 Q3ListViewItem::operator=(x
);
59 m_signature
= x
.m_signature
;
60 m_filter
= x
.m_filter
;
62 m_parentlist
= x
.m_parentlist
;
67 // Qt3 list view item interface.
69 virtual QString
text(int column_index
) const;
70 int compare(Q3ListViewItem
* other
, int column
, bool ascending
) const;
72 // underlying signature object.
/// Accessor for the signature shown by this list view item.
/// Returns the pointer stored in m_signature as-is (no copy, no null check).
74 CSignature
* GetSignature() { return m_signature
; }
/// Replaces the signature pointer held by this item with pSig.
/// Only the pointer is overwritten; the previously stored pointer is not
/// deleted here. NOTE(review): ownership of the pointee is not visible
/// from this header -- confirm against callers.
75 void SetSignature(CSignature
* pSig
) { m_signature
= pSig
; }
/// Returns the Q3ListView pointer stored in m_parentlist.
/// NOTE(review): presumably the list view this item was created in;
/// where m_parentlist is set is not visible here -- confirm.
78 Q3ListView
* GetParent() { return m_parentlist
; }
81 /// A string implementation of signatures.
83 /// Signatures are simple statements of morphological patterns, which aid both
84 /// in quantifying the Minimum Description Length (MDL) account and in constructively
85 /// building a satisfactory morphological grammar (for MDL offers no guidance in the
86 /// task of seeking the optimal analysis).
88 class CSignature
: public CLParse
{
90 enum detachment_parameter
{
95 enum eAffixLocation m_AffixLocation
;
96 QList
<class CStem
*>* m_StemPtrList
;
97 QList
<class CStem
*>* m_WordPtrList
;
98 QList
<class CSuffix
*>* m_SuffixPtrList
;
99 QList
<class CPrefix
*>* m_PrefixPtrList
;
101 // XXX. we may want to do this with a bit-based
102 // vector someday to speed it up.
103 /// points to the indexes of the affixes
104 // class CSparseIntVector* m_Vector;
106 /// Count-related members
107 QVector
<double> m_WordCounts
;
108 QVector
<double> m_StemCounts
;
109 QVector
<double> m_AffixCounts
;
110 QVector
<double> m_StemFrequencies
;
111 QVector
<double> m_AffixFrequencies
;
112 QVector
<double> m_WordFrequencies
;
116 // float* m_Frequencies;
119 /// the number of letters saved by having this signature
120 /// = ( Number of stems - 1 ) * ( Number of suffixes - 1);
121 /// Or, if there is a more robust signature which is
122 /// contained in this one.
123 /// It really measures how good our knowledge of these stems is, qua stems.
124 mutable double m_Robustness
;
125 CSignature
* m_Mentor
;
126 QList
<CSignature
*>* m_MentorList
;
127 CSignature
* m_MyGeneralizer
;
128 /// without the <e> stuff prefixed
129 CParse m_SimplifiedForm
;
130 class CSignatureCollection
* m_SignatureCollection
;
132 /// These affixes are "close" to this signature, in the sense that
133 /// stems associated with this signature may also be marked to take
135 CParse m_SatelliteAffixes
;
137 // description length.
139 double m_DLofMyCorpus
;
140 double m_DLofMyStemPointers
;
141 double m_DLofMyAffixPointers
;
142 double m_LengthOfPointerToMe
;
144 // construction/destruction.
146 CSignature(class CMiniLexicon
* mini
);
147 CSignature(enum eAffixLocation prefix_or_suffix
,
148 class CMiniLexicon
* mini
);
149 CSignature(const CParse
& affixes
, class CMiniLexicon
* mini
);
150 /// single-affix signature
151 CSignature(const class CStringSurrogate
& affix
,
152 class CMiniLexicon
* mini
);
154 CSignature(const CParse
* affixes_ptr
, class CMiniLexicon
* mini
);
159 CSignature(const CSignature
& x
);
/// Copy assignment from a reference.
/// Forwards to the pointer-taking assignment operator declared directly
/// below (`void operator=(const CSignature*)`) by assigning `&x` to
/// `*this`, then returns `*this` so assignments can be chained.
/// No self-assignment check is made at this level.
/// NOTE(review): whether the pointer overload tolerates `&x == this`
/// is not visible from this header -- confirm in its definition.
160 CSignature
& operator=(const CSignature
& x
)
161 { *this = &x
; return *this; }
163 void operator=(const CSignature
* x
);
166 void ConsumeParse(CParse
* sigs
);
168 // convert to string.
170 /// includes deletees in affixes
171 QString
Express(bool bDisplayDeletees
= true);
172 // includes deletees in affixes
173 CSignature
& Express(CSignature
& Express
, bool bDisplayDeletees
= true);
174 QString
Display(QChar sep
, QMap
<QString
, QString
>* filter
= 0) const;
175 QString
Display(QMap
<QString
, QString
>* filter
) const;
176 QString
Display() const;
180 /// number of occurrences in corpus (corpus count).
181 /// output is a vector of integers whose length is the number of stems
182 /// times the number of suffixes.
183 int GetTotalCount() const;
184 int GetNumberOfWords() const;
185 double GetWordCount(int wordno
) const;
186 double GetWordCount(int stemno
, int affixno
) const;
187 void SetWordCount(int stemno
, int affixno
, double value
);
188 void CalculateWordCounts();
189 double GetStemCount(int stemno
) const;
190 double GetAffixCount(int affixno
) const;
191 double GetStemFrequency (int stemno
) const;
192 double GetAffixFrequency(int affixno
) const;
193 double GetWordFrequency(int stemno
, int affixno
) const;
194 int GetNumberOfAffixes() const;
195 int GetNumberOfStems() const;
196 void CalculateFrequencies(class CMiniLexicon
* mini
);
197 double GetCorpusCount() const;
198 void SetNewWordCounts (int size
= 0);
199 void SetNewStemCount (int size
= 0);
200 void SetNewAffixFrequencies(int size
= 0);
201 void SetNewStemFrequencies(int size
= 0);
203 // context (enclosing mini-lexicon).
205 /// affix location (prefix or suffix)
206 enum eAffixLocation
GetAffixLocation() const;
/// Stores prefix_or_suffix in m_AffixLocation.
/// Only the member is updated; no other state of the signature is
/// touched by this inline setter.
207 void SetAffixLocation(enum eAffixLocation prefix_or_suffix
) { m_AffixLocation
= prefix_or_suffix
; }
/// Returns the CSignatureCollection pointer stored in
/// m_SignatureCollection, unchanged.
208 CSignatureCollection
* GetSignatureCollection() { return m_SignatureCollection
; }
/// Stores `list` in m_SignatureCollection.
/// Only the pointer member is overwritten; the collection itself is not
/// modified by this setter.
209 void SetSignatureCollection(CSignatureCollection
* list
) { m_SignatureCollection
= list
; }
/// Returns the CMiniLexicon pointer stored in m_pMyMini.
/// NOTE(review): m_pMyMini is not declared in the visible part of this
/// class; presumably inherited from CLParse -- confirm.
210 class CMiniLexicon
* GetLexicon() const { return m_pMyMini
; }
/// Stores `lex` in m_pMyMini.
/// NOTE(review): m_pMyMini is not declared in the visible part of this
/// class; presumably inherited from CLParse -- confirm.
211 void SetLexicon(class CMiniLexicon
* lex
) { m_pMyMini
= lex
; }
213 /// human-readable string describing this signature’s discovery.
214 QString
GetRemark() const;
215 void SetRemark(QString origin
);
/// Returns the CSignature pointer stored in m_MyGeneralizer.
/// The matching setter is SetMyGeneralizer(), declared further down.
217 CSignature
* GetGeneralizer() const { return m_MyGeneralizer
; }
219 // contained affixes.
221 /// indices of affixes in the affix collection that holds them.
224 QList
<CSuffix
*>* GetSuffixPtrList() const;
225 QList
<CPrefix
*>* GetPrefixPtrList() const;
226 void SetNewStemPtrList ( QList
< CStem
* >& );
227 void CreateNewStemPtrList (int size
);
228 void CreateNewSuffixPtrList (int size
);
229 void CreateNewPrefixPtrList (int size
);
230 CSuffix
* GetSuffix (int n
) const;
231 CPrefix
* GetPrefix (int n
) const;
232 void AppendSuffixPtr (CSuffix
*);
233 void AppendPrefixPtr (CPrefix
*);
234 void AppendStemPtr (CStem
*) const;
235 bool StemListContains(CStem
* );
236 void RemoveFromWordList(CStem
* pWord
);
237 void RecalculateStemAndWordPointers(); //jan 2010
239 // stems appearing with this sig.
242 CStem
* GetStem(int stemno
) const;
243 QList
<CStem
*>* GetStemPtrList() const;
244 void ClearStemPtrList();
246 // words using this sig.
248 CStem
* GetWord(int stem
, int suffix
) const;
249 void AppendWordPointer(CStem
* const);
251 // description length.
253 double GetDLofMyStemPointers();
254 double GetDLofMyAffixPointers();
255 virtual double ComputeDLofModel(int char_count
= 26);
256 double FindCost(class CMiniLexicon
* mini
);
257 double ComputeDLofMyCorpus();
258 double GetLengthOfPointerToMe();
259 void SetLengthOfPointerToMe(double);
260 double GetSumOfDLofInternalPointers();
262 // part-of-speech discovery.
264 CSignature
* GetMentor();
265 void SetMentor(CSignature
* sig
);
266 QList
<CSignature
*>* GetMentorList();
268 /// deprecated: index of a known suffix in the suffix list.
269 int GetNumber(CSuffix
* suffix
);
271 /// (#affixes - 1)(total length of stems) +
272 /// (#stems - 1)(total length of affixes)
273 double GetRobustness() const;
274 void SetRobustness(double value
);
275 /// deprecated: synonym for GetRobustness
276 float GetSortingQuantity() const;
280 /// register a stem appearing with this signature.
281 void operator<<(class CStem
* stem
);
282 /// Attach to Suffix Sig.
284 /// This is the function to use to take a stem from one
285 /// signature to a new one, and deal with all of the bookkeeping
286 /// that goes with that. First it detaches the Stem from its
287 /// old signature by using DetachStem() (see below);
288 /// then it adds this stem to the New Signature;
289 /// then it shifts all of the stem's words from the old signature
290 /// to the new signature.
292 void AttachToSuffixSig(CStem
* stem
, bool bLookAtPreviousSig
= true);
293 void AttachToPrefixSig(CStem
* stem
, bool bLookAtPreviousSig
= true);
/// Thin wrapper around AttachToPrefixSig().
/// Calls AttachToPrefixSig(s) with a single argument, so that
/// function's bLookAtPreviousSig parameter takes its default of true.
294 void AttachStemFromDifferentPrefixSignature(CStem
* s
) { AttachToPrefixSig(s
); }
/// Thin wrapper around AttachToSuffixSig().
/// Calls AttachToSuffixSig(s) with a single argument, so that
/// function's bLookAtPreviousSig parameter takes its default of true.
295 void AttachStemFromDifferentSuffixSignature(CStem
* s
) { AttachToSuffixSig(s
); }
296 void AddWord(CStem
* word
);
297 /// add a satellite affix
298 /// (i.e. remember one which often occurs on stems that take
300 void AppendSatelliteAffix(CParse
& Affix
);
301 void TakeAllStems(CSignature
* other
);
305 void DetachWord(CStem
* word
, enum detachment_parameter
);
306 /// remove a stem from a signature's Stem Ptr List, and if the
307 /// Detachment Parameter is true, also remove all of the stem's
308 /// WORDS from the signature's Word Ptr List.
309 /// This is called by AttachToSuffixSig().
310 void DetachStem(CStem
* stem
, enum detachment_parameter words_too
);
311 bool RemoveStem(CStem
* stem
);
312 bool RemoveWord(CStem
* word
);
316 void BorrowedSigsDisplay(class Q3ListView
* widget
,
317 QMap
<QString
, QString
>* filter
= 0);
318 void ListDisplay(class Q3ListView
* widget
,
319 QMap
<QString
, QString
>* filter
= 0,
320 bool express_deletees
= true);
323 void OutputSignatureXfst ( class QTextStream
& outf
, int count
);
324 void OutputSignature(class QTextStream
& out
);
326 /// no empty affixes (but "NULL" as an affix is fine)
330 void FindCorpusCount();
333 void SetMyGeneralizer(CSignature
* sig
);
335 // allomorphy and cut shifting helpers.
337 /// delete Letter from the beginning of each suffix, and
338 /// add it to each stem.
339 /// add a <Letter> to suffixes
340 /// Much like shifting boundary of stem/suffix split to the right.
341 void RemoveLetter(CStringSurrogate
& Letter
,
342 class CMiniLexicon
* mini
, CSignature
* out
);
343 /// add Letter to the beginning of each suffix
344 void AddLetter(const QString
& Letter
);
345 /// whether adding a letter at the start of each suffix still yield
347 bool EachSuffixCanHaveThisLetterPrefixedToIt(const QString
& Letter
);
348 void ShiftStemSuffixCutToTheLeft(int distance
, const QString
& Piece
);
349 void ShiftStemSuffixCutToTheLeft(int distance
);
350 bool Generalizes(CSignature
* other
);
352 CParse
CreateADeletingSignature(CParse
& Deleter
,
353 class CMiniLexicon
* mini
);
355 /// This will look at words claimed by this signature,
356 /// and actively make cuts in them to match the affixes declared by the signature.
357 void CutMyWordsAsIDeclare();
359 /// entropy of final ngrams of stems
360 double ComputeFinalNgramEntropyOfStems(int n
);
364 int CheckOut(class CMiniLexicon
* mini
);
365 void IterateThroughStems(int NumberOfLettersShifted
,
366 class CMiniLexicon
* Lexicon
, CLParse
* pPiece
,
367 double& TotalDecreaseInDLDueToShorterStems
,
368 double LogTotalNumberOfAnalyzedWords
,
369 double& StemPointersToThisSig
,
370 double& SavingsBecauseStemAlreadyExisted
,
371 bool analyzingSuffixex
);
373 friend class QTextStream
& operator<<(class QTextStream
& out
,
377 #endif // SIGNATURE_H