CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / MiniLexicon_SuccessorFreqs.cpp
blobcfe02bf2d8d7b20346433e04abf5458c3f3d3787
1 // Analyzing words using discovered signatures
2 // Copyright © 2009 The University of Chicago
3 #include "MiniLexicon.h"
5 #include <Q3ValueList>
6 #include <QString>
7 #include "ui/Status.h"
8 #include "Lexicon.h"
9 #include "DLHistory.h"
10 #include "Signature.h"
11 #include "Stem.h"
12 #include "SignatureCollection.h"
13 #include "WordCollection.h"
14 #include "StringSurrogate.h"
15 #include "Parse.h"
16 #include "HTML.h"
18 // We accept any stem if it can match a good signature
20 void CMiniLexicon::TakeSignaturesFindStems(CSignatureCollection* Sigs)
22 CLexicon& lex = *m_pLexicon;
23 linguistica::ui::status_user_agent& status = lex.status_display();
25 CStem* pWord;
26 const int SizeThreshold = lex.GetIntParameter(
27 "TakeSignaturesFindStems\\SizeThreshold", 2);
28 const int StemCountThreshold = lex.GetIntParameter(
29 "TakeSignaturesFindStems\\StemCountThreshold", 2);
30 const int MinimumStemLength = lex.GetIntParameter(
31 "Main\\MinimumStemLength", 10);
33 CStringSurrogate ssAffix;
34 CStringSurrogate ssStem;
35 CSignature* pSig;
36 QString msg, stem, word;
37 QMap<QString, int> ParsableWords;
38 QStringList TempStems;
39 QStringList NewStemsFound;
40 int AffixLength;
41 bool FailureFlag;
42 if (Sigs == 0)
43 Sigs = m_pSignatures;
45 status.major_operation =
46 QString("Mini-Lexicon %1: Take signatures to find stems")
47 .arg(m_Index + 1);
48 status.progress.clear();
50 LogFileLargeTitle("Phase: Take Signatures, Find Stems");
52 const bool analyzingSuffixes = !is_initial(GetAffixLocation());
54 for (int wordno = 0; wordno < (int)m_pWords->GetCount(); wordno++) {
55 pWord = m_pWords->GetAt(wordno);
56 if (pWord->MayBeParsed())
57 ParsableWords.insert(pWord->Display(), 1); // 1 is a dummy value.
60 // We loop through the good signatures and
61 // then run through the words to see
62 // if they could belong to the good signatures.
63 // We have to be careful, because a word might
64 // have belonged to a different signature and
65 // still have the marks of those suffixes
66 // in its factorization.
67 LogFileStartTable();
68 LogFileHeader("--", "Signature");
70 // Go through signatures:
71 Sigs->Sort(SIGS);
73 status.progress.set_denominator(Sigs->GetCount());
74 for (int signo = 0; signo < (int)Sigs->GetCount(); signo++) {
75 status.progress = signo;
76 pSig = Sigs->GetAtSort(signo);
77 if (!pSig)
78 continue;
79 if (pSig->Size() < SizeThreshold)
80 continue;
81 if (pSig->GetNumberOfStems() < StemCountThreshold)
82 continue;
84 LogFileSmallTitle("Empirical: " + pSig->Display('.'));
85 status.details = pSig->Display();
87 // Choose the first suffix in pSig that isn't NULL.
88 int suffixno = 1;
89 if (pSig->GetPiece(1).IsNULL())
90 suffixno = 2;
92 TempStems.clear();
93 NewStemsFound.clear();
95 ssAffix = pSig->GetPiece(suffixno);
96 AffixLength = ssAffix.GetLength();
97 QMapIterator<QString, int> iter(ParsableWords);
99 while (iter.hasNext()) {
100 word = iter.next().key();
101 if (analyzingSuffixes) {
102 if (word.endsWith(ssAffix.Display())) {
103 if (word.length() == AffixLength)
104 continue;
105 stem = word.left(word.length() - AffixLength);
106 Q_ASSERT(stem.length() != 0);
107 if ((int) stem.length() < MinimumStemLength)
108 continue;
109 // put into Temp Stems
110 // all those stems from words which might
111 // be analyzed as ending in ssAffix.
112 TempStems.append(stem);
114 } else { // analyzing prefixes
115 if (word.startsWith(ssAffix.Display())) {
116 if (word.length() == AffixLength)
117 continue;
118 stem = word.right(word.length() - AffixLength);
119 Q_ASSERT(stem.length() != 0);
120 if ((int) stem.length() < MinimumStemLength)
121 continue;
122 TempStems.append(stem);
125 } // end of loop on words
127 LogFileStartTable();
128 int colno = 1;
129 const int numberofcolumns = 8;
130 for (int stemno = 0; stemno < TempStems.count(); stemno++) {
131 FailureFlag = false;
132 stem = TempStems.at(stemno);
133 // LogFileSimpleString(stem);
134 for (int affixno =1; affixno <= pSig->Size(); affixno++) {
135 ssAffix = pSig->GetPiece(affixno);
136 analyzingSuffixes ? word = stem + ssAffix.Display():
137 word = ssAffix.Display() + stem;
138 if (! ParsableWords.contains(word)) {
139 FailureFlag = true;
140 break;
143 if (FailureFlag == false) {
144 NewStemsFound.append(stem);
145 if (colno == numberofcolumns) {
146 LogFileEndRow();
147 colno = 1;
149 if (colno == 1)
150 LogFileStartRow();
151 LogFileSimpleString(stem);
152 colno++;
154 } // end of stemno loop
155 LogFileEndTable();
157 // Now start building up pSig again.
158 LogFileSmallTitle("Reanalyzed words");
159 LogFileStartTable();
160 for (int stemno = 0; stemno < NewStemsFound.count(); stemno++) {
161 stem = NewStemsFound.at(stemno);
162 int colno = 1;
163 const int numberofcolumns = 8;
164 for (int affixno = 1; affixno <= pSig->Size(); affixno++) {
165 ssAffix = pSig->GetPiece(affixno);
166 if (ssAffix.IsNULL())
167 continue;
169 if (analyzingSuffixes)
170 word = stem + ssAffix.Display();
171 else
172 word = ssAffix.Display() + stem;
174 pWord = *m_pWords ^= CStringSurrogate(word);
175 if (!pWord)
176 break;
178 if (pWord && analyzingSuffixes) {
179 pWord->ClearRootSuffixSplit();
180 pWord->CutRightBeforeHere(stem.length());
181 pWord->SetStemLoc(1);
182 pWord->SetSuffixLoc(2);
183 m_pLexicon->UpdateWord(pWord);
184 } else {
185 pWord->ClearPrefixStemSplit();
186 pWord->CutRightBeforeHere(ssAffix.GetLength());
187 pWord->SetStemLoc(2);
188 pWord->SetPrefixLoc(1);
189 m_pLexicon->UpdateWord(pWord);
192 if (pWord->GetConfidence().length() == 0) {
193 msg = "3: From sig find stem";
194 pWord->AppendToConfidence(msg);
196 if (colno == numberofcolumns) {
197 LogFileEndRow();
198 colno = 1;
200 if (colno == 1)
201 LogFileStartRow();
202 LogFileSimpleString(pWord->Display());
203 colno++;
204 } // end of loop on affixno
205 if (pSig->Size() > 0)
206 LogFileEndRow();
207 }// cycle through this set of Stems
208 LogFileEndTable();
209 }// end of loop on signo
210 status.progress.clear();
211 status.details.clear();
213 LogFileEndTable();
215 QString mini_name("Mini-Lexicon %1");
216 msg = "From sigs find stems";
217 CStringSurrogate ssRemark = msg;
219 // Writes to status.details instead of status.major_operation.
220 TakeSplitWords_ProduceStemsAndSigs(ssRemark);
222 status.major_operation.clear();
223 mini_name = mini_name.arg(GetIndex() + 1);
224 QString remark = "From sigs: find stems";
225 GetDLHistory()->append(mini_name, remark, this);