CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / StemCollection.cpp
blob6613505b0eb73b61151f231cdde51f7979b6a7fe
1 // Implementation of CStemCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "StemCollection.h"
5 #include <Q3TextStream>
6 #include <QIODevice>
7 #include <QFile>
8 #include "ui/Status.h"
9 #include "MiniLexicon.h"
10 #include "Lexicon.h"
11 #include "Stem.h"
12 #include "SignatureCollection.h"
13 #include "StringSurrogate.h"
14 #include "Typedefs.h"
15 #include "implicit_cast.h"
16 using linguistica::implicit_cast;
18 namespace {
19 typedef TCollection<CStem> base;
22 // construction/destruction.
24 CStemCollection::CStemCollection()
25 : base(),
26 m_AffixLocation(WORD_FINAL),
27 m_TotalUseCount(0.0) { }
29 CStemCollection::CStemCollection(CMiniLexicon* Lex)
30 : base(Lex, "Stems"),
31 m_TotalUseCount(0.0) { }
33 CStemCollection::~CStemCollection()
35 if (m_pLexicon != 0)
36 for (int i = 0; i < GetCount(); ++i) {
37 CStem* pStem = GetAt(i);
38 m_pLexicon->RemoveStem(pStem);
42 // Qt3-style collection view.
43 // see GUIclasses.cpp for CStemCollection::ListDisplay() definition.
45 // input/output to file.
47 void CStemCollection::OutputStems(QString FileName,
48 QMap<QString, QString>* filter)
50 QFile file( FileName );
52 if( file.open( QIODevice::WriteOnly ) )
54 Q3TextStream outf( &file );
55 outf.setEncoding( Q3TextStream::Unicode );
57 outf << "# Stem Count" << endl;
58 outf << "# ------------" << endl;
59 outf << " " << GetCount() << endl << endl;
61 outf << "# Index | Stem | Confidence | Corpus Count | Affix Count | Affixes " << endl;
62 outf << "# ------------------------------------------------------------------------------------------ " << endl;
64 Sort(CORPUSCOUNT);
65 for (int i = 0; i < GetCount(); ++i)
66 GetAtSort(i)->OutputStem(outf, i, filter);
68 file.close();
72 void CStemCollection::ReadStemFile(QString FileName,
73 enum eAffixLocation affixLoc)
75 CLexicon& lex = *m_pLexicon;
76 linguistica::ui::status_user_agent& status = lex.status_display();
78 CStem* pStem;
79 CSignature* pSig;
80 QFile file(FileName);
81 QString buffer,
82 stem,
83 confidence;
85 CNode* pTerminal;
87 int index,
88 size,
89 corpusCount,
90 affixCount,
91 lineCount = 0;
93 if( file.exists() && file.open( QIODevice::ReadOnly ) )
95 Q3TextStream inf(&file);
96 inf.setEncoding ( Q3TextStream::Locale );
98 buffer = inf.readLine();
99 Q_ASSERT( buffer[0] == '#' );
101 buffer = inf.readLine();
102 Q_ASSERT( buffer[0] == '#' );
104 inf >> size;
106 buffer = inf.readLine(); // end of size line
107 Q_ASSERT( buffer.length() == 0 );
109 buffer = inf.readLine(); // blank line
110 Q_ASSERT( buffer.length() == 0 );
112 buffer = inf.readLine();
113 Q_ASSERT( buffer[0] == '#' );
115 buffer = inf.readLine();
116 Q_ASSERT( buffer[0] == '#' );
118 status.progress.clear();
119 status.progress.set_denominator(size);
120 while (!inf.atEnd() && lineCount < size) {
121 status.progress = lineCount++;
122 inf >> index
123 >> stem
124 >> confidence
125 >> corpusCount
126 >> affixCount;
128 stem = Filter( m_pLexicon->GetInFilter(), stem );
129 confidence = confidence.replace( "_", " " );
131 pStem = new CStem( CSS( stem ), m_pMiniLex );
133 for (int j = 1; j <= affixCount; j++)
135 inf >> buffer;
137 // Filter all sequences that should be
138 // analyzed as one character
139 buffer = Filter( m_pLexicon->GetInFilter(), buffer );
141 switch( affixLoc )
143 case WORD_INITIAL:
144 case STEM_INITIAL:
145 pStem->GetPrefixList()->Append( buffer );
146 break;
148 case WORD_FINAL:
149 case STEM_FINAL:
150 default:
151 pStem->GetSuffixList()->Append( buffer );
152 break;
156 if( affixCount > 0 )
158 switch( affixLoc )
160 case WORD_INITIAL:
161 case STEM_INITIAL:
162 pSig = *m_pMiniLex->GetSignatures() ^= pStem->GetPrefixList();
163 pStem->SetPrefixSignature( pSig );
164 break;
166 case WORD_FINAL:
167 case STEM_FINAL:
168 default:
169 pSig = *m_pMiniLex->GetSignatures() ^= pStem->GetSuffixList();
170 pStem->SetSuffixSignature( pSig );
171 break;
175 if( confidence != "NONE" ) pStem->SetConfidence( confidence );
177 pStem->SetWordCount( affixCount );
178 pStem->SetCorpusCount( corpusCount );
180 pTerminal = Insert( CSS( stem ) );
181 pTerminal->SetPointer( pStem );
183 status.progress.clear();
184 file.close();
188 // insert.
190 CStem* CStemCollection::operator<<(const CStem* stem)
191 { return operator<<(implicit_cast<const CParse*>(stem)); }
193 CStem* CStemCollection::operator<<(const CParse* stem)
195 CStringSurrogate text = stem->GetKey();
196 CStem* new_stem = AddToCollection(text);
198 QChar* alphabetized_text = LxAlphabetizeString(
199 stem->GetKeyPointer(),
200 stem->GetKeyLength());
201 new_stem->SetAlphabetizedForm(QString(alphabetized_text,
202 stem->GetKeyLength()));
203 return new_stem;
206 CStem* CStemCollection::operator<<(CStringSurrogate stem_text)
208 CStem* new_stem = AddToCollection(stem_text);
210 QChar* alphabetized_text = LxAlphabetizeString(
211 stem_text.Display().unicode(),
212 stem_text.GetLength());
213 new_stem->SetAlphabetizedForm(QString(alphabetized_text,
214 stem_text.GetLength()));
215 return new_stem;
218 CStem* CStemCollection::operator<<(QString stem_text)
220 CStringSurrogate text(stem_text);
221 CStem* new_stem = AddToCollection(text);
222 QChar* alphabetized_text = LxAlphabetizeString(
223 stem_text.unicode(),
224 stem_text.length());
225 new_stem->SetAlphabetizedForm(QString(alphabetized_text,
226 stem_text.length()));
227 return new_stem;
230 void CStemCollection::AddPointer(CStem* stem)
232 base::AddPointer(stem);
233 if (m_pLexicon != 0)
234 m_pLexicon->InsertStem(stem);
237 CStem* CStemCollection::AddToCollection(const CParse& stem_text)
239 CStem* new_stem = base::AddToCollection(stem_text);
240 if (m_pLexicon != 0)
241 m_pLexicon->InsertStem(new_stem);
242 return new_stem;
245 CStem* CStemCollection::AddToCollection(const CStringSurrogate& stem_text)
247 CStem* new_stem = base::AddToCollection(stem_text);
248 if (m_pLexicon != 0)
249 m_pLexicon->InsertStem(new_stem);
250 return new_stem;
253 // clear.
255 void CStemCollection::Empty()
257 if (m_pLexicon != 0)
258 for (int i = 0; i < GetCount(); ++i) {
259 CStem* stem = GetAt(i);
260 m_pLexicon->RemoveStem(stem);
263 base::Empty();
266 void CStemCollection::RemoveAll()
268 if (m_pLexicon != 0)
269 for (int i = 0; i < GetCount(); ++i) {
270 CStem* stem = GetAt(i);
271 m_pLexicon->RemoveStem(stem);
274 base::RemoveAll();
277 // remove.
279 bool CStemCollection::Remove(CStem* stem)
281 if (m_pLexicon != 0)
282 m_pLexicon->RemoveStem(stem);
283 return base::Remove(stem);
286 bool CStemCollection::RemoveMember(CStem* stem)
288 if (m_pLexicon != 0)
289 m_pLexicon->RemoveStem(stem);
290 return base::RemoveMember(stem);
293 bool CStemCollection::RemoveMember(const CStringSurrogate& stem_text)
295 CStem* stem = static_cast<CStem*>(Find1(stem_text)->Get_T_Pointer());
296 return RemoveMember(stem);
299 bool CStemCollection::RemoveMember(const CStringSurrogate& stem_text,
300 bool b)
302 CStem* stem = static_cast<CStem*>(Find1(stem_text)->Get_T_Pointer());
303 if (m_pLexicon != 0)
304 m_pLexicon->RemoveStem(stem);
305 return base::RemoveMember(stem_text, b);
308 void CStemCollection::DeleteMarkedMembers()
310 // XXX. add hook to base for on-deletion actions so this can
311 // be cleaned up.
313 if (base::m_DeletionArray == 0)
314 return;
316 if (m_pLexicon != 0) {
317 int count = GetCount();
318 for (int i = 0; i < count; ++i)
319 if (base::m_DeletionArray[i] == 1)
320 m_pLexicon->RemoveStem(m_PointerArray[i]);
322 base::DeleteMarkedMembers();
325 // accessors.
327 double CStemCollection::GetTotalUseCount()
329 m_TotalUseCount = 0;
331 if (!is_initial(m_AffixLocation))
332 for (int i = 0; i < GetCount(); ++i) {
333 CStem* stem = GetAt(i);
334 m_TotalUseCount += stem->GetNumberOfSuffixes();
336 else
337 for (int i = 0; i < GetCount(); ++i) {
338 CStem* stem = GetAt(i);
339 m_TotalUseCount += stem->GetNumberOfPrefixes();
341 return m_TotalUseCount;
344 // description length.
345 // see DescriptionLength.cpp for
346 // CStemCollection::CalculateTotalPhonologicalInformationContent(),
347 // CStemCollection::CalculateSumOfPointersToMyStems() definitions.