1 // Implementation of CStemCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "StemCollection.h"
5 #include <Q3TextStream>
9 #include "MiniLexicon.h"
12 #include "SignatureCollection.h"
13 #include "StringSurrogate.h"
15 #include "implicit_cast.h"
16 using linguistica::implicit_cast
;
// Shorthand for TCollection<CStem>. The methods below reach the
// TCollection implementations through it (base::AddPointer, base::Remove, ...);
// presumably this is CStemCollection's parent class -- confirm in StemCollection.h.
19 typedef TCollection
<CStem
> base
;
22 // construction/destruction.
// Default constructor: sets m_AffixLocation to WORD_FINAL and m_TotalUseCount
// to 0.0; the constructor body itself is empty.
// NOTE(review): the embedded numbering skips line 25, so the start of the
// initializer list (presumably the base-class / pointer members) is not
// visible here -- confirm against the full file.
24 CStemCollection::CStemCollection()
26 m_AffixLocation(WORD_FINAL
),
27 m_TotalUseCount(0.0) { }
// Constructor taking a CMiniLexicon pointer; m_TotalUseCount starts at 0.0 and
// the constructor body is empty.
// NOTE(review): only the last initializer is visible (embedded numbering skips
// line 30). Presumably Lex is stored in m_pMiniLex and m_pLexicon /
// m_AffixLocation are initialized there -- confirm against the full file.
29 CStemCollection::CStemCollection(CMiniLexicon
* Lex
)
31 m_TotalUseCount(0.0) { }
// Destructor: walks every stem in the collection (indices 0..GetCount()-1) and
// calls m_pLexicon->RemoveStem() on it.
// NOTE(review): no null check on m_pLexicon is visible in these lines, whereas
// DeleteMarkedMembers() tests m_pLexicon != 0 before using it. The embedded
// numbering skips lines 34-35, so a guard may exist there -- confirm.
33 CStemCollection::~CStemCollection()
36 for (int i
= 0; i
< GetCount(); ++i
) {
37 CStem
* pStem
= GetAt(i
);
38 m_pLexicon
->RemoveStem(pStem
);
42 // Qt3-style collection view.
43 // see GUIclasses.cpp for CStemCollection::ListDisplay() definition.
45 // input/output to file.
// Writes the collection to the file named FileName.
//
// FileName: path of the file to open for writing.
// filter:   passed unchanged to CStem::OutputStem() for every stem.
//
// The output starts with a "# Stem Count" header, the value of GetCount(), and
// a column-title header; then one OutputStem() call is made per element, in
// the order given by GetAtSort(i). Everything visible here happens only when
// file.open() succeeds.
47 void CStemCollection::OutputStems(QString FileName
,
48 QMap
<QString
, QString
>* filter
)
50 QFile
file( FileName
);
52 if( file
.open( QIODevice::WriteOnly
) )
54 Q3TextStream
outf( &file
);
// The stream is written with the Unicode encoding.
// NOTE(review): ReadStemFile() reads with Q3TextStream::Locale instead --
// confirm that the two agree for files produced here.
55 outf
.setEncoding( Q3TextStream::Unicode
);
// Count section: two '#' header lines, the count, then a blank line.
57 outf
<< "# Stem Count" << endl
;
58 outf
<< "# ------------" << endl
;
59 outf
<< " " << GetCount() << endl
<< endl
;
// Column-title section: two '#' lines preceding the per-stem records.
61 outf
<< "# Index | Stem | Confidence | Corpus Count | Affix Count | Affixes " << endl
;
62 outf
<< "# ------------------------------------------------------------------------------------------ " << endl
;
// One record per stem; the loop index i is passed as the record's index.
65 for (int i
= 0; i
< GetCount(); ++i
)
66 GetAtSort(i
)->OutputStem(outf
, i
, filter
);
// Reads stems from a stem file and inserts them into this collection.
//
// FileName: path of the file to read (its use is in lines not visible here).
// affixLoc: affix location; presumably selects between the prefix and suffix
//           branches below -- the conditions are not visible, confirm.
//
// NOTE(review): the embedded numbering has large gaps (77-92, 103-105,
// 122-127, 140-144, 152-161, ...). The declarations of file, buffer, size,
// lineCount, stem, confidence, affixCount, corpusCount, pStem, pSig and
// pTerminal, and the code that parses each record, are not visible in this
// listing; the comments below describe only the visible statements.
72 void CStemCollection::ReadStemFile(QString FileName
,
73 enum eAffixLocation affixLoc
)
// Progress is reported through the lexicon's status display.
75 CLexicon
& lex
= *m_pLexicon
;
76 linguistica::ui::status_user_agent
& status
= lex
.status_display();
// Everything visible below runs only if the file exists and opens for reading.
93 if( file
.exists() && file
.open( QIODevice::ReadOnly
) )
95 Q3TextStream
inf(&file
);
// NOTE(review): the stream is decoded with Q3TextStream::Locale, while
// OutputStems() writes with Q3TextStream::Unicode -- confirm this is intended.
96 inf
.setEncoding ( Q3TextStream::Locale
);
// Header check: the layout mirrors what OutputStems() writes (two '#' lines,
// the count, a blank line, two more '#' lines).
// NOTE(review): the format is validated only with Q_ASSERT, which Qt compiles
// out when QT_NO_DEBUG is defined; a malformed file is then not rejected here.
98 buffer
= inf
.readLine();
99 Q_ASSERT( buffer
[0] == '#' );
101 buffer
= inf
.readLine();
102 Q_ASSERT( buffer
[0] == '#' );
// Lines 103-105 (presumably reading `size`) are not visible.
106 buffer
= inf
.readLine(); // end of size line
107 Q_ASSERT( buffer
.length() == 0 );
109 buffer
= inf
.readLine(); // blank line
110 Q_ASSERT( buffer
.length() == 0 );
112 buffer
= inf
.readLine();
113 Q_ASSERT( buffer
[0] == '#' );
115 buffer
= inf
.readLine();
116 Q_ASSERT( buffer
[0] == '#' );
// Reset the progress indicator and use the record count as its denominator.
118 status
.progress
.clear();
119 status
.progress
.set_denominator(size
);
// Read at most `size` records, stopping early at end of stream.
120 while (!inf
.atEnd() && lineCount
< size
) {
// Post-increment: progress shows the number of records already completed.
121 status
.progress
= lineCount
++;
// Run the stem text through the lexicon's input filter.
128 stem
= Filter( m_pLexicon
->GetInFilter(), stem
);
// Underscores in the confidence field are turned back into spaces.
129 confidence
= confidence
.replace( "_", " " );
// NOTE(review): pStem is a raw `new`; it is handed to the node returned by
// Insert() at the end of the iteration -- presumably that node (or the
// collection) owns it from then on; confirm.
131 pStem
= new CStem( CSS( stem
), m_pMiniLex
);
// Affixes are numbered 1..affixCount.
133 for (int j
= 1; j
<= affixCount
; j
++)
137 // Filter all sequences that should be
138 // analyzed as one character
139 buffer
= Filter( m_pLexicon
->GetInFilter(), buffer
);
// The affix is appended to the stem's prefix list or suffix list; the
// condition choosing between the two is not visible in this listing.
145 pStem
->GetPrefixList()->Append( buffer
);
151 pStem
->GetSuffixList()->Append( buffer
);
// The result of applying operator^= of the mini-lexicon's signature collection
// to the stem's affix list is stored as the stem's signature (prefix or suffix
// variant; the selecting condition is not visible).
162 pSig
= *m_pMiniLex
->GetSignatures() ^= pStem
->GetPrefixList();
163 pStem
->SetPrefixSignature( pSig
);
169 pSig
= *m_pMiniLex
->GetSignatures() ^= pStem
->GetSuffixList();
170 pStem
->SetSuffixSignature( pSig
);
// The literal "NONE" means no confidence is set on the stem.
175 if( confidence
!= "NONE" ) pStem
->SetConfidence( confidence
);
// NOTE(review): the word count is set from affixCount -- confirm this is the
// intended value.
177 pStem
->SetWordCount( affixCount
);
178 pStem
->SetCorpusCount( corpusCount
);
// Insert the stem text into the collection and attach pStem to the returned
// node.
// NOTE(review): unlike AddPointer()/AddToCollection(), no
// m_pLexicon->InsertStem() call is visible on this path -- confirm.
180 pTerminal
= Insert( CSS( stem
) );
181 pTerminal
->SetPointer( pStem
);
// Reset the progress indicator after the read loop.
183 status
.progress
.clear();
// Adds a stem given as a const CStem*: the pointer is converted to
// const CParse* with implicit_cast and forwarded to the const CParse*
// overload, whose result is returned.
190 CStem
* CStemCollection::operator<<(const CStem
* stem
)
191 { return operator<<(implicit_cast
<const CParse
*>(stem
)); }
// Adds a stem given as a const CParse*: the key of `stem` is added through
// AddToCollection(), then the alphabetized form of that key is computed with
// LxAlphabetizeString() and stored on the new stem.
// NOTE(review): the embedded numbering skips lines 203-205, so the return
// statement (presumably `return new_stem;`) and any release of
// alphabetized_text are not visible. LxAlphabetizeString() returns a raw
// QChar*; confirm who frees it.
193 CStem
* CStemCollection::operator<<(const CParse
* stem
)
195 CStringSurrogate text
= stem
->GetKey();
196 CStem
* new_stem
= AddToCollection(text
);
// The alphabetized buffer is built from the key's characters and length.
198 QChar
* alphabetized_text
= LxAlphabetizeString(
199 stem
->GetKeyPointer(),
200 stem
->GetKeyLength());
// QString(const QChar*, int) copies GetKeyLength() characters from the buffer.
201 new_stem
->SetAlphabetizedForm(QString(alphabetized_text
,
202 stem
->GetKeyLength()));
// Adds a stem given as a CStringSurrogate: the text is added through
// AddToCollection(), then its alphabetized form is computed with
// LxAlphabetizeString() and stored on the new stem.
// NOTE(review): the embedded numbering skips lines 215-217, so the return
// statement and any release of the raw QChar* alphabetized_text are not
// visible -- confirm.
206 CStem
* CStemCollection::operator<<(CStringSurrogate stem_text
)
208 CStem
* new_stem
= AddToCollection(stem_text
);
// Display() yields a temporary whose unicode() pointer is used only within
// this full expression.
210 QChar
* alphabetized_text
= LxAlphabetizeString(
211 stem_text
.Display().unicode(),
212 stem_text
.GetLength());
213 new_stem
->SetAlphabetizedForm(QString(alphabetized_text
,
214 stem_text
.GetLength()));
// Adds a stem given as a QString: the string is wrapped in a CStringSurrogate
// and added through AddToCollection(), then its alphabetized form is computed
// with LxAlphabetizeString() and stored on the new stem.
// NOTE(review): the arguments to LxAlphabetizeString() (embedded lines
// 223-224), the return statement and any release of alphabetized_text are not
// visible in this listing -- confirm against the full file.
218 CStem
* CStemCollection::operator<<(QString stem_text
)
220 CStringSurrogate
text(stem_text
);
221 CStem
* new_stem
= AddToCollection(text
);
222 QChar
* alphabetized_text
= LxAlphabetizeString(
225 new_stem
->SetAlphabetizedForm(QString(alphabetized_text
,
226 stem_text
.length()));
// Adds an existing CStem to the collection via base::AddPointer() and then
// registers it with the lexicon via m_pLexicon->InsertStem().
// NOTE(review): m_pLexicon is dereferenced without a visible null check
// (DeleteMarkedMembers() does check it) -- confirm it cannot be null here.
230 void CStemCollection::AddPointer(CStem
* stem
)
232 base::AddPointer(stem
);
234 m_pLexicon
->InsertStem(stem
);
// Adds the stem described by a CParse via base::AddToCollection() and then
// registers the resulting CStem with the lexicon via m_pLexicon->InsertStem().
// NOTE(review): the return statement is not visible (embedded numbering skips
// lines 242-244); presumably new_stem is returned -- confirm.
237 CStem
* CStemCollection::AddToCollection(const CParse
& stem_text
)
239 CStem
* new_stem
= base::AddToCollection(stem_text
);
241 m_pLexicon
->InsertStem(new_stem
);
// Adds the stem described by a CStringSurrogate via base::AddToCollection()
// and then registers the resulting CStem with the lexicon via
// m_pLexicon->InsertStem().
// NOTE(review): the return statement is not visible (embedded numbering skips
// lines 250-254); presumably new_stem is returned -- confirm.
245 CStem
* CStemCollection::AddToCollection(const CStringSurrogate
& stem_text
)
247 CStem
* new_stem
= base::AddToCollection(stem_text
);
249 m_pLexicon
->InsertStem(new_stem
);
// Calls m_pLexicon->RemoveStem() for every stem currently in the collection.
// NOTE(review): the lines after the loop (embedded 261-265) are not visible;
// presumably the base-class Empty() is called there to clear the collection
// itself -- confirm.
255 void CStemCollection::Empty()
258 for (int i
= 0; i
< GetCount(); ++i
) {
259 CStem
* stem
= GetAt(i
);
260 m_pLexicon
->RemoveStem(stem
);
// Calls m_pLexicon->RemoveStem() for every stem currently in the collection.
// NOTE(review): the lines after the loop (embedded 272-278) are not visible;
// presumably the base-class RemoveAll() is called there -- confirm.
266 void CStemCollection::RemoveAll()
269 for (int i
= 0; i
< GetCount(); ++i
) {
270 CStem
* stem
= GetAt(i
);
271 m_pLexicon
->RemoveStem(stem
);
// Unregisters `stem` from the lexicon, then removes it from the collection.
// Returns whatever base::Remove() returns.
// The lexicon is updated first, while `stem` is still known to be valid.
279 bool CStemCollection::Remove(CStem
* stem
)
282 m_pLexicon
->RemoveStem(stem
);
283 return base::Remove(stem
);
// Unregisters `stem` from the lexicon, then removes it via
// base::RemoveMember(). Returns whatever base::RemoveMember() returns.
286 bool CStemCollection::RemoveMember(CStem
* stem
)
289 m_pLexicon
->RemoveStem(stem
);
290 return base::RemoveMember(stem
);
// Removes the stem whose text is `stem_text`: looks the text up with Find1(),
// takes the CStem stored on the found node, and forwards to the
// RemoveMember(CStem*) overload.
// NOTE(review): the result of Find1() is dereferenced unconditionally. If
// Find1() can return a null pointer for text that is not in the collection,
// this crashes -- confirm Find1()'s contract or add a check.
293 bool CStemCollection::RemoveMember(const CStringSurrogate
& stem_text
)
295 CStem
* stem
= static_cast<CStem
*>(Find1(stem_text
)->Get_T_Pointer());
296 return RemoveMember(stem
);
// Two-argument variant: looks up the stem for `stem_text` with Find1(),
// unregisters it from the lexicon, then calls base::RemoveMember(stem_text, b)
// and returns its result.
// NOTE(review): the declaration of the second parameter `b` (embedded lines
// 300-301) is not visible in this listing.
// NOTE(review): as in the one-argument overload, Find1()'s result is
// dereferenced without a visible null check -- confirm it cannot be null.
299 bool CStemCollection::RemoveMember(const CStringSurrogate
& stem_text
,
302 CStem
* stem
= static_cast<CStem
*>(Find1(stem_text
)->Get_T_Pointer());
304 m_pLexicon
->RemoveStem(stem
);
305 return base::RemoveMember(stem_text
, b
);
// Unregisters from the lexicon every stem whose base::m_DeletionArray entry
// equals 1, then calls base::DeleteMarkedMembers().
// NOTE(review): the statement controlled by the m_DeletionArray == 0 test
// (embedded line 314) is not visible; presumably an early return -- confirm.
308 void CStemCollection::DeleteMarkedMembers()
310 // XXX. add hook to base for on-deletion actions so this can
313 if (base::m_DeletionArray
== 0)
// Lexicon bookkeeping is skipped when there is no lexicon.
316 if (m_pLexicon
!= 0) {
// m_DeletionArray and m_PointerArray are indexed in parallel.
317 int count
= GetCount();
318 for (int i
= 0; i
< count
; ++i
)
319 if (base::m_DeletionArray
[i
] == 1)
320 m_pLexicon
->RemoveStem(m_PointerArray
[i
]);
322 base::DeleteMarkedMembers();
// Returns m_TotalUseCount after adding each stem's affix count to it: when
// is_initial(m_AffixLocation) is false the suffix counts are summed; the
// second loop sums prefix counts (presumably the else branch -- the `else`
// on embedded line 336 is not visible).
// NOTE(review): the sums are accumulated into the member with +=. Embedded
// lines 328-330 are not visible, so it cannot be seen here whether
// m_TotalUseCount is reset (or a cached value returned) first; without that,
// repeated calls would keep growing the total -- confirm.
327 double CStemCollection::GetTotalUseCount()
331 if (!is_initial(m_AffixLocation
))
332 for (int i
= 0; i
< GetCount(); ++i
) {
333 CStem
* stem
= GetAt(i
);
334 m_TotalUseCount
+= stem
->GetNumberOfSuffixes();
337 for (int i
= 0; i
< GetCount(); ++i
) {
338 CStem
* stem
= GetAt(i
);
339 m_TotalUseCount
+= stem
->GetNumberOfPrefixes();
341 return m_TotalUseCount
;
344 // description length.
345 // see DescriptionLength.cpp for
346 // CStemCollection::CalculateTotalPhonologicalInformationContent(),
347 // CStemCollection::CalculateSumOfPointersToMyStems() definitions.