HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / Signature.cpp
blob9fc7e9da2fea0c6e2181518f60a5a2f27311b2b3
1 // Implementation of CSignature, CSignatureListViewItem methods
2 // Copyright © 2009 The University of Chicago
3 #include "Signature.h"
4 #include <QMessageBox>
5 #include <QTextStream>
6 #include <QList>
7 #include "linguisticamainwindow.h"
8 #include <QPair>
9 #include "MiniLexicon.h"
10 #include "LPreferences.h"
11 #include "CorpusWord.h"
12 #include "Suffix.h"
13 #include "Prefix.h"
14 #include "Stem.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "SparseIntVector.h"
21 #include "CompareFunc.h"
22 #include "HTML.h"
23 #include "log2.h"
24 #include "Typedefs.h"
25 #include "implicit_cast.h"
27 bool stemlessthan(const QPair<CStem*, int> pair1, const QPair<CStem*, int> pair2 );
29 bool stemlessthan(const QPair<CStem*, int> pair1, const QPair<CStem*, int> pair2 )
31 return pair2.second < pair1.second;
35 //===================================================================================================//
37 // Signature listview item
39 //===================================================================================================//
40 CSignatureListViewItem::CSignatureListViewItem(Q3ListView *parent,
41 QString signature, int mini, CSignature* pSig,
42 QMap<QString, QString>* filter)
43 : Q3ListViewItem( parent, signature )
45 m_signature = pSig;
46 m_filter = filter;
47 m_label = signature;
48 m_parentlist = parent;
49 m_mini = mini;
53 CSignatureListViewItem::CSignatureListViewItem(Q3ListViewItem *parent,
54 QString signature, int mini, CSignature* pSig,
55 QMap<QString, QString>* filter)
56 : Q3ListViewItem( parent, signature )
58 m_signature = pSig;
59 m_filter = filter;
60 m_label = signature;
61 m_parentlist = parent->listView();
62 m_mini = mini;
65 int CSignatureListViewItem::compare(Q3ListViewItem *item, int col, bool asc) const
68 if (col== 2)
70 return MakeComparable ( m_signature->ComputeDLofModel() , ((CSignatureListViewItem*) item)->GetSignature()->ComputeDLofModel() );
72 if (col== 3)
74 return MakeComparable ( m_signature->GetCorpusCount() , ((CSignatureListViewItem*) item)->GetSignature()->GetCorpusCount() );
76 if (col== 4)
78 return MakeComparable ( m_signature->GetNumberOfStems() , ((CSignatureListViewItem*) item)->GetSignature()->GetNumberOfStems() );
80 if (col== 6)
82 return MakeComparable ( ((CSignatureListViewItem*) item)->GetSignature()->GetRobustness(), m_signature->GetRobustness() );
84 else
86 return Q3ListViewItem::compare(item, col, asc);
91 QString CSignatureListViewItem::text( int column ) const
95 CSignatureListViewItem* child = NULL;
97 int count;
98 QString dummy;
101 switch( column )
103 case 0:
104 if( m_signature && m_parentlist->sortColumn() == 0 && m_signature->GetMentor() )
106 return " : " + m_label;
108 else return m_label;
109 case 1:
110 if( m_signature && m_signature->GetNumberOfStems() > 0 )
112 if (m_signature->GetNumberOfStems() > 0 ) return m_signature->GetStem(0)->Display( QChar(0), m_filter );
114 else return "";
115 case 2:
116 if( m_signature ) return dummy.setNum( m_signature->ComputeDLofModel() );
117 else return "";
118 case 3:
119 if( m_signature ) return dummy.setNum ( m_signature->GetCorpusCount() );
120 else
122 count = 0;
123 child = (CSignatureListViewItem*) firstChild();
124 while( child )
126 if( child->GetSignature() )
128 count += child->GetSignature()->GetCorpusCount();
130 child = (CSignatureListViewItem*) child->nextSibling();
132 return dummy.setNum( count );
134 case 4:
135 if( m_signature && m_signature->GetNumberOfStems() > 0 ) return dummy.setNum( m_signature->GetNumberOfStems() );
136 else
138 count = 0;
139 child = (CSignatureListViewItem*) firstChild();
140 while( child )
142 if( child->GetSignature() &&
143 child->GetSignature()->GetNumberOfStems() > 0 )
145 count += child->GetSignature()->GetNumberOfStems();
147 child = (CSignatureListViewItem*) child->nextSibling();
149 return dummy.setNum( count );
151 case 5:
152 if( m_signature ) return m_signature->GetRemark();
153 else return "";
155 case 6:
156 if( m_signature ) return dummy.setNum( (int) m_signature->GetRobustness() );
158 else
160 count = 0;
161 child = (CSignatureListViewItem*) firstChild();
162 while( child )
164 if( child->GetSignature() &&
165 child->GetSignature()->GetNumberOfStems() > 0 )
167 count += child->GetSignature()->GetNumberOfStems();
169 child = (CSignatureListViewItem*) child->nextSibling();
171 return dummy.setNum( count );
173 case 7:
174 return "";
175 default:
176 return Q3ListViewItem::text( column );
180 //===================================================================================================//
182 // GUI stuff
184 //===================================================================================================//
185 void CSignature::BorrowedSigsDisplay(Q3ListView* List,
186 QMap<QString, QString>* filter)
188 QString source = "Unknown", dummy;
189 for (int minino = 0; minino < m_pMyMini->GetMiniSize(); ++minino) {
190 CMiniLexicon* mini = m_pMyMini->GetMiniLexicon(minino);
191 if (mini == 0)
192 continue;
194 CSignatureCollection& sigs = *mini->GetSignatures();
195 if (sigs ^= this) {
196 // found!
197 source = dummy.setNum(minino + 1);
198 break;
202 static_cast<void>(new Q3ListViewItem(
203 List, Display('.', filter), source));
206 //===================================================================================================//
208 // Constructor/destructor
210 //===================================================================================================//
212 CSignature::CSignature( CMiniLexicon* Lexicon ) : CLParse( Lexicon )
214 m_pMyMini = Lexicon;
216 m_StemPtrList = new QList<CStem*>();
217 m_WordPtrList = new QList<CStem*>();
218 m_MentorList = new QList<CSignature*>();
219 m_SuffixPtrList = new QList<CSuffix*>();
220 m_PrefixPtrList = new QList<CPrefix*>();
221 m_SortStyle = eAlphabetized;
222 // Description Length
223 m_DLofMyCorpus = 0;
224 m_DLofMyStemPointers = 0;
225 m_DLofMyAffixPointers = 0;
226 m_LengthOfPointerToMe = 0;
227 m_MyGeneralizer = NULL;
228 m_Remark = "";
229 m_Robustness = 0;
230 m_Mentor = NULL;
231 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
235 CSignature::CSignature( eAffixLocation AffixLocation, CMiniLexicon* Lexicon ) : CLParse( Lexicon )
237 m_pMyMini = Lexicon;
238 m_StemPtrList = new QList<CStem*>();
239 m_WordPtrList = new QList<CStem*>();
240 m_MentorList = new QList<CSignature*>();
241 m_SuffixPtrList = new QList<CSuffix*>();
242 m_PrefixPtrList = new QList<CPrefix*>();
243 m_SortStyle = eAlphabetized;
244 m_MyGeneralizer = NULL;
245 m_AffixLocation = AffixLocation;
247 m_Remark = "";
248 // Description Length
249 m_DLofMyCorpus = 0;
250 m_DLofMyStemPointers = 0;
251 m_DLofMyAffixPointers = 0;
252 m_LengthOfPointerToMe = 0;
253 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
257 CSignature::CSignature (const CParse& ParseSig, CMiniLexicon* Lexicon) : CLParse ( ParseSig, Lexicon )
259 m_pMyMini = Lexicon;
260 m_AffixLocation = Lexicon->GetAffixLocation();
261 m_StemPtrList = new QList<CStem*>();
262 m_WordPtrList = new QList<CStem*>();
263 m_MentorList = new QList<CSignature*>();
264 m_SuffixPtrList = new QList<CSuffix*>();
265 m_PrefixPtrList = new QList<CPrefix*>();
266 m_SortStyle = eAlphabetized;
267 m_Remark = "";
268 m_MyGeneralizer = NULL;
269 // Description Length
270 m_DLofMyCorpus = 0;
271 m_DLofMyStemPointers = 0;
272 m_DLofMyAffixPointers = 0;
273 m_LengthOfPointerToMe = 0;
274 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
278 CSignature::CSignature (const CParse* pParseSig, CMiniLexicon* Lexicon) : CLParse ( *pParseSig, Lexicon )
280 m_pMyMini = Lexicon;
282 m_StemPtrList = new QList<CStem*>();
283 m_WordPtrList = new QList<CStem*>();
284 m_MentorList = new QList<CSignature*>();
285 m_SuffixPtrList = new QList<CSuffix*>();
286 m_PrefixPtrList = new QList<CPrefix*>();
287 m_SortStyle = eAlphabetized;
288 m_MyGeneralizer = NULL;
289 m_AffixLocation = Lexicon->GetAffixLocation();
290 m_Remark = "";
291 // Description Length
292 m_DLofMyCorpus = 0;
293 m_DLofMyStemPointers = 0;
294 m_DLofMyAffixPointers = 0;
295 m_LengthOfPointerToMe = 0;
296 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
300 CSignature::CSignature(const CSignature& Sig) : CLParse (Sig, Sig.GetLexicon())
302 int affixno,
303 stemno;
304 m_AffixLocation = Sig.GetAffixLocation();
305 m_Remark = Sig.GetRemark();
306 m_pMyMini = Sig.GetLexicon();
307 m_MyGeneralizer = Sig.GetGeneralizer();
309 int NumberOfStems = Sig.GetNumberOfStems();
310 int NumberOfAffixes = Sig.Size();
311 int NumberOfWords = NumberOfStems*NumberOfAffixes;
312 QVector<double> m_WordCounts (NumberOfAffixes * NumberOfStems );
313 QVector<double> m_StemCounts ( NumberOfStems );
314 QVector<double> m_AffixCounts( NumberOfAffixes );
315 QVector<double> m_WordFrequencies (NumberOfWords);
316 QVector<double> m_StemFrequencies (NumberOfStems);
317 QVector<double> m_AffixFrequencies (NumberOfAffixes);
318 m_TotalCount = Sig.GetTotalCount();
320 m_StemPtrList = new QList<CStem*>();
321 for ( stemno = 0; stemno < NumberOfStems; stemno++)
323 AppendStemPtr( Sig.GetStem(stemno));
324 m_StemCounts[stemno] = Sig.GetStemCount(stemno);
325 m_StemFrequencies[stemno] = Sig.GetStemFrequency(stemno);
327 if (m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL) {
328 m_SuffixPtrList = new QList<CSuffix*>();
329 for ( affixno = 0; affixno < NumberOfAffixes; affixno++)
331 AppendSuffixPtr ( Sig.GetSuffix(affixno) );
332 m_AffixCounts[affixno] = Sig.GetAffixCount(affixno);
333 m_AffixFrequencies[affixno] = Sig.GetAffixFrequency(affixno);
336 if (m_AffixLocation == WORD_INITIAL || m_AffixLocation == STEM_INITIAL) {
337 m_PrefixPtrList = new QList<CPrefix*>();
338 for ( affixno = 0; affixno < NumberOfAffixes; affixno++)
340 AppendPrefixPtr ( Sig.GetPrefix(affixno) );
341 m_AffixCounts[affixno] = Sig.GetAffixCount(affixno);
342 m_AffixFrequencies[affixno] = Sig.GetAffixFrequency(affixno);
347 m_WordPtrList = new QList<CStem*>();
348 for (stemno = 0; stemno < NumberOfStems ; stemno++) {
349 for (affixno = 0; affixno < NumberOfAffixes; affixno++) {
350 SetWordCount(stemno, affixno, 0);
351 AppendWordPointer (Sig.GetWord(stemno, affixno));
355 m_Robustness = Sig.GetRobustness();
356 m_Mentor = NULL;
357 m_SortStyle = eAlphabetized;
358 m_MentorList = new QList<CSignature*>();
367 CSignature::CSignature(const CStringSurrogate& ssSig, CMiniLexicon* Lexicon) : CLParse(ssSig, Lexicon)
369 Collapse( ssSig, '.');
370 m_pMyMini = Lexicon;
372 m_StemPtrList = new QList<CStem*>();
373 m_WordPtrList = new QList<CStem*>();
374 m_MentorList = new QList<CSignature*>();
375 m_SuffixPtrList = new QList<CSuffix*>();
376 m_PrefixPtrList = new QList<CPrefix*>();
377 m_SortStyle = eAlphabetized;
378 m_MyGeneralizer = NULL;
379 // Description Length
380 m_DLofMyCorpus = 0;
381 m_DLofMyStemPointers = 0;
382 m_DLofMyAffixPointers = 0;
383 m_LengthOfPointerToMe = 0;
384 m_Remark = "";
385 if( Lexicon ) m_AffixLocation = Lexicon->GetAffixLocation();
386 m_Robustness = 0;
387 m_Mentor = NULL;
389 m_SortStyle = eAlphabetized;
393 CSignature::~CSignature()
396 if( m_StemPtrList ) delete m_StemPtrList;
397 if( m_WordPtrList ) delete m_WordPtrList;
398 if( m_MentorList ) delete m_MentorList;
399 if( m_SuffixPtrList ) delete m_SuffixPtrList;
400 if( m_PrefixPtrList ) delete m_PrefixPtrList ;
402 //===================================================================================================//
404 // Display
406 //===================================================================================================//
407 QString CSignature::Display(QChar sep, QMap<QString, QString>* filter) const
409 QString sd = sep;
410 if (sd == ".") {
411 sd = m_pMyMini->GetDocument()->GetPreferences()
412 ->GetPreference("Sig_Delimiter");
413 if (sd.size() != 1)
414 sd = ".";
416 return CParse::Display(sd.at(0), filter);
419 QString CSignature::Display(QMap<QString, QString>* filter) const
420 { return CParse::Display(filter); }
422 QString CSignature::Display() const
423 { return CParse::Display('.'); }
425 //===================================================================================================//
429 //===================================================================================================//
431 void CSignature::ConsumeParse( CParse* pParse )
433 ClearParse();
434 SetKey( pParse );
435 CopyParseStructure( *pParse );
439 void CSignature::Suicide()
441 //TODO: fill this in;
443 void CSignature::SetMyGeneralizer (CSignature* pSig)
445 m_MyGeneralizer = pSig;
447 //===================================================================================================//
449 // Operators
451 //===================================================================================================//
452 void CSignature::operator=(const CSignature* pSig)
454 m_pMyMini = pSig->GetMyMini();
455 CLParse::operator=(*pSig);
456 m_AffixLocation = pSig->GetAffixLocation();
458 int NumberOfStems = pSig->GetNumberOfStems();
459 int NumberOfAffixes = pSig->GetNumberOfAffixes();
460 int NumberOfWords = NumberOfStems*NumberOfAffixes;
461 m_StemCounts.resize(NumberOfStems);
462 m_WordCounts.resize(NumberOfWords);
463 m_AffixCounts.resize(NumberOfAffixes);
465 m_StemCounts.resize(NumberOfStems);
466 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++) {
467 m_StemPtrList->append ( pSig->GetStem(stemno) );
468 m_StemCounts[stemno]=pSig->GetStemCount(stemno);
469 for (int affixno = 0; affixno < pSig->GetNumberOfAffixes(); affixno++)
471 m_WordPtrList->append ( pSig->GetWord(stemno, affixno));
472 SetWordCount(stemno, affixno, pSig->GetWordCount(stemno, affixno));
476 if (m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL ) {
477 for (int suffixno = 0; suffixno < pSig->GetNumberOfAffixes(); suffixno++)
479 m_SuffixPtrList->append ( pSig->GetSuffix(suffixno) );
480 m_AffixCounts[suffixno] = pSig->GetAffixCount(suffixno);
482 } else {
483 for (int prefixno = 0; prefixno < GetNumberOfAffixes(); prefixno++) {
484 m_PrefixPtrList->append(pSig->GetPrefix(prefixno) );
485 m_AffixCounts[prefixno] = pSig->GetAffixCount(prefixno);
491 m_Robustness = pSig->GetRobustness();
492 m_Mentor = NULL;
493 m_Remark = pSig->GetRemark();
497 QTextStream& operator<< (QTextStream& stream, CSignature* pSig)
499 CStem* pStem;
501 stream << endl << pSig->Display();
502 stream.width(6);
503 stream << pSig -> GetNumberOfStems() << " " << pSig->GetCorpusCount();
505 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++)
507 pStem = pSig->GetStem(stemno);
508 if ( pStem->GetKey() != CStringSurrogate() )
510 stream << endl;
511 stream.width(20);
512 stream << pStem->GetKey().Display();
513 } else
515 stream << endl;
516 stream.width(20);
517 stream << "???";
521 return stream;
524 // <<-------------------------------------------------------------------------------------------------------->>
525 void CSignature::operator<< (CStem* pStem) //add to tail of list.
528 CStem* pWord;
530 if ( m_StemPtrList->indexOf ( pStem ) < 0 )
532 Q_ASSERT (pStem->GetKeyLength() > 0);
533 m_StemPtrList->append(pStem);
536 Q_ASSERT ( m_PieceCount <= m_LengthOfPieceVector ) ;
538 for (int wordno = 0; wordno < pStem->GetWordPtrList()->size(); wordno++)
540 pWord = pStem->GetWord(wordno);
541 Q_ASSERT (pWord->GetKeyLength() > 0);
542 m_WordPtrList->append (pWord);
544 pStem->SetSuffixSignature (this);
546 m_Robustness = 0;
547 m_Robustness = GetRobustness();
550 //===================================================================================================//
552 // Accessors and setters
554 //===================================================================================================//
555 CSignature* CSignature::GetMentor ( ) { return m_Mentor; }
556 // <<-------------------------------------------------------------------------------------------------------->>
557 void CSignature::SetMentor ( CSignature* pSig )
559 m_Mentor = pSig;
560 if( pSig && pSig->GetMentorList() && pSig->GetMentorList()->indexOf (this) < 0) {
561 pSig->GetMentorList()->append( this );
566 int CSignature::GetNumberOfAffixes() const
569 if ( m_AffixLocation == STEM_FINAL || m_AffixLocation == WORD_FINAL)
571 return m_SuffixPtrList->count();
573 if ( m_AffixLocation == STEM_INITIAL || m_AffixLocation == WORD_INITIAL)
575 return m_PrefixPtrList->count();
577 return 0;
581 void CSignature::AppendSuffixPtr (CSuffix* pSuffix) { m_SuffixPtrList->append(pSuffix);}
582 QList<CSignature*>* CSignature::GetMentorList( ) { return m_MentorList; }
583 int CSignature::GetNumberOfStems() const { return m_StemPtrList->count(); }
584 //int CSignature::GetNumberOfSuffixes () const { return m_SuffixPtrList->count(); }
585 void CSignature::SetRemark ( QString remark) { m_Remark = remark; }
586 CPrefix* CSignature::GetPrefix(int prefixno) const { return m_PrefixPtrList->at(prefixno); }
587 QList<CPrefix*>* CSignature::GetPrefixPtrList() const { return m_PrefixPtrList; }
588 QString CSignature::GetRemark() const { return m_Remark; }
589 QList<CStem*>* CSignature::GetStemPtrList() const { return m_StemPtrList;}
590 CStem* CSignature::GetStem(int stemno) const { return m_StemPtrList->at(stemno); }
591 CSuffix* CSignature::GetSuffix(int suffixno) const { return m_SuffixPtrList->at(suffixno); }
592 QList<CSuffix*>* CSignature::GetSuffixPtrList() const { return m_SuffixPtrList; }
593 int CSignature::GetTotalCount() const { return m_TotalCount; }
594 double CSignature::GetCorpusCount() const { return corpus_count::GetCorpusCount();}
595 float CSignature::GetSortingQuantity() const { return (float) GetRobustness();}
597 bool CSignature::StemListContains(CStem* pstem) { return m_StemPtrList->contains(pstem); }
598 void CSignature::AppendStemPtr(CStem* pStem) const { m_StemPtrList->append(pStem);}
601 eAffixLocation CSignature::GetAffixLocation() const { return m_AffixLocation; }
602 // <<-------------------------------------------------------------------------------------------------------->>
603 CStem* CSignature::GetWord(int stemno, int affixno) const
605 if (stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes())
606 return NULL;
607 if (stemno * GetNumberOfAffixes() + affixno >= m_WordPtrList->size() )
608 return NULL;
609 return m_WordPtrList->at(stemno* GetNumberOfAffixes() + affixno);
611 CParse CSignature::GetStems()
613 CParse List;
616 List.Alphabetize();
618 if ( m_StemPtrList->count() == 0 ) { return List; } // ********** This is clearly a mistake. Fix it.
619 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
621 List.Append( GetStem(stemno)->GetKey() );
623 return List;
625 // <<-------------------------------------------------------------------------------------------------------->>
634 // <<-------------------------------------------------------------------------------------------------------->>
635 double CSignature::GetStemFrequency(int stemno ) const {
636 if (stemno < 0 || stemno > GetNumberOfStems() ) return 0;
637 return m_StemFrequencies[stemno];
640 // <<-------------------------------------------------------------------------------------------------------->>
641 double CSignature::GetAffixFrequency(int affixno ) const {
642 if (affixno < 0 || affixno > GetNumberOfAffixes() ) {return 0; }
643 return m_AffixFrequencies[affixno];
645 // <<-------------------------------------------------------------------------------------------------------->>
646 double CSignature::GetStemCount(int stemno) const {
647 if (stemno < 0 || stemno > GetNumberOfStems() ){ return 0; }
648 return m_StemCounts[stemno];
650 // <<-------------------------------------------------------------------------------------------------------->>
652 double CSignature::GetAffixCount(int affixno) const
653 { if (affixno < 0 || affixno > GetNumberOfAffixes() ) return 0;
654 return m_AffixCounts[affixno];
656 // <<-------------------------------------------------------------------------------------------------------->>
657 double CSignature::GetWordCount(int wordno)const {
658 if (wordno < 0 || wordno > GetNumberOfWords() ) { return 0;}
659 return m_WordCounts[wordno]; }
660 // <<-------------------------------------------------------------------------------------------------------->>
662 //===================================================================================================//
664 // Calculate frequencies and counts
666 //===================================================================================================//
667 void CSignature::CalculateFrequencies(CMiniLexicon* Lexicon)
669 CStringSurrogate Suffix;
670 CSuffix* pSuffix;
671 CStem* pStem;
672 CCorpusWord* pCorpusWord;
673 Q_ASSERT( GetCorpusCount() > 0);
674 int TotalCorpusCount = 0;
675 int* SuffixCount = new int [ Size()+ 1 ];
676 for (int suffixno = 1; suffixno <= Size(); ++suffixno)
677 { SuffixCount[suffixno] = 0; }
679 for (int suffixno = 1; suffixno <= Size(); suffixno++)
681 Suffix = GetPiece(suffixno);
682 pSuffix = new CSuffix(Suffix);
684 for (int stemno= 0; stemno < GetNumberOfStems(); stemno++)
686 pStem = GetStem(stemno);
687 pCorpusWord = Lexicon->FindAWord (pStem, pSuffix);
688 if( pCorpusWord ) // might not exist if we have collapsed signatures.
690 TotalCorpusCount += pCorpusWord->GetCorpusCount();
691 SuffixCount[suffixno] += pCorpusWord->GetCorpusCount();
696 delete [] SuffixCount;
699 // <<-------------------------------------------------------------------------------------------------------->>
700 void CSignature::ListDisplay(Q3ListView* List,
701 QMap<QString, QString>* filter, bool ExpressDeletees)
703 CSignature sig(m_pMyMini);
704 Express(sig, ExpressDeletees);
705 QString text = sig.Display('.', filter);
707 static_cast<void>(new CSignatureListViewItem(
708 List, text, m_pMyMini->GetIndex(), this, filter));
711 // <<-------------------------------------------------------------------------------------------------------->>
712 void CSignature::FindCorpusCount( )
714 SetCorpusCount ( 0 );
715 for (int stemno =0; stemno < GetNumberOfStems(); stemno++) {
716 for (int affixno = 0; affixno < GetNumberOfAffixes(); affixno ++)
717 IncrementCorpusCount ( GetWord(stemno, affixno)->GetCorpusCount() );
720 // <<-------------------------------------------------------------------------------------------------------->>
721 void CSignature::AttachToSuffixSig(CStem* pStem, bool bLookAtPreviousSig) //add to tail of list.
723 int stemno;
724 int numberofaffixes = GetNumberOfAffixes();
725 CStem* pWord;
726 CSignature* pOldSig = pStem->GetSuffixSignature();
727 QString stem = pStem->Display();
729 /* First, remove pStem from any other SuffixSignature it might be linked to.*/
730 if ( pOldSig && pOldSig != this ) {
731 pOldSig->DetachStem( pStem, eDo_Not_Call_Words );
732 pOldSig->RecalculateStemAndWordPointers();
735 stemno = m_StemPtrList->indexOf ( pStem );
736 if( stemno < 0 ) {
737 m_StemPtrList->append( pStem );
738 stemno = GetNumberOfStems()-1;
741 switch( m_AffixLocation){
742 case (WORD_FINAL):
743 case (STEM_FINAL):
744 for (int affixno = 0; affixno < numberofaffixes; affixno++)
746 pWord = GetLexicon()->GetWordFromStemSuffix(pStem, GetSuffix(affixno));
747 if (pWord)
749 AppendWordPointer( pWord);
750 pWord->SetSuffixSignature (this);
752 else
754 AppendWordPointer(NULL);
757 break;
758 case (WORD_INITIAL):
759 case (STEM_INITIAL):
760 for (int prefixno = 0; prefixno < numberofaffixes; prefixno++)
762 pWord = GetLexicon()->GetWordFromStemPrefix(pStem, GetPrefix(prefixno));
763 if (pWord)
765 AppendWordPointer( pWord);
766 pWord->SetPrefixSignature (this);
768 else
770 AppendWordPointer(NULL);
773 break;
776 pStem->SetSuffixSignature( this );
777 IncrementCorpusCount( pStem->GetCorpusCount()-1 );// first time CC is incremented
779 m_Robustness = 0;
780 m_Robustness = GetRobustness();
782 // <<-------------------------------------------------------------------------------------------------------->>
783 void CSignature::AttachToPrefixSig( CStem* pStem, bool bLookAtPreviousSig ) //add to tail of list.
785 CStem* pWord;
786 CSignature* pOldSig = pStem->GetPrefixSignature();
788 /* First, remove pStem from any other PrefixSignature it might be linked to.*/
789 if ( pOldSig && pOldSig != this ) {
790 pOldSig->DetachStem( pStem, eDo_Not_Call_Words );
791 RecalculateStemAndWordPointers();
794 if( m_StemPtrList->indexOf ( pStem ) < 0 ) {
795 AppendStemPtr( pStem );
798 // move the Words from the old signature to this, the new one.
800 for (int wordno = 0; wordno < pStem->GetNumberOfWords(); wordno++) {
801 pWord = pStem->GetWord(wordno);
802 m_WordPtrList->append (pWord);
803 pWord->SetPrefixSignature (this);
808 pStem->SetPrefixSignature( this );
809 IncrementCorpusCount( pStem->GetCorpusCount()-1 );
810 m_Robustness = GetRobustness();
813 // <<-------------------------------------------------------------------------------------------------------->>
814 double CSignature::GetRobustness() const
816 int SuffixLetters = 0,
817 StemLetters = 0;
819 if (m_Robustness == 0)
821 SuffixLetters = GetKeyLength();
822 QString Null = "NULL";
823 if ( Contains( CStringSurrogate(Null.unicode(),0,Null.length()) ) ) { SuffixLetters -= 4; }
825 CStem* pStem;
826 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
827 pStem = GetStem(stemno);
828 StemLetters += pStem->GetKeyLength();
831 m_Robustness = ( Size() - 1 ) * StemLetters + (GetNumberOfStems() - 1) * SuffixLetters;
834 return m_Robustness;
836 // <<-------------------------------------------------------------------------------------------------------->>
837 void CSignature::SetRobustness ( double R ) { m_Robustness = R; }
838 // <<-------------------------------------------------------------------------------------------------------->>
840 // the counts of each individual word analyzed by this signature.
841 //double* CSignature::GetWordCounts() const { return m_WordCounts;
843 // <<-------------------------------------------------------------------------------------------------------->>
844 double CSignature::GetWordCount(int stemno, int affixno) const
846 if ( stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes() ) return 0;
847 return m_WordCounts[stemno * GetNumberOfStems() + affixno];
849 // <<-------------------------------------------------------------------------------------------------------->>
850 void CSignature::SetWordCount (int stemno, int affixno, double value)
852 if ( stemno < 0 || affixno < 0 || stemno >= GetNumberOfStems() || affixno >= GetNumberOfAffixes() )
853 return;
854 m_WordCounts[stemno * GetNumberOfAffixes() + affixno] = value;
855 return;
859 // <<-------------------------------------------------------------------------------------------------------->>
861 void CSignature::CalculateWordCounts()
862 { QString string;
864 int numberofstems = GetNumberOfStems();
865 int numberofaffixes = GetNumberOfAffixes();
866 int count = 0;
868 CStem* pWord;
870 m_WordCounts.clear();
871 m_WordCounts.resize(numberofstems*numberofaffixes);
872 m_StemCounts.clear();
873 m_StemCounts.resize(numberofstems);
874 m_AffixCounts.clear();
875 m_AffixCounts.resize(numberofaffixes);
876 m_TotalCount = 0;
877 for (int affixno = 0; affixno < numberofaffixes; affixno++) { m_AffixCounts[affixno] = 0; }
878 for (int stemno = 0; stemno < numberofstems; stemno++) { m_StemCounts[stemno] = 0; }
882 for (int stemno = 0; stemno < numberofstems; stemno++)
884 for ( int affixno = 0; affixno < numberofaffixes; affixno++)
886 pWord = GetWord(stemno, affixno);
887 count = pWord->GetCorpusCount();
888 // SetWordCount (stemno, affixno, count);
889 // m_StemCounts[stemno] = m_StemCounts[stemno] + count;
890 // m_AffixCounts[affixno] = m_AffixCounts[affixno] + count;
891 // m_TotalCount += count;
895 if (m_TotalCount <= 0) return;
897 m_WordFrequencies.resize(numberofstems*numberofaffixes);
898 m_StemFrequencies.resize(numberofstems);
899 m_AffixFrequencies.resize(numberofaffixes);
901 for ( int stemno = 0; stemno < numberofstems; stemno++)
903 m_StemFrequencies[stemno] = m_StemCounts[stemno]/m_TotalCount;
904 for ( affixno = 0; affixno < numberofaffixes; affixno++)
906 wordno = stemno * numberofaffixes + affixno;
907 m_WordFrequencies[wordno] = GetWordCount(stemno, affixno) / m_TotalCount;
911 for (int affixno = 0; affixno < numberofaffixes; affixno++){
912 m_AffixFrequencies[affixno] = m_AffixCounts[affixno] / m_TotalCount;
918 //=================================================================================================/
920 // TODO: make sure COST function is consistent with older versions and working right
921 double CSignature::FindCost(CMiniLexicon* Lexicon)
923 //=================================================================================================/
927 Cost of a sig =
929 Sum over all of its stems :
931 log ( CorpusSize / Stem-count ) ( cost )
932 length ( stem ) * cost of a letter ( savings )
934 Sum over all of its suffixes:
936 log ( CorpusSize / suffix-count ) ( cost )
937 length ( suffix ) * cost of a letter ( savings )
940 CStem* pStem;
941 double Cost = 0,
942 AffixCost = 0,
943 AffixSavings = 0,
944 SignatureCost = 0,
945 StemCost = 0,
946 StemSavings = 0,
947 CostOfALetter = base2log (26),
948 ThisAffixCost = 0,
949 NumberOfWords = Lexicon->GetWords()->GetCount();
950 CAffix* pAffix;
953 for (int affixno = 1; affixno <= Size(); affixno++)
955 if( m_AffixLocation == WORD_FINAL || m_AffixLocation == STEM_FINAL )
957 pAffix = *Lexicon->GetSuffixes() ^= GetPiece(affixno);
959 else
961 pAffix = *Lexicon->GetPrefixes() ^= GetPiece(affixno);
964 if ( pAffix ) // it already exists
966 ThisAffixCost = base2log ( NumberOfWords / pAffix->GetUseCount() );
968 else
970 ThisAffixCost = base2log ( NumberOfWords/GetNumberOfStems() );
971 ThisAffixCost += GetPiece(affixno).GetLength() * CostOfALetter;
973 AffixCost += ThisAffixCost;
975 AffixSavings += GetPiece(affixno).GetLength() * CostOfALetter;
977 SignatureCost += ThisAffixCost;
981 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
983 pStem = m_StemPtrList->at(stemno);
984 StemCost += base2log ( NumberOfWords / Size() ); // Size is the number of words that use stem, of course.
985 StemCost += pStem->GetKeyLength() * CostOfALetter;
986 StemSavings += pStem->GetKeyLength() * CostOfALetter * Size(); // save for each time stem appears, with each suffix
987 SignatureCost += StemCost;
990 Cost = AffixCost + StemCost - AffixSavings - StemSavings + SignatureCost;
992 return Cost;
996 // <<-------------------------------------------------------------------------------------------------------->>
998 void CSignature::OutputSignature( QTextStream& outf )
1002 QString string;
1003 CStem* pStem;
1006 outf << " ------------------------------------------------------------------------------------------ " << endl;
1007 outf << Display( '.', m_pMyMini->GetOutFilter() );
1008 outf << endl << " ------------------------------------------------------------------------------------------ " << endl;
1010 outf << endl;
1011 outf << " ";
1015 outf << "Number of stems: ";
1016 outf << QString("%1").arg( (int) GetNumberOfStems() );
1018 outf << " Corpus count: ";
1019 outf << QString("%1").arg( GetCorpusCount() );
1020 outf << " ";
1022 outf << " ";
1023 outf << GetRemark().replace( QChar(' '), "_" );
1024 outf << " ";
1026 outf << "Number of affixes: ";
1027 outf << GetNumberOfAffixes();
1028 outf << " Word Pointer List length: ";
1029 outf << m_WordPtrList->count();
1030 outf << endl;
1033 QStringList stems;
1035 CalculateWordCounts();
1036 int maxlength = 0;
1037 CStem* pWord;
1039 outf.setFieldAlignment( QTextStream::AlignLeft );
1040 QList< QPair<CStem*, int> > pstems;
1041 for (int stemno =0; stemno< GetNumberOfStems(); stemno++ )
1043 pStem = GetStem(stemno);
1044 pstems.append( qMakePair(pStem, pStem->GetCorpusCount() ) );
1045 if (pStem->GetKeyLength() > maxlength) { maxlength = pStem->GetKeyLength();}
1047 qSort(pstems.begin(), pstems.end(), stemlessthan);
1049 outf << "Sorted by stem frequency: " << endl << endl;
1050 outf << "# Rank | Stem | Words .... " << endl;
1051 outf << "# ------------------------------------------------------------------------------------------ " << endl;
1054 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
1056 outf.setf(2);
1057 outf.width(5);
1058 pStem = pstems[stemno].first;
1059 outf. width(6);
1060 outf << stemno;
1061 outf. width( maxlength + 5);
1062 outf << pStem->Display();
1063 outf.width (9);
1064 outf << pstems[stemno].second;
1065 outf << endl;
1067 outf << endl << "# ------------------------------------------------------------------------------------------ " << endl;
1068 outf << endl << endl <<"Display all words with counts: " << endl;
1069 outf << "# ------------------------------------------------------------------------------------------ " << endl;
1071 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
1073 for ( int affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
1075 pWord = GetWord(stemno, affixno );
1076 if (pWord)
1078 outf.setFieldWidth (maxlength + 5); outf << pWord->Display();
1079 outf.setFieldWidth (5) ; outf << string.setNum( pWord->GetCorpusCount() );
1082 outf << endl;
1084 outf << endl << endl;
1090 /* This purpose of this function is to take a signature of the form A.SUFFIX
1091 and make it NULL.SUFFIX (the pAlternateSig), and move that letter A back onto its stems.
1094 // <<-------------------------------------------------------------------------------------------------------->>
1095 void CSignature::RemoveLetter (CStringSurrogate& ssLetter, CMiniLexicon* Lexicon, CSignature* pAlternateSig)
1098 CStem* qStem;
1099 CSuffix* pSuffix,
1100 *pNewSuffix = NULL;
1101 QString Stem,
1102 Suffix,
1103 Null = "NULL";
1104 QString OldKey = Display();
1105 CStringSurrogate ssSuffix,
1106 ssStem;
1107 CStem* pWord;
1108 CSignature NewSig ( WORD_FINAL, Lexicon );
1109 int LetterLength = ssLetter.GetLength();
1111 CSignature *qSig = NULL,
1112 *pOlderSig = NULL;
1113 CParse PSuffix,
1114 PWord,
1115 PNewStem;
1117 QMap<QString,CSuffix*> SuffixPtrTranslation;
1119 /* Create the NewSig */
1120 for (int affixno = 1; affixno <= Size(); affixno++)
1122 ssSuffix = GetPiece(affixno);
1123 if(!NewSig.GetSortStyle()== eAlphabetized) NewSig.Alphabetize();
1124 if ( ssSuffix == ssLetter )
1126 if(!NewSig.GetSortStyle()==eAlphabetized) NewSig.Alphabetize();
1127 NewSig.Append ( CStringSurrogate(Null.unicode(),0,Null.length()) );
1129 else
1131 QString lt_brak = "<", rt_brak = ">";
1133 PSuffix = CStringSurrogate(lt_brak.unicode(),0,1);
1134 PSuffix += ssLetter;
1135 PSuffix += CStringSurrogate(rt_brak.unicode(),0,1);
1136 PSuffix += ssSuffix;
1138 pSuffix = *Lexicon->GetSuffixes() << PSuffix;
1140 Suffix = "<" + ssLetter.Display() + ">" + ssSuffix.Display();
1141 SuffixPtrTranslation[ ssSuffix.Display() ] = pSuffix; // based on old suffix
1142 // SuffixStringTranslation[ ssSuffix.Display() ] = Suffix;
1144 NewSig.Append ( PSuffix.GetKey() );
1148 /* Change the KEY of this signature */
1150 SetKey ( NewSig );
1151 QString remark = GetRemark() + " +allomorphy";
1152 SetRemark ( remark );
1154 //-----------------------------------------------------------//
1155 // Change the signature, the stems, the words -- and the suffixes.
1156 //-----------------------------------//
1157 /* Deal with the stems */
1158 //-----------------------------------//
1161 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
1163 CStem* pStem = m_StemPtrList->at(stemno);
1164 ssStem = pStem->GetKey();
1165 PNewStem = ssStem + ssLetter;
1166 qStem = *Lexicon->GetStems() ^= PNewStem;
1168 if (qStem) // -- if the larger one already existed
1170 pOlderSig = *Lexicon->GetSignatures() ^= qStem->GetSuffixList();
1172 // this removes both stem and word from signature:
1173 pOlderSig -> DetachStem ( qStem, eCall_Words ); // we might want to eliminate this sig if it has no more stems
1175 qStem -> GetSuffixList()->MergeAndAlphabetizeParse( CParse(NewSig) );
1177 qSig = *Lexicon->GetSignatures() << qStem->GetSuffixList();
1179 // attaches both stems and words to qSig
1180 qSig -> AttachToSuffixSig(qStem, false);
1183 else // make the old stem into this new one
1185 pStem -> RepairSuffixList ( Lexicon );
1186 Lexicon -> GetStems()-> SetKey( pStem, PNewStem );
1187 pStem -> SetKey( PNewStem );
1191 Q_ASSERT(m_StemPtrList->size() > 0);
1192 CStem* pStem = m_StemPtrList->at(m_StemPtrList->size() - 1);
1194 //---------------------------------------------//
1195 /* Deal with the WORDs of this signature */
1196 //---------------------------------------------//
1198 for (int wordno = 0; wordno < m_WordPtrList->size(); wordno++)
1200 pWord = m_WordPtrList->at(wordno);
1201 pNewSuffix = SuffixPtrTranslation[ pWord->GetSuffix().Display() ];
1202 pWord -> ShiftStemSuffixBoundary ( LetterLength );
1204 pWord -> SetSuffixPtr ( pNewSuffix );
1205 pWord -> AttachWordAndSuffixalStem ( pStem );
1206 pWord -> SetSuffixSignature ( this );
1210 //------------------------------------------------------------//
1211 // Alternate Sig
1212 //------------------------------------------------------------//
1213 /* Shift stems from AlternateSig to the NewSig, but NOT
1214 if the stem ends with Letter; if it does, we'll
1215 keep the old signature with that stem.
1218 This will replace some or all of pAlternateSig --
1219 "some" when there are any stems that don't allow removal of the Letter.
1220 For example, NULL.ing will not disappear when <e>ing.NULL is created,
1221 because the stem "be" still requires NULL.ing --
1224 // Deal with stems in AlternateSig....
1226 for (int stemno = 0; stemno < pAlternateSig->GetNumberOfStems(); stemno++)
1228 pStem = pAlternateSig->GetStem(stemno);
1229 ssStem = pStem->GetKey();
1230 if ( ssStem.Right(LetterLength) == ssLetter )
1231 { continue; }
1233 pAlternateSig->DetachStem( pStem, eCall_Words );
1234 AttachToSuffixSig( pStem, false );
1236 // Deal with Words in Alternate signature
1238 for (int stemno = 0; stemno < pAlternateSig->GetNumberOfStems(); stemno++)
1240 pWord = pAlternateSig->GetStem(stemno);
1241 pNewSuffix = SuffixPtrTranslation[ pWord->GetSuffix().Display() ];
1243 pWord -> SetSuffixPtr ( pNewSuffix );
1244 pWord -> AttachWordAndSuffixalStem ( pStem );
1245 pWord ->SetSuffixSignature ( this );
1248 //------------------------------------------------------------//
1250 /* Get rid of the Alternate Sig ("NULL.ing" ) */
1252 if ( pAlternateSig->GetNumberOfStems() == 0 )
1254 Lexicon->GetSignatures()->Remove( pAlternateSig );
1259 // <<------------------------------------------------------------------------>>
1260 bool CSignature::EachSuffixCanHaveThisLetterPrefixedToIt ( const QString& Letter)
1262 QString Suffix;
1263 for (int affixno = 1; affixno <= Size(); ++affixno) {
1264 Suffix = GetPiece(affixno).Display();
1265 if (Suffix == "NULL" ) { Suffix = ""; }
1266 Suffix = Letter + Suffix;
1267 if(0)// TODO: if ( ! (*Lexicon->GetSuffixes() ^= Suffix ) )
1269 return FALSE;
1273 return TRUE;
1275 // <<------------------------------------------------------------------------>>
1276 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance,
1277 const QString& Piece)
1279 struct not_implemented { };
1280 throw not_implemented();
1282 // XXX. suppresses “unused parameter” warnings
1283 static_cast<void>(Distance);
1284 static_cast<void>(Piece);
1286 foreach (CStem* word, *m_WordPtrList) {
1287 word->ShiftStemSuffixBoundary(-1);
1288 Q_ASSERT(word->GetStemLoc() != 0);
1291 foreach (CStem* stem, *m_StemPtrList) {
1292 CStringSurrogate stem_text = stem->GetKey();
1293 stem->ClearParse();
1294 stem->SetKey(stem_text.Left(stem_text.GetLength() - 1));
1296 // XXX. Check to see if the new stem already exists.
1297 // Lexicon->GetStems()->GetHash()->RemoveKey ( Stem );
1298 // Lexicon->GetStems()->GetHash()->SetAt( NewStem, pStem );
1299 // Lexicon->GetStems()->SetKey( pStem, NewStem );
1302 // XXX. fix the signature
1303 // AddLetter ( 1, Piece );
1305 // Lexicon->AddToScreen ( Display() );
1308 // Variant in which the shifted string varies from stem to stem.
1309 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance)
1311 // XXX. suppresses “unused parameter” warning
1312 static_cast<void>(Distance);
1313 struct not_implemented { };
1314 throw not_implemented();
1316 // first, fix the words;
1317 foreach (CStem* word, *m_WordPtrList) {
1318 word->ShiftStemSuffixBoundary(-1);
1319 Q_ASSERT(word->GetStemLoc() != 0);
1322 // XXX. fix the signature
1323 // AddLetter ( 1, Piece );
1325 // Lexicon->AddToScreen ( Display() );
1328 void CSignature::AddLetter(const QString& Letter )
1330 PrefixToAllPieces ( CStringSurrogate(Letter.unicode(),0,Letter.length() ) );
1334 /// Looks at the final ngrams of the stems, and calculates its entropy
1335 double CSignature::ComputeFinalNgramEntropyOfStems(int n)
1337 TCollection<CLParse> Ngrams;
1338 foreach (CStem* pStem, *m_StemPtrList) {
1339 if (pStem->GetKeyLength() <= n)
1340 // too short
1341 return -1;
1343 CStringSurrogate ssPiece = pStem->GetKey();
1344 ssPiece = is_initial(GetAffixLocation()) ?
1345 ssPiece.Left(n) : ssPiece.Right(n);
1346 Ngrams << ssPiece;
1349 double Entropy = 0.0;
1350 const double StemCount = GetNumberOfStems();
1351 const int ngram_count = Ngrams.GetCount();
1352 for (int i = 0; i < ngram_count; ++i) {
1353 const double fraction = StemCount / Ngrams[i]->GetCorpusCount();
1354 Entropy += log2(fraction) / fraction;
1357 return Entropy;
1359 //===================================================================================================//
1361 // CHECK OUT: major function
1363 //===================================================================================================//
1364 /// Test to see whether the break with its stems is a good one.
// CheckOut: decides whether material at the affix-side edge of this
// signature's stems should be moved from the stems onto the affixes.
//
// Steps, as visible in the code below:
//   1. Logs the signature and (if logging is on) its stems.
//   2. For n = 1..LengthToConsiderShifting, calls
//      ComputeFinalNgramEntropyOfStems(n): a negative result skips that n,
//      a result >= EntropyThreshold stops the scan, anything else records n
//      as LargestSizeChunkToPullOffStem. Returns 0 if no n qualified.
//   3. Computes CurrentDL, the description length of the present analysis
//      (pointers to affixes + prorated affix listings + stem pointers).
//   4. For each shift length from LargestSizeChunkToPullOffStem down to 1,
//      collects the distinct edge pieces of the stems, builds a candidate
//      signature per piece, and sums the candidates' description lengths
//      into AllNewSigsAnalysisDL; the smallest total below WinningDL wins.
//
// Parameter: Lexicon supplies the affix/stem/signature collections and the
//            log file.
// Returns:   the winning number of letters to shift, or 0 to keep the
//            signature as it is.
//
// NOTE(review): the brace-only lines are not visible in this view, so the
// exact nesting of the inner loops should be confirmed against the file.
1365 int CSignature::CheckOut(CMiniLexicon* Lexicon)
1367 using linguistica::implicit_cast;
1368 // Throughout, “DL” stands for “description length”.
1369 Lexicon->LogFileSmallTitle( Display() );
1370 if (Lexicon->LogFileOn()) {
1371 // dump stem list to log file.
1372 Lexicon->LogFileStartTable();
1373 Lexicon->LogFileStartRow();
1374 const int num_columns = 5;
1376 // For each stem:
1377 CParse Stems = GetStems();
1378 for (int stemno = 1; stemno <= GetNumberOfStems(); ++stemno) {
1379 if (stemno % num_columns == 0) {
1380 Lexicon->LogFileEndRow(); Lexicon->LogFileStartRow();
1382 Lexicon->LogFile( Stems[stemno].Display());
1384 Lexicon->LogFileEndRow(); Lexicon->LogFileEndTable();
1385 } // end of logfile on
1386 Lexicon->LogFileHeader("Number of letters","Entropy", "Resolution?" );
1387 bool LowEntropyFlag = false;
1388 int LargestSizeChunkToPullOffStem = 0;
1389 // Use entropy to see how many letters to consider shifting
1390 // XXX. Make this user-changeable.
1391 const double EntropyThreshold = 1.5;
1392 const int LengthToConsiderShifting = 4;
// Entropy scan: "continue" on a negative (too-short) result, "break" on the
// first n whose entropy reaches the threshold.
1393 for (int n = 1; n <= LengthToConsiderShifting; ++n) {
1394 const double Entropy = ComputeFinalNgramEntropyOfStems(n);
1396 if (Entropy < 0) {
1397 // Negative entropy:
1398 // stem too short to consider shortening.
1399 Lexicon->LogFile("", "", "No reanalysis");
1400 continue;
1403 if (Entropy >= EntropyThreshold) {
1404 Lexicon->LogFile ("", "", "Entropy too large.");
1405 break;
1408 // set of n-suffixes of stems has low entropy:
1409 // maybe stems have a common suffix that should be
1410 // incorporated into the signature.
1411 LowEntropyFlag = true;
1412 LargestSizeChunkToPullOffStem = n;
1413 Lexicon->LogFile(n, Entropy, "Entropy sufficiently small.");
1414 } //end of loop on n
1415 Lexicon->LogFileEndTable();
1416 if (!LowEntropyFlag)
1417 // Not enough stems share common endings to restructure,
1418 // so leave this signature alone.
1419 return 0;
1421 const bool analyzingSuffixes = !is_initial(GetAffixLocation());
1423 const double TotalNumberOfAnalyzedWords =
1424 Lexicon->GetSignatures()->GetTotalNumberOfWords();
1425 const double LogTotalNumberOfAnalyzedWords =
1426 base2log(TotalNumberOfAnalyzedWords);
1427 const double LengthOfPointerToThisSig =
1428 LogTotalNumberOfAnalyzedWords -
1429 base2log(Size() * GetNumberOfStems());
1431 // Description length of the original analysis
1432 double CurrentDL;
1434 // DL of this signature:
1436 // a. Length of pointers to its suffixes; var: LengthOfPointersToAllAffixesOfSig
1437 // b. Prorated responsibility for phonological content of suffixes
1438 // var: TotalResponsibilityForAffixListings
1439 // c. List of pointers from each stem to this signature
1440 // var: StemPointersToThisSig;
1441 // d. List of pointers from each word to its suffix
1443 // Compute DL of 'original' analysis.
1444 Lexicon->LogFileSmallTitle ("Description length of current signature");
1445 Lexicon->LogFileHeader("Affix", "Use count", "Pointer to this affix"); ;
1447 double LengthOfPointersToAllAffixesOfSig = 0.0;
1448 double TotalResponsibilityForAffixListings = 0.0;
1449 // for each suffix (resp. prefix) in this signature:
1450 for (int affixno = 1; affixno <= Size(); ++affixno) {
1451 QString Affix = GetPiece(affixno).Display();
1452 CAffix* pAffix = analyzingSuffixes
1453 ? implicit_cast<CAffix*>(
1454 *Lexicon->GetSuffixes() ^= Affix)
1455 : implicit_cast<CAffix*>(
1456 *Lexicon->GetPrefixes() ^= Affix);
1458 // Length of pointers to affixes
1459 // part a
// NOTE(review): pAffix is dereferenced here without a null check, whereas
// the candidate-signature loop further down tests "pAffix != 0" first.
1460 const double LengthOfPointerToThisAffix =
1461 LogTotalNumberOfAnalyzedWords -
1462 base2log(pAffix->GetUseCount());
1463 LengthOfPointersToAllAffixesOfSig +=
1464 LengthOfPointerToThisAffix;
1466 Lexicon->LogFile(Affix, pAffix->GetUseCount(), LengthOfPointerToThisAffix);
1468 // use count of affix; length of pointer to this affix.
1469 // Assign partial responsibility for this signature's
1470 // suffixes' entries.
1471 // part b.
1472 const double LocalProportion =
1473 double(GetNumberOfStems()) / pAffix->GetUseCount();
1474 const double ResponsibilityForThisAffixListing =
1475 LocalProportion * Affix.length() * base2log(26);
1476 TotalResponsibilityForAffixListings +=
1477 ResponsibilityForThisAffixListing; // in *bits*
1478 }// end of affixno loop
1480 Lexicon->LogFileEndTable();
1481 Lexicon->LogFileStartTable();
1482 Lexicon->LogFile("Part 1: Length of pointer to affixes", LengthOfPointersToAllAffixesOfSig);
1483 Lexicon->LogFile("Part 2: Prorated responsibility for phonology of affixes:", TotalResponsibilityForAffixListings);
1485 // part c.
1486 const double StemPointersToThisSig =
1487 GetNumberOfStems() * LengthOfPointerToThisSig;
1489 // In sum:
1490 const double total_dl =
1491 LengthOfPointersToAllAffixesOfSig +
1492 TotalResponsibilityForAffixListings +
1493 StemPointersToThisSig;
1494 Lexicon->LogFile("Part 3: Stem poionters to this sig:", StemPointersToThisSig);
1495 Lexicon->LogFile("Length of 1 pointer to this sig: ", LengthOfPointerToThisSig);
1496 Lexicon->LogFile("Total", total_dl);
1497 Lexicon->LogFileEndTable();
1498 CurrentDL = total_dl;
1500 double WinningDL = CurrentDL;
1501 int WinningLengthOfStemToShift = 0;
1503 // We might shift only those stems for which the EndPiece
1504 // occurs in more than 45% of the stems of this sig (that
1505 // leaves open the case of two closely related letters
1506 // comprising almost all of the cases).
1507 // But for now, we're not doing that.
1509 // The outer loop here is for the case where the entropy test
1510 // tells us that 2 or more letters can be shifted
1511 // (e.g., sig on.ve can be shifted either to ion.ive or
1512 // tion.tive), and we want to evaluate both.
1514 // Major loop through alternatives to the current signature
1515 CParse WinningSig;
1516 // loop through different lengths to shift:
1517 for (int NumberOfLettersShifted = LargestSizeChunkToPullOffStem;
1518 NumberOfLettersShifted > 0;
1519 --NumberOfLettersShifted) {
// Collect the distinct edge pieces of this length; stems that are not
// longer than the shift length are left out.
1521 TCollection<CLParse> EndPieces;
1522 foreach (CStem* pStem, *m_StemPtrList) {
1523 if (pStem->GetKeyLength() <= NumberOfLettersShifted)
1524 continue;
1526 CStringSurrogate stem_text = pStem->GetKey();
1527 CStringSurrogate ssPiece = analyzingSuffixes
1528 ? stem_text.Right(NumberOfLettersShifted)
1529 : stem_text.Left(NumberOfLettersShifted);
1530 EndPieces << ssPiece;
1533 // XXX. The function is supple enough to move material
1534 // from the stem to the affix in some cases but not in others.
1536 double AllNewSigsAnalysisDL = 0.0;
1537 double TotalDecreaseInDLDueToShorterStems = 0.0;
1538 // each of these is a distinct piece being, perhaps,
1539 // transferred from stem(s) to affixes
1540 // for each string of this length that would have to be shifted:
1541 CParse Sig;
1542 for (int pieceno = 0; pieceno < EndPieces.GetCount(); ++pieceno) {
1543 CLParse* pPiece = EndPieces.GetAt(pieceno);
1545 // make a copy to play with.
1546 Sig = *this;
1548 if (analyzingSuffixes)
1549 Sig.PrefixToAllPieces2(pPiece->GetKey());
1550 else
1551 Sig.SuffixToAllPieces2(pPiece->GetKey());
1553 // DL of this signature:
1555 // a. Length of pointers to its suffixes;
1556 // var: LengthOfPointersToAllAffixesOfSig
1557 // b. Prorated responsibility for phonological
1558 // content of suffixes
1559 // var: TotalResponsibilityForAffixListings
1560 // c. List of pointers from each stem to this
1561 // signature
1562 // var: PointersToThisSig;
1563 // d. Savings because stems already existed
1564 // var: SavingsBecauseStemAlreadyExisted
1565 // e. Savings because stems are shorter
1566 // var: TotalDecreaseInDLDueToShorterStems :
1567 // once for each *length* being shifted from
1568 // stem to suffix
1569 // f. List of pointers from each word to its
1570 // suffix
1571 // XXX. not implemented.
1573 double LengthOfPointersToAllAffixesOfSig = 0.0;
1574 double TotalResponsibilityForAffixListings = 0.0;
1575 if (*Lexicon->GetSignatures() ^= Sig) {
1576 // new signature already exists
1577 Lexicon->LogFileSmallTitle("Alternative analysis already existed", Sig.Display('-'));
1578 // XXX. address this case!
1579 } else {
1580 // new signature
1581 Lexicon->LogFileSmallTitle("Conjectured signature: ", Sig.Display('-'));
1583 // iterate through suffixes of the signature
1584 Lexicon->LogFileHeader("Suffix", "Previous count", "New count", "Pointer length to this affix", "Responsibility for this affix (phonology) in bits:", "New DL for this affix");
1585 double ThisNewSigDL = 0.0;
1586 // for each suffix (resp prefix) in the new sig:
1587 for (int affixno = 1; affixno <= Size(); ++affixno) {
1588 CStringSurrogate ssAffix =
1589 Sig.GetPiece(affixno);
1591 CAffix* pAffix = analyzingSuffixes
1592 ? implicit_cast<CAffix*>(
1593 *Lexicon->GetSuffixes() ^= ssAffix)
1594 : implicit_cast<CAffix*>(
1595 *Lexicon->GetPrefixes() ^= ssAffix);
1596 double sum;
1597 if (pAffix != 0) {
// Affix already known: its use count grows by this signature's stem count.
1598 const double ResponsibilityForThisAffixListing =
1599 double(ssAffix.GetLength()) * base2log(26) *
1600 GetNumberOfStems() /
1601 (double(GetNumberOfStems()) +
1602 pAffix->GetUseCount());
1603 const double LengthOfPointerToThisAffix =
1604 LogTotalNumberOfAnalyzedWords -
1605 base2log(pAffix->GetUseCount() +
1606 GetNumberOfStems());
1608 TotalResponsibilityForAffixListings +=
1609 ResponsibilityForThisAffixListing;
1610 LengthOfPointersToAllAffixesOfSig +=
1611 LengthOfPointerToThisAffix;
1613 sum = ResponsibilityForThisAffixListing +
1614 LengthOfPointerToThisAffix;
1615 Lexicon->LogFile (ssAffix.Display(), pAffix->GetUseCount(), GetNumberOfStems() + pAffix->GetUseCount(), LengthOfPointerToThisAffix, ResponsibilityForThisAffixListing, sum);
1617 } else {
1618 // new affix
1619 const double ResponsibilityForThisAffixListing =
1620 double(ssAffix.GetLength()) * base2log(26);
1621 const double LengthOfPointerToThisAffix =
1622 LogTotalNumberOfAnalyzedWords -
1623 base2log(GetNumberOfStems());
1625 LengthOfPointersToAllAffixesOfSig +=
1626 LengthOfPointerToThisAffix;
1627 TotalResponsibilityForAffixListings +=
1628 ResponsibilityForThisAffixListing;
1629 sum = ResponsibilityForThisAffixListing +
1630 LengthOfPointerToThisAffix;
1631 Lexicon->LogFile(ssAffix.Display(), 0, GetNumberOfStems(), LengthOfPointerToThisAffix, ResponsibilityForThisAffixListing, sum);
1632 } //end of else
1633 ThisNewSigDL += sum;
1635 Lexicon->LogFile("Total", 0, 0, LengthOfPointersToAllAffixesOfSig, TotalResponsibilityForAffixListings, ThisNewSigDL);
1638 // Length of the pointers to the sig from its stems:
1639 double SavingsBecauseStemAlreadyExisted = 0.0;
1640 double StemPointersToThisSig;
// IterateThroughStems() overwrites TotalDecreaseInDLDueToShorterStems,
// StemPointersToThisSig and SavingsBecauseStemAlreadyExisted for this piece.
1641 IterateThroughStems(NumberOfLettersShifted,
1642 Lexicon,
1643 pPiece,
1644 TotalDecreaseInDLDueToShorterStems,
1645 LogTotalNumberOfAnalyzedWords,
1646 StemPointersToThisSig,
1647 SavingsBecauseStemAlreadyExisted,
1648 analyzingSuffixes);
1649 const double ThisNewSigDL =
1650 LengthOfPointersToAllAffixesOfSig +
1651 TotalResponsibilityForAffixListings +
1652 StemPointersToThisSig +
1653 -SavingsBecauseStemAlreadyExisted +
1654 -TotalDecreaseInDLDueToShorterStems;
1655 AllNewSigsAnalysisDL += ThisNewSigDL;
1656 Lexicon->LogFile("Part 1: Length of pointer to affixes: ", LengthOfPointersToAllAffixesOfSig);
1657 Lexicon->LogFile("Part 2: Prorated responsibility for phonology of affixes: ", TotalResponsibilityForAffixListings);
1658 Lexicon->LogFile("Part 3: Stem pointers to this sig:", StemPointersToThisSig);
1659 Lexicon->LogFile("Length of 1 poitner to this sig: ", LengthOfPointerToThisSig);
1660 Lexicon->LogFile("Part 4: Total savings from stems that had already existed", SavingsBecauseStemAlreadyExisted);
1661 Lexicon->LogFile("Part 5: Total decrease in DL due to shorter stems: ", TotalDecreaseInDLDueToShorterStems);
1662 Lexicon->LogFile("Total DL: ", ThisNewSigDL);
1664 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1665 "<br /><br />" <<
1666 QString("If we add %1 letters, total TD is %2")
1667 .arg(NumberOfLettersShifted).arg(AllNewSigsAnalysisDL) <<
1668 endl << "******" << endl <<
1669 "<br />";
// Keep the cheapest alternative seen so far.
1671 if (AllNewSigsAnalysisDL < WinningDL) {
1672 WinningDL = AllNewSigsAnalysisDL;
1673 WinningLengthOfStemToShift = NumberOfLettersShifted;
1674 WinningSig = Sig;
// WinningDL only differs from CurrentDL if some alternative was strictly
// cheaper (see the assignment above).
1678 if (WinningDL != CurrentDL) {
1679 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1680 SmallTitle(QString(
1681 "Change signature from \"%1\" to \"%2\"")
1682 .arg(Display(), WinningSig.Display('.'))) <<
1683 "<hr />";
1684 Lexicon->AddToScreen(
1685 QString("%1 >> %2")
1686 .arg(Display('.'), WinningSig.Display('.')));
1687 return WinningLengthOfStemToShift;
1688 } else {
1689 if (Lexicon->LogFileOn()) *Lexicon->GetLogFile() <<
1690 SmallTitle(QString(
1691 "%1: Conclusion: Keep original signature.")
1692 .arg(Display())) <<
1693 "<hr />";
1694 return 0;
1697 // <<-------------------------------------------------------------------------------------------------------->>
// IterateThroughStems: helper for CheckOut(). For one candidate edge piece
// (pPiece, of length NumberOfLettersShifted) it walks this signature's stems
// and fills in three output parameters:
//
//   SavingsBecauseStemAlreadyExisted   (out) reset to 0, then increased by
//       (shortened stem length) * base2log(26) for each matching stem whose
//       shortened form is already in Lexicon's stems or words.
//   TotalDecreaseInDLDueToShorterStems (out) reset to 0, then set to
//       (matching stems - pre-existing shortened stems)
//       * NumberOfLettersShifted * base2log(26).
//   StemPointersToThisSig              (out) matching stems *
//       (LogTotalNumberOfAnalyzedWords - base2log(Size() * matching stems)).
//
// A stem "matches" when its last (analyzingSuffixes) or first (otherwise)
// NumberOfLettersShifted letters display the same as pPiece.
//
// NOTE(review): the brace-only lines are not visible in this view; the
// nesting described here is inferred from the statements and should be
// confirmed against the file.
1698 void CSignature::IterateThroughStems( int NumberOfLettersShifted,
1699 CMiniLexicon* Lexicon,
1700 CLParse* pPiece,
1701 double& TotalDecreaseInDLDueToShorterStems,
1702 double LogTotalNumberOfAnalyzedWords,
1703 double& StemPointersToThisSig,
1704 double& SavingsBecauseStemAlreadyExisted,
1705 bool analyzingSuffixes)
1710 CStem* pStem;
1711 int HowManyStemsForThisSig = 0; //check that
1712 int NumberOfShortenedStemsThatPreExisted = 0;
1713 double ThisSavingBecauseStemAlreadyExisted = 0;
1714 double DecreaseInDLDueToShorterStems = 0;
1715 double LengthOfPointerToThisSig = 0;
1716 CSS ssNewStem;
// Both accumulating out-parameters start from zero on every call.
1718 TotalDecreaseInDLDueToShorterStems = 0;
1719 SavingsBecauseStemAlreadyExisted = 0;
1721 Lexicon->LogFile (pPiece->Display() );
1722 Lexicon->LogFileHeader( "Current stem", "Proposed stem", "Savings from preexisting stem");
1725 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
1727 pStem = m_StemPtrList->at(stemno);
1728 ThisSavingBecauseStemAlreadyExisted =0;
1729 int StemLength = pStem->GetKeyLength();
1730 ssNewStem = pStem->GetKey().Left(
1731 StemLength - NumberOfLettersShifted);
1733 if ( analyzingSuffixes ) // Suffixes
1735 if ( pStem->GetKey().Right(NumberOfLettersShifted).Display() == pPiece->Display() )
1737 HowManyStemsForThisSig++;
1738 Lexicon->LogFile (pStem->Display(), ssNewStem.Display());
1740 else
1742 Lexicon->LogFile(pStem->Display(), ssNewStem.Display());
1743 continue;
1745 ssNewStem = pStem->GetKey().Left( pStem->GetKeyLength() - NumberOfLettersShifted );
1747 else // Prefixes
1749 if ( pStem->GetKey().Left(NumberOfLettersShifted).Display() == pPiece->Display() )
1751 HowManyStemsForThisSig++;
1753 else
1755 continue;
1757 ssNewStem = pStem->GetKey().Right( pStem->GetKeyLength() - NumberOfLettersShifted );
// Only matching stems reach this point; non-matching ones hit "continue".
1762 if ( Lexicon->GetStems()->Contains( ssNewStem ) || // ** Was: "GetStems_Suffixed
1763 Lexicon->GetWords()->Contains( ssNewStem ) )
1765 NumberOfShortenedStemsThatPreExisted ++;
1766 ThisSavingBecauseStemAlreadyExisted = ssNewStem.GetLength() * base2log (26);
1767 SavingsBecauseStemAlreadyExisted += ThisSavingBecauseStemAlreadyExisted;
1769 // ** Add the cost of having a pointer to the stem ******
// NOTE(review): this logging condition compares Right(...) even when
// prefixes are being analyzed, and the branch below logs the literal text
// "ThisSavingBecauseStemAlreadyExisted" rather than the value -- confirm.
1773 if ( Lexicon->LogFileOn() &&
1774 ( pStem->GetKey().Right(NumberOfLettersShifted).Display() == pPiece->Display() ) )
1777 if ( ThisSavingBecauseStemAlreadyExisted > 0)
1779 Lexicon->LogFile("ThisSavingBecauseStemAlreadyExisted");
1780 } else
1782 Lexicon->LogFile("none (did not exist)");
// Stems whose shortened form did not already exist each save
// NumberOfLettersShifted letters, costed at base2log(26) bits per letter.
1788 DecreaseInDLDueToShorterStems = ( HowManyStemsForThisSig - NumberOfShortenedStemsThatPreExisted ) *
1789 NumberOfLettersShifted * base2log (26);
1790 TotalDecreaseInDLDueToShorterStems += DecreaseInDLDueToShorterStems ;
1793 LengthOfPointerToThisSig = LogTotalNumberOfAnalyzedWords - base2log ( Size() * HowManyStemsForThisSig ) ;
1794 StemPointersToThisSig = HowManyStemsForThisSig * ( LengthOfPointerToThisSig ) ;
1795 if ( Lexicon-> LogFileOn() )
1797 *Lexicon->GetLogFile() << // FILL THIS IN --
1799 StartTable <<
1800 StartTableRow <<
1801 MakeTableHeader("Current stem") <<
1802 MakeTableHeader("Proposed stem") <<
1803 MakeTableHeader("Savings from preexisting stem") <<
1804 EndTableRow;
1810 bool CSignature::IsValid()
1811 // tests that pieces of the signature are all non-null
1812 { for (int affixno = 1; affixno <= m_PieceCount; affixno++) {
1813 if ( GetPiece(affixno).GetLength() < 1 ) {
1814 return FALSE;
1817 return TRUE;
1819 // <<-------------------------------------------------------------------------------------------------------->>
1820 void CSignature::DetachStem(CStem* pStem, detachment_parameter Parameter)
1822 if( !m_StemPtrList->isEmpty() &&
1823 m_StemPtrList->indexOf( pStem ) >= 0 &&
1824 m_StemPtrList->remove( pStem ) )
1826 IncrementCorpusCount( -1 * pStem->GetCorpusCount() );
1828 if( Parameter != eDo_Not_Call_Words )
1830 CStem *pWord;
1831 for (int wordno = 0; wordno < pStem->GetNumberOfWords(); wordno++)
1832 { pWord = pStem->GetWordPtrList()->at(wordno);
1833 m_WordPtrList->removeOne( pWord );
1838 // <<-------------------------------------------------------------------------------------------------------->>
1839 void CSignature::DetachWord(CStem* pWord, enum detachment_parameter param)
1841 struct not_implemented { };
1842 throw not_implemented();
1844 if( !m_WordPtrList->empty() && m_WordPtrList->indexOf( pWord ) >= 0 )
1845 m_WordPtrList->removeOne(pWord); //
1847 // Suppress a warning.
1848 static_cast<void>(param);
1850 // <<-------------------------------------------------------------------------------------------------------->>
1851 void CSignature::TakeAllStems(CSignature* source)
1853 //QList<CStem*>& source_stems = *source->GetStemPtrList();
1854 CStem* pStem;
1855 for (int stemno = 0; stemno < source->GetNumberOfStems(); stemno++)
1857 pStem=source->GetStem(stemno);;
1858 pStem->SetSuffixList(this);
1859 AppendStemPtr(pStem);
1860 IncrementCorpusCount(pStem->GetCorpusCount());
1862 // Remove items from source.
1863 //Q_ASSERT(!source_stems.autoDelete());
1864 //source_stems.clear();
1865 source->ClearStemPtrList();
1867 // XXX. Decrement source corpus count in turn?
1868 // Hard to tell, since there are no call sites.
1870 // <<-------------------------------------------------------------------------------------------------------->>
1871 void CSignature::AddWord (CStem* pWord)
1873 m_WordPtrList->append (pWord);
1874 IncrementCorpusCount (pWord->GetCorpusCount() );
1877 void CSignature::ClearStemPtrList() { m_StemPtrList->clear(); }
1878 void CSignature::AppendWordPointer(CStem* pWord ) { m_WordPtrList->append(pWord); }
1879 void CSignature::AppendPrefixPtr(CPrefix* pPrefix) { m_PrefixPtrList->append (pPrefix);}
1880 int CSignature::GetNumberOfWords() const
1882 return m_WordPtrList->count();
1885 // <<-------------------------------------------------------------------------------------------------------->>
1886 CParse CSignature::CreateADeletingSignature( CParse& Deletee, CMiniLexicon* Lexicon )
1888 CStringSurrogate ssSuffix;
1891 CParse PSuffix,
1892 NewSig,
1893 Suffix;
1894 CSuffix* pSuffix;
1895 QString Null = "NULL", lt_brak = "<", rt_brak = "<";
1898 Q_ASSERT (Deletee.Size() == 1);
1900 for (int affixno = 1; affixno <= Size(); affixno++)
1902 ssSuffix = GetPiece(affixno);
1903 if(NewSig.GetSortStyle() != eAlphabetized ) NewSig.Alphabetize();
1904 if ( ssSuffix == Deletee )
1906 NewSig.Append ( CStringSurrogate(Null.unicode(),0,Null.length() ) );
1908 else
1910 PSuffix = CStringSurrogate(lt_brak.unicode(),0,1);
1911 PSuffix += Deletee;
1912 PSuffix += CStringSurrogate(rt_brak.unicode(),0,1);
1913 PSuffix.ClearParseStructure();
1914 PSuffix += ssSuffix;
1915 NewSig.Append ( PSuffix.GetKey() );
1917 pSuffix = *Lexicon->GetSuffixes() << PSuffix;
1919 QString line = "<" + Deletee.Display() + ">" + ssSuffix.Display();
1920 Suffix = CStringSurrogate( line.unicode(),0,line.length());
1922 NewSig.Append (Suffix.GetKey());
1923 // Lexicon->SetSuffixTranslation(this, ssSuffix, Suffix);
1927 return NewSig;
1930 // <<-------------------------------------------------------------------------------------------------------->>
1931 bool CSignature::RemoveStem(CStem * pStem )
1933 return m_StemPtrList->remove( pStem );
1935 // <<-------------------------------------------------------------------------------------------------------->>
1938 bool CSignature::RemoveWord(CStem* pWord)
1940 return m_WordPtrList->remove( pWord );
1942 // <<-------------------------------------------------------------------------------------------------------->>
1943 // copy out affixes, with null affix replaced with "NULL",
1944 // possibly with deletees marked with angle brackets
1945 CSignature& CSignature::Express(CSignature& Output, bool bDisplayDeletees)
1947 CSuffixCollection* Suffixes = 0;
1948 CPrefixCollection* Prefixes = 0;
1949 if (!is_initial(GetAffixLocation()))
1950 Suffixes = GetSignatureCollection()->GetMySuffixes();
1951 else
1952 Prefixes = GetSignatureCollection()->GetMyPrefixes();
1954 Output.ClearParse();
1956 for (int affixno = 1; affixno <= Size(); ++affixno) {
1957 CStringSurrogate affix_text = GetPiece(affixno);
1959 if (affix_text.IsNULL()) {
1960 Output.Append(TheStringNULL);
1961 continue;
1963 if (!is_initial(m_AffixLocation)) {
1964 CSuffix* suffix = *Suffixes ^= affix_text;
1965 Q_ASSERT(suffix != 0);
1967 CParse Temp;
1968 Output.Append(
1969 suffix->Express(Temp, bDisplayDeletees));
1970 } else {
1971 CPrefix* prefix = *Prefixes ^= affix_text;
1972 Q_ASSERT(prefix != 0);
1974 CParse Temp;
1975 Output.Append(
1976 prefix->Express(Temp, bDisplayDeletees));
1979 return Output;
1981 // <<-------------------------------------------------------------------------------------------------------->>
1982 /// concatenate affixes, separated by -.
1983 QString CSignature::Express(bool bDisplayDeletees)
1985 CSuffixCollection* Suffixes = 0;
1986 CPrefixCollection* Prefixes = 0;
1987 if (!is_initial(GetAffixLocation()))
1988 Suffixes = GetSignatureCollection()->GetMySuffixes();
1989 else
1990 Prefixes = GetSignatureCollection()->GetMyPrefixes();
1992 QString Outstring;
1993 for (int affixno = 1; affixno <= Size(); ++affixno) {
1994 CStringSurrogate affix_text = GetPiece(affixno);
1996 if (affix_text.IsNULL()) {
1997 if (!Outstring.isEmpty())
1998 Outstring.append('-');
1999 Outstring.append(TheStringNULL);
2000 continue;
2003 if (is_initial(m_AffixLocation)) {
2004 CPrefix* prefix = *Prefixes ^= affix_text;
2005 Q_ASSERT(prefix != 0);
2006 if (!Outstring.isEmpty())
2007 Outstring.append('-');
2009 CParse Temp;
2010 Outstring.append(prefix->Express(Temp,
2011 bDisplayDeletees).Display());
2012 } else {
2013 CSuffix* suffix = *Suffixes ^= affix_text;
2014 Q_ASSERT(suffix != 0);
2015 if (!Outstring.isEmpty())
2016 Outstring.append('-');
2018 CParse Temp;
2019 Outstring.append(suffix->Express(Temp,
2020 bDisplayDeletees).Display());
2024 return Outstring;
2026 // <<-------------------------------------------------------------------------------------------------------->>
2028 // this should probably be replaced by ComputeDLofModel
2030 double CSignature::ComputeDL( int char_count )
2032 CStem* pStem;
2033 CAffix* pAffix;
2035 CStringSurrogate Affix;
2037 bool CORPUS_BASED_AFFIX_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedAffixCount", 0 );
2038 bool CORPUS_BASED_STEM_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedStemCount", 1 );
2040 double stems_dl = 0.0,
2041 affixes_dl = 0.0;
2043 uint stem_total = 0,
2044 affix_total = 0;
2046 if( CORPUS_BASED_STEM_COUNT )
2048 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2050 stems_dl += ( (double) -1 ) * base2log( (double) pStem->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2053 else
2055 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2057 stems_dl = ( (double) -1 ) * base2log( (double) pStem->GetWordPtrList()->count() / (double) m_pMyMini->GetWords()->GetCount() );
2061 bool analyzedSuffixes = TRUE;
2062 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2064 int i;
2065 if( !CORPUS_BASED_AFFIX_COUNT )
2067 for( i = 1; i <= m_PieceCount; i++ )
2069 Affix = GetPiece(i);
2071 if( analyzedSuffixes )
2073 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2075 else
2077 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2080 if( pAffix ) affix_total += pAffix->GetCorpusCount();
2084 for( i = 1; i <= m_PieceCount; i++ )
2086 Affix = GetPiece(i);
2088 if( analyzedSuffixes )
2090 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2092 else
2094 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2097 if( pAffix )
2099 if( CORPUS_BASED_AFFIX_COUNT ) affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2100 else affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) affix_total );
2104 return stems_dl + affixes_dl;
2107 // <<-------------------------------------------------------------------------------------------------------->>
2108 //====================================================================//
2109 // Description Length //
2110 //====================================================================//
2111 double CSignature::GetDLofMyAffixPointers( )
2113 if (m_DLofMyAffixPointers == 0)
2115 bool analyzedSuffixes = TRUE;
2116 CSuffix * pSuffix;
2117 CPrefix* pPrefix;
2118 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2119 if (analyzedSuffixes)
2121 for (int suffixno = 0; suffixno < GetSuffixPtrList()->size(); suffixno++)
2122 { pSuffix= GetSuffixPtrList()->at(suffixno);
2123 m_DLofMyAffixPointers += pSuffix->GetLengthOfPointerToMe ();
2126 else
2128 for (int prefixno = 0; prefixno < GetPrefixPtrList()->size(); prefixno++)
2130 pPrefix= GetPrefixPtrList()->at(prefixno);
2131 m_DLofMyAffixPointers += pPrefix->GetLengthOfPointerToMe ();
2135 return m_DLofMyAffixPointers;
2137 // <<-------------------------------------------------------------------------------------------------------->>
2138 double CSignature::GetDLofMyStemPointers()
2140 if (m_DLofMyStemPointers == 0)
2142 CStem * pStem;
2143 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
2145 pStem = GetStem(stemno);
2146 m_DLofMyStemPointers += pStem->GetLengthOfPointerToMe ();
2149 return m_DLofMyStemPointers;
2151 // <<-------------------------------------------------------------------------------------------------------->>
2152 double CSignature::ComputeDLofModel(int /* char_count, not used */)
2154 // XXX. take SignatureDL\CorpusBased{Stem,Affix}Count parameters
2155 // into account
2157 m_DLofMyStemPointers = GetDLofMyStemPointers();
2158 m_DLofMyAffixPointers = GetDLofMyAffixPointers();
2159 return m_DLofMyStemPointers + m_DLofMyAffixPointers;
2161 // <<-------------------------------------------------------------------------------------------------------->>
2162 double CSignature::ComputeDLofMyCorpus()
2164 using linguistica::implicit_cast;
2166 if (m_pMyMini == 0)
2167 return 0.0;
2169 m_DLofMyCorpus = 0.0;
2170 foreach (CStem* pWord, *m_WordPtrList) {
2171 CStringSurrogate stem_text = pWord->GetStem();
2172 CStem* stem = *m_pMyMini->GetStems() ^= stem_text;
2174 /***** DEBUG******/
2175 if(stem==NULL)
2177 std::cout << "NULL stem -- in CSignature::ComputeDLofMyCorpus() "<< std::endl;
2178 std::cout << " word: "<<pWord->Display().toStdString()<< std::endl;
2179 std::cout << " stem: "<< stem_text.Display().toStdString()<<std::endl;
2180 CStringSurrogate afx_str
2181 = (is_initial(m_AffixLocation) ? pWord->GetPrefix() : pWord->GetSuffix());
2182 std::cout << " affix:"<< afx_str.Display().toStdString() << std::endl;
2183 std::cout << std::endl;
2184 Q_ASSERT(stem);
2186 /* end DEBUG-s.w.*/
2188 CStringSurrogate affix_text = is_initial(m_AffixLocation)
2189 ? pWord->GetPrefix()
2190 : pWord->GetSuffix();
2191 if (affix_text.GetLength() == 0)
2192 affix_text = TheStringNULL;
2194 CAffix* affix = is_initial(m_AffixLocation)
2195 ? implicit_cast<CAffix*>(
2196 *m_pMyMini->GetPrefixes() ^= affix_text)
2197 : implicit_cast<CAffix*>(
2198 *m_pMyMini->GetSuffixes() ^= affix_text);
2200 CStem* word = *m_pMyMini->GetWords() ^= pWord;
2201 const double ThisWordDL =
2202 stem->GetLengthOfPointerToMe() +
2203 affix->GetLengthOfPointerToMe();
2204 m_DLofMyCorpus += word->GetCorpusCount() * ThisWordDL;
2206 return m_DLofMyCorpus;
2208 // <<-------------------------------------------------------------------------------------------------------->>
2210 namespace {
2211 /// Get the corpus counts of each suffix with this stem
2212 int* GetSuffixCounts(CStem* stem, int* output)
2214 if (output) delete output; // error if this occurs.
2215 output = new int[ stem->GetNumberOfSuffixes() ];
2217 for (int i = 1; i <= stem->GetSuffixList()->Size(); ++i) {
2218 QString Suffix = stem->GetSuffixList()->GetPiece(i).Display();
2219 if (Suffix == "NULL")
2220 Suffix = "";
2221 QString Word = stem->Display() + Suffix;
2222 CStem* pWord = *stem->GetMyMini()->GetWords() ^=
2223 CStringSurrogate(Word);
2225 output[i-1] = pWord->GetCorpusCount();
2227 return output;
2231 //the output is a vector of integers, whose length is
2232 // the number of stems times the number of suffixes. Pass it
2233 // an int pointer that points to NULL; it will delete the memory
2234 // that this function creates.
2235 int* CSignature::GetIndividualCountsForEachStem (int* output )
2237 int affixno, stemno;
2238 int* temp = NULL;
2239 CStem* pStem;
2241 if (output) delete output; //if this occurs, it's an error.
2242 output = new int [GetNumberOfStems() * GetNumberOfAffixes() ];
2244 CMiniLexicon* pMiniLexicon = GetLexicon();
2245 NOT FINISHED YET _--- use GETaWord -- JG
2246 for (stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2248 pSt em = m_StemPtrList->at(stemno);
2249 temp = GetSuffixCounts(pStem, temp);
2250 for (affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
2252 output[stemno * GetNumberOfAffixes() + affixno] = temp[affixno];
2254 delete temp;
2255 temp = NULL;
2257 return output;
2261 //===================================================================================================//
2263 // Description length
2265 //===================================================================================================//
2266 double CSignature::GetSumOfDLofInternalPointers()
2269 double StemTotal = 0, SuffixTotal = 0;
2270 CStem* pStem;
2271 CSuffix* pSuffix;
2272 CSS ssSuffix;
2273 CSuffixCollection& Suffixes = *m_pMyMini->GetSuffixes();
2274 for (int stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2276 pStem = m_StemPtrList->at(stemno);
2277 StemTotal += pStem->GetLengthOfPointerToMe_2 ();
2280 for (int affixno = 1; affixno <= GetNumberOfAffixes(); affixno++)
2282 ssSuffix = GetPiece(affixno);
2283 pSuffix = Suffixes ^= ssSuffix;
2284 SuffixTotal += pSuffix->GetLengthOfPointerToMe();
2286 return StemTotal + SuffixTotal;
2288 // <<-------------------------------------------------------------------------------------------------------->>
2290 void CSignature::SetLengthOfPointerToMe(double L)
2292 m_LengthOfPointerToMe = L;
2293 return;
2296 // <<-------------------------------------------------------------------------------------------------------->>
2298 void CSignature::AppendSatelliteAffix(CParse& suffix)
2301 m_SatelliteAffixes.Append(suffix);
2304 //===================================================================================================//
2306 // Allomorphy
2308 //===================================================================================================//
2309 bool CSignature::Generalizes(CSignature* pSig)
2311 struct not_implemented { };
2312 throw not_implemented();
2314 // 1. Check they have the same length; find which one is longer.
2315 // 2. Go from longest to shortest pieces of the longer signature:
2316 // look for unambiguous correspondents in the other signature, and
2317 // put those pairs of corresponding affixes in some structure.
2318 // 3. After unambiguous cases, deal with ambiguous cases, if any exist.
2319 // 4. Find alignment
2321 // ed |NULL | NULL | ed |
2322 // ing|NULL | NULL | ing |
2323 // es |e | NULL | s |
2324 // e |e | NULL | NULL|
2327 // ed |e | <e> | ed |
2328 // ing|e | <e> | ing |
2329 // es |e | NULL | s |
2330 // e |e | NULL | NULL|
2333 // ien |ien | NULL | NULL |
2334 // ienne |ienn | NULL | e |
2335 // iens |ien | NULL | s |
2336 // iennes |ienn | NULL | es |
2338 // ien |ien | NULL | NULL |
2339 // ienne |ien | n | e |
2340 // iens |ien | NULL | s |
2341 // iennes |ien | n | es |
2343 CSignature* LongerSig, *ShorterSig;
2345 struct Row {
2346 QString LongAffix;
2347 QString Extension;
2348 QString Operation;
2349 QString ShortAffix;
2352 if (Size() != pSig->Size())
2353 return false;
2355 const int dif = GetKeyLength() - pSig->GetKeyLength();
2356 if (dif > 0) {
2357 LongerSig = this; ShorterSig = pSig;
2358 } else if (dif == 0) {
2359 return false;
2360 } else {
2361 LongerSig = pSig; ShorterSig = this;
2364 const int MAXAFFIXSIZE = 10;
2366 QStringList ShorterSigPieces;
2369 // Copy the affixes of ShorterSig,
2370 // from shortest to longest
2371 // onto the list ShorterSigPieces.
2372 if (ShorterSig->ContainsNULL())
2373 ShorterSigPieces.append(TheStringNULL);
2374 for (int m = 1; m < MAXAFFIXSIZE &&
2375 ShorterSigPieces.count() < ShorterSig->Size();
2376 ++m) {
2377 // XXX. this test makes no sense
2378 if (ShorterSig->ThisPieceLength(m) == m)
2379 ShorterSigPieces.prepend(
2380 ShorterSig->GetPiece(m).Display());
2382 Q_ASSERT(ShorterSigPieces.count() == ShorterSig->Size());
2385 QStringList LongerSigPieces;
2387 // Copy the affixes of LongerSig,
2388 // from shortest to longest
2389 // onto the list LongerSigPieces.
2390 if (LongerSig->ContainsNULL())
2391 LongerSigPieces.append(TheStringNULL);
2392 for (int m = 1; m < MAXAFFIXSIZE &&
2393 LongerSigPieces.count() < LongerSig->Size();
2394 ++m)
2395 if (LongerSig->ThisPieceLength(m) == m)
2396 LongerSigPieces.prepend(
2397 LongerSig->GetPiece(m).Display());
2398 Q_ASSERT(LongerSigPieces.count() == LongerSig->Size());
2401 CStringSurrogate ssIng, ssTing;
2402 foreach (QString shortersig_piece, ShorterSigPieces) {
2403 // example: "ing"
2404 CStringSurrogate short_affix(shortersig_piece);
2405 bool match = false;
2406 foreach (QString longersig_piece, LongerSigPieces) {
2407 // example "ting"
2408 CStringSurrogate long_affix(longersig_piece);
2409 if (long_affix.IsNULL())
2410 continue;
2411 if (short_affix != long_affix.Right(
2412 short_affix.GetLength()))
2413 continue;
2414 bool unambiguous_match = !match;
2415 if (!match)
2416 match = true;
2418 if (!unambiguous_match)
2419 continue;
2421 Row ThisRow;
2422 ThisRow.LongAffix =
2423 long_affix.Display();
2424 ThisRow.ShortAffix =
2425 short_affix.Display();
2426 ThisRow.Extension = long_affix.Left(
2427 long_affix.GetLength() -
2428 short_affix.GetLength())
2429 .Display();
2430 // XXX. use ThisRow...
2431 static_cast<void>(ThisRow);
2434 return false;
2436 // <<-------------------------------------------------------------------------------------------------------->>
2437 // <<-------------------------------------------------------------------------------------------------------->>
2438 void CSignature::CutMyWordsAsIDeclare()
2439 { CStem* stem;
2441 if ( is_initial (GetAffixLocation()) )
2443 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
2444 stem = GetStem(stemno);
2446 // For each prefix in signature:
2447 for (int prefixno = 1; prefixno <= Size(); ++prefixno) {
2448 CStringSurrogate prefix = GetPiece(prefixno);
2450 prefix.SetBackwards(false);
2451 if (prefix.IsNULL())
2452 // NULL + stem prefix needs no cut
2453 continue;
2455 // get correspond word
2456 CParse word_text = prefix + stem->GetKey();
2457 CStem* word = *GetLexicon()->GetWords() ^= word_text;
2458 Q_ASSERT(word != 0);
2460 if (word->Size() > 1 )
2461 // already analyzed
2462 continue;
2463 GetLexicon()->LogFile ("", "", word->GetKey().Display());
2465 // analyze word
2466 const int cut_point = word->GetKeyLength() - stem->GetKeyLength();
2467 word->CutRightBeforeHere(cut_point);
2468 word->SetStemLoc(2);
2469 word->SetPrefixLoc(1);
2470 //m_pLexicon->UpdateWord(word);
2474 else
2476 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++) {
2477 stem = GetStem(stemno);
2479 // For each affix in signature:
2480 for (int suffixno = 1; suffixno <= Size(); ++suffixno) {
2481 CStringSurrogate suffix = GetPiece(suffixno);
2483 suffix.SetBackwards(false);
2484 if (suffix.IsNULL())
2485 // stem + NULL suffix needs no cut
2486 continue;
2488 // get correspond word
2489 CParse word_text = stem->GetKey() + suffix;
2490 CStem* word = *GetLexicon()->GetWords() ^= word_text;
2491 Q_ASSERT(word != 0);
2493 if (word->Size() > 1 )
2494 // already analyzed
2495 continue;
2496 GetLexicon()->LogFile ("", "", word->GetKey().Display());
2498 // analyze word
2499 const int cut_point = word->GetKeyLength() - stem->GetKeyLength();
2500 word->CutRightBeforeHere(cut_point);
2501 word->SetStemLoc(1);
2502 //m_pLexicon->UpdateWord(word);
2508 void CSignature::OutputSignatureXfst( QTextStream& outf, int count)
2510 QString strOutput;
2511 CParse StemList;
2512 QString string;
2514 outf << endl;
2516 outf << "# " << count << ": " << Display('.', m_pMyMini->GetOutFilter()) << endl;
2517 if (this->GetMentorList()->count() > 0)
2518 outf << "# MentorList() size: " << this->GetMentorList()->count() << endl;
2519 else
2520 outf << "# No MentorList() items" << endl;
2522 outf << "# robustness: " << m_Robustness << endl;
2524 if( GetMentor()!=NULL )
2526 outf << "# Has mentor: skipping" << endl;
2527 return;
2530 outf << "define STEM" << count << " "; // << " \\" << endl;
2532 //added
2533 QStringList stems;
2534 for (int i = 0; i < this->GetNumberOfStems(); i++)
2536 stems.append( this->GetStem(i)->Display() );
2539 // add stems from child sigs
2541 for (int z = 0; z < this->GetMentorList()->size(); z++)
2543 CSignature * qSig = this->GetMentorList()->at(z);
2545 QStringList qSufList;
2546 for (int i = 0; i < qSig->GetNumberOfAffixes(); i++)
2547 qSufList.append(qSig->GetSuffix(i)->Display());
2549 //generate new words here:
2550 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2552 outf << endl;
2553 CSuffix* pSuf = this->GetSuffix(i);
2554 QString sufStr = pSuf->Display( 0 );//, m_pMyMini->GetOutFilter() );
2555 if ( !qSufList.contains(sufStr) )
2557 outf<< "#### Suffix to be expanded: "<< sufStr << endl;
2558 for (int j = 0; j < qSig->GetNumberOfStems(); j++)
2560 QString stemStr = qSig->GetStem(j)->Display();
2561 if (sufStr.compare("NULL") == 0)
2562 outf << "### "<< stemStr << endl;
2563 else
2564 outf << "### "<< stemStr << " " << sufStr << endl;
2570 // add stems from child sigs
2571 for (int z = 0; z < this->GetMentorList()->size(); z++)
2573 CSignature * qSig = this->GetMentorList()->at(z);
2574 for (int i = 0; i < qSig->GetNumberOfStems(); i++)
2576 stems.append( qSig->GetStem(i)->Display( 0, m_pMyMini->GetOutFilter() ) );
2580 stems.sort();
2581 int m = 1;
2583 QStringList::Iterator strIt = stems.begin();
2584 outf << "[ {" << *strIt << "} ";
2585 ++strIt;
2588 for( ; strIt != stems.end(); ++strIt )
2590 if( m % 5 == 0 )
2592 outf << endl;
2593 outf << " ";
2595 outf << "| {" << *strIt << "} ";
2596 m++;
2599 outf << "]; "<<endl;
2600 outf << "define SUF" << count << " [ ";
2601 QStringList suffixes;
2602 bool first = 1;
2604 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2606 CSuffix* pSuffix = this->GetSuffix(i);
2607 if(first)
2608 first=0;
2609 else
2610 outf << "|";
2611 QString str = pSuffix->Display( 0 );
2612 if (str.compare("NULL") == 0)
2613 outf << " 0 ";
2614 else
2615 outf << " {" << str << "} ";
2618 outf << "];" << endl;
2620 outf << "define SIG" << count << " STEM" << count << " SUF"<< count << ";" << endl ;
2622 outf << "push SIG"<< count << endl;
2624 /* TEMP SOLN: now write cross product in comments */
2625 for ( QStringList::Iterator strIt = stems.begin() ; strIt != stems.end(); ++strIt )
2627 //QList<CSuffix*>::iterator suffix_it = m_SuffixPtrList->begin();
2628 //CSuffix* pSuffix;
2629 //while ( (pSuffix = *suffix_it) != 0 )
2631 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2633 CSuffix* pSuffix = this->GetSuffix(i);
2634 QString str = pSuffix->Display( 0 );//, m_pMyMini->GetOutFilter() );
2635 if (str.compare("NULL") == 0)
2636 outf << "## "<< *strIt << endl;
2637 else
2638 outf << "## "<< *strIt << str << endl;
2645 //--------------------------------------------------------------------------//
2646 void CSignature::RecalculateStemAndWordPointers()
2647 //--------------------------------------------------------------------------//
2650 for (int stemno = 0; stemno < GetNumberOfStems(); stemno++)
2652 QString stem = GetStem(stemno)->Display();
2653 switch (m_AffixLocation)
2655 case WORD_FINAL:
2656 case STEM_FINAL:
2657 for (int suffixno = 0; suffixno < GetNumberOfAffixes(); suffixno++)
2659 QString suffix = GetSuffix(suffixno)->Display();
2660 if (suffix == "NULL") suffix = "";
2661 QString word = stem + suffix;
2662 CStem* pWord = *GetLexicon()->GetWords() ^= word;
2663 AppendWordPointer( pWord);
2665 break;
2666 case WORD_INITIAL:
2667 case STEM_INITIAL:
2668 for (int prefixno = 0; prefixno < GetNumberOfAffixes(); prefixno++)
2670 QString prefix = GetPrefix(prefixno)->Display();
2671 if (prefix == "NULL") prefix = "";
2672 QString word = prefix + stem;
2673 CStem* pWord = *GetLexicon()->GetWords() ^= word;
2674 AppendWordPointer(pWord);
2677 } // end of stemno loop
2679 //--------------------------------------------------------------------------//