1 // Implementation of CSignature, CSignatureListViewItem methods
2 // Copyright © 2009 The University of Chicago
7 #include "linguisticamainwindow.h"
9 #include "MiniLexicon.h"
10 #include "LPreferences.h"
11 #include "CorpusWord.h"
15 #include "SignatureCollection.h"
16 #include "SuffixCollection.h"
17 #include "PrefixCollection.h"
18 #include "WordCollection.h"
19 #include "StemCollection.h"
20 #include "SparseIntVector.h"
21 #include "CompareFunc.h"
25 #include "implicit_cast.h"
27 bool stemlessthan(const QPair
<CStem
*, int> pair1
, const QPair
<CStem
*, int> pair2
);
29 bool stemlessthan(const QPair
<CStem
*, int> pair1
, const QPair
<CStem
*, int> pair2
)
31 return pair2
.second
< pair1
.second
;
35 //===================================================================================================//
37 // Signature listview item
39 //===================================================================================================//
// Builds a signature row whose parent is a Q3ListView. The `signature` string
// is handed to the Q3ListViewItem base constructor and the list view itself is
// remembered in m_parentlist.
// NOTE(review): no line visible here stores `mini`, `pSig` or `filter`;
// presumably that happens in initializers not shown in this listing - confirm.
40 CSignatureListViewItem::CSignatureListViewItem(Q3ListView
*parent
,
41 QString signature
, int mini
, CSignature
* pSig
,
42 QMap
<QString
, QString
>* filter
)
43 : Q3ListViewItem( parent
, signature
)
48 m_parentlist
= parent
;
// Builds a signature row nested under another Q3ListViewItem. The `signature`
// string is handed to the base constructor and m_parentlist is taken from the
// parent item's listView().
// NOTE(review): `parent` is dereferenced without a visible null check, and no
// visible line stores `mini`, `pSig` or `filter` - confirm against the full file.
53 CSignatureListViewItem::CSignatureListViewItem(Q3ListViewItem
*parent
,
54 QString signature
, int mini
, CSignature
* pSig
,
55 QMap
<QString
, QString
>* filter
)
56 : Q3ListViewItem( parent
, signature
)
61 m_parentlist
= parent
->listView();
// Sort comparator for signature rows. The visible branches compare this row's
// signature with `item`'s signature by ComputeDLofModel(), GetCorpusCount(),
// GetNumberOfStems() and GetRobustness(); the last statement defers to
// Q3ListViewItem::compare().
// NOTE(review): which column selects which branch is not visible in this listing.
// NOTE(review): the robustness comparison passes `item` first and this row
// second, the reverse of the other three - presumably deliberate; confirm.
// NOTE(review): `item` is C-cast to CSignatureListViewItem* without a type check,
// and neither m_signature nor item's GetSignature() has a visible null test.
65 int CSignatureListViewItem::compare(Q3ListViewItem
*item
, int col
, bool asc
) const
70 return MakeComparable ( m_signature
->ComputeDLofModel() , ((CSignatureListViewItem
*) item
)->GetSignature()->ComputeDLofModel() );
74 return MakeComparable ( m_signature
->GetCorpusCount() , ((CSignatureListViewItem
*) item
)->GetSignature()->GetCorpusCount() );
78 return MakeComparable ( m_signature
->GetNumberOfStems() , ((CSignatureListViewItem
*) item
)->GetSignature()->GetNumberOfStems() );
82 return MakeComparable ( ((CSignatureListViewItem
*) item
)->GetSignature()->GetRobustness(), m_signature
->GetRobustness() );
86 return Q3ListViewItem::compare(item
, col
, asc
);
// Produces the cell text for `column`. Visible branches return: " : " + m_label
// for a mentored signature when the list is sorted on column 0; the first
// stem's display form; ComputeDLofModel(); GetCorpusCount(); GetNumberOfStems();
// GetRemark(); GetRobustness() truncated to int. When this row has no usable
// signature, the count branches walk the child rows (firstChild()/nextSibling())
// and sum the children's values. Anything else falls back to
// Q3ListViewItem::text().
// NOTE(review): the column-to-branch mapping, the declarations of `dummy` and
// `count`, and the loop guards around the child walks are not visible here.
91 QString
CSignatureListViewItem::text( int column
) const
95 CSignatureListViewItem
* child
= NULL
;
104 if( m_signature
&& m_parentlist
->sortColumn() == 0 && m_signature
->GetMentor() )
106 return " : " + m_label
;
110 if( m_signature
&& m_signature
->GetNumberOfStems() > 0 )
112 if (m_signature
->GetNumberOfStems() > 0 ) return m_signature
->GetStem(0)->Display( QChar(0), m_filter
);
116 if( m_signature
) return dummy
.setNum( m_signature
->ComputeDLofModel() );
119 if( m_signature
) return dummy
.setNum ( m_signature
->GetCorpusCount() );
123 child
= (CSignatureListViewItem
*) firstChild();
126 if( child
->GetSignature() )
128 count
+= child
->GetSignature()->GetCorpusCount();
130 child
= (CSignatureListViewItem
*) child
->nextSibling();
132 return dummy
.setNum( count
);
135 if( m_signature
&& m_signature
->GetNumberOfStems() > 0 ) return dummy
.setNum( m_signature
->GetNumberOfStems() );
139 child
= (CSignatureListViewItem
*) firstChild();
142 if( child
->GetSignature() &&
143 child
->GetSignature()->GetNumberOfStems() > 0 )
145 count
+= child
->GetSignature()->GetNumberOfStems();
147 child
= (CSignatureListViewItem
*) child
->nextSibling();
149 return dummy
.setNum( count
);
152 if( m_signature
) return m_signature
->GetRemark();
156 if( m_signature
) return dummy
.setNum( (int) m_signature
->GetRobustness() );
// NOTE(review): this fallback sums the children's GetNumberOfStems() although
// the single-signature case just above reports GetRobustness(); it looks like
// a copy of the stem-count branch - confirm which quantity is intended.
161 child
= (CSignatureListViewItem
*) firstChild();
164 if( child
->GetSignature() &&
165 child
->GetSignature()->GetNumberOfStems() > 0 )
167 count
+= child
->GetSignature()->GetNumberOfStems();
169 child
= (CSignatureListViewItem
*) child
->nextSibling();
171 return dummy
.setNum( count
);
176 return Q3ListViewItem::text( column
);
180 //===================================================================================================//
184 //===================================================================================================//
// Adds a plain Q3ListViewItem to `List` showing this signature
// (Display('.', filter)) together with a `source` string. `source` starts as
// "Unknown" and is set to a 1-based mini-lexicon number while looping over the
// mini-lexicons reachable through m_pMyMini.
// NOTE(review): the condition under which `source` is overwritten (and any use
// of `sigs`) is not visible in this listing.
185 void CSignature::BorrowedSigsDisplay(Q3ListView
* List
,
186 QMap
<QString
, QString
>* filter
)
188 QString source
= "Unknown", dummy
;
189 for (int minino
= 0; minino
< m_pMyMini
->GetMiniSize(); ++minino
) {
190 CMiniLexicon
* mini
= m_pMyMini
->GetMiniLexicon(minino
);
194 CSignatureCollection
& sigs
= *mini
->GetSignatures();
197 source
= dummy
.setNum(minino
+ 1);
// The item is created with List as its parent; the pointer is not kept.
202 static_cast<void>(new Q3ListViewItem(
203 List
, Display('.', filter
), source
));
206 //===================================================================================================//
208 // Constructor/destructor
210 //===================================================================================================//
// Constructs an empty signature attached to `Lexicon`: allocates the stem,
// word, mentor, suffix and prefix pointer lists, selects alphabetized sorting,
// zeroes the three description-length fields, clears the generalizer, and
// copies the affix location from Lexicon when it is non-null.
// NOTE(review): when Lexicon is null no visible line assigns m_AffixLocation.
212 CSignature::CSignature( CMiniLexicon
* Lexicon
) : CLParse( Lexicon
)
216 m_StemPtrList
= new QList
<CStem
*>();
217 m_WordPtrList
= new QList
<CStem
*>();
218 m_MentorList
= new QList
<CSignature
*>();
219 m_SuffixPtrList
= new QList
<CSuffix
*>();
220 m_PrefixPtrList
= new QList
<CPrefix
*>();
221 m_SortStyle
= eAlphabetized
;
222 // Description Length
224 m_DLofMyStemPointers
= 0;
225 m_DLofMyAffixPointers
= 0;
226 m_LengthOfPointerToMe
= 0;
227 m_MyGeneralizer
= NULL
;
231 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
// Constructs an empty signature with an explicit affix location. Allocates the
// five pointer lists, selects alphabetized sorting, clears the generalizer and
// zeroes the description-length fields.
// NOTE(review): m_AffixLocation is first set from `AffixLocation` and then
// overwritten with Lexicon->GetAffixLocation() whenever Lexicon is non-null, so
// the explicit argument only takes effect for a null Lexicon - confirm intent.
235 CSignature::CSignature( eAffixLocation AffixLocation
, CMiniLexicon
* Lexicon
) : CLParse( Lexicon
)
238 m_StemPtrList
= new QList
<CStem
*>();
239 m_WordPtrList
= new QList
<CStem
*>();
240 m_MentorList
= new QList
<CSignature
*>();
241 m_SuffixPtrList
= new QList
<CSuffix
*>();
242 m_PrefixPtrList
= new QList
<CPrefix
*>();
243 m_SortStyle
= eAlphabetized
;
244 m_MyGeneralizer
= NULL
;
245 m_AffixLocation
= AffixLocation
;
248 // Description Length
250 m_DLofMyStemPointers
= 0;
251 m_DLofMyAffixPointers
= 0;
252 m_LengthOfPointerToMe
= 0;
253 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
// Constructs a signature whose parse content is copied from `ParseSig` (via the
// CLParse base). Allocates the five pointer lists, selects alphabetized
// sorting, clears the generalizer and zeroes the description-length fields.
// NOTE(review): the first statement dereferences Lexicon unconditionally, while
// the last statement repeats the same assignment behind a null test; a null
// Lexicon would fail at the first one - confirm whether null is ever passed.
257 CSignature::CSignature (const CParse
& ParseSig
, CMiniLexicon
* Lexicon
) : CLParse ( ParseSig
, Lexicon
)
260 m_AffixLocation
= Lexicon
->GetAffixLocation();
261 m_StemPtrList
= new QList
<CStem
*>();
262 m_WordPtrList
= new QList
<CStem
*>();
263 m_MentorList
= new QList
<CSignature
*>();
264 m_SuffixPtrList
= new QList
<CSuffix
*>();
265 m_PrefixPtrList
= new QList
<CPrefix
*>();
266 m_SortStyle
= eAlphabetized
;
268 m_MyGeneralizer
= NULL
;
269 // Description Length
271 m_DLofMyStemPointers
= 0;
272 m_DLofMyAffixPointers
= 0;
273 m_LengthOfPointerToMe
= 0;
274 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
// Pointer flavour of the CParse-based constructor: the parse content is copied
// from *pParseSig (via the CLParse base). Allocates the five pointer lists,
// selects alphabetized sorting, clears the generalizer and zeroes the
// description-length fields.
// NOTE(review): pParseSig is dereferenced in the initializer list without a
// null check, and Lexicon is dereferenced unconditionally before the final
// null-guarded assignment of the same value.
278 CSignature::CSignature (const CParse
* pParseSig
, CMiniLexicon
* Lexicon
) : CLParse ( *pParseSig
, Lexicon
)
282 m_StemPtrList
= new QList
<CStem
*>();
283 m_WordPtrList
= new QList
<CStem
*>();
284 m_MentorList
= new QList
<CSignature
*>();
285 m_SuffixPtrList
= new QList
<CSuffix
*>();
286 m_PrefixPtrList
= new QList
<CPrefix
*>();
287 m_SortStyle
= eAlphabetized
;
288 m_MyGeneralizer
= NULL
;
289 m_AffixLocation
= Lexicon
->GetAffixLocation();
291 // Description Length
293 m_DLofMyStemPointers
= 0;
294 m_DLofMyAffixPointers
= 0;
295 m_LengthOfPointerToMe
= 0;
296 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
// Copy constructor. Copies affix location, remark, lexicon, generalizer, total
// count and robustness from `Sig`, then rebuilds the stem list, the suffix or
// prefix list (depending on m_AffixLocation) and the word list from Sig's
// accessors. Sorting is reset to alphabetized and the mentor list starts empty.
// NOTE(review): the six `QVector<double> m_...` statements below are local
// declarations; they appear to shadow the members of the same names used by
// other methods, so the counts/frequencies written in the loops would land in
// temporaries that vanish at the end of the constructor - confirm and fix.
// NOTE(review): only one of m_SuffixPtrList / m_PrefixPtrList is allocated on
// the visible lines, yet the destructor tests and deletes both - check that the
// other pointer is initialised somewhere.
// NOTE(review): word counts are set to 0 here rather than copied from Sig.
300 CSignature::CSignature(const CSignature
& Sig
) : CLParse (Sig
, Sig
.GetLexicon())
304 m_AffixLocation
= Sig
.GetAffixLocation();
305 m_Remark
= Sig
.GetRemark();
306 m_pMyMini
= Sig
.GetLexicon();
307 m_MyGeneralizer
= Sig
.GetGeneralizer();
309 int NumberOfStems
= Sig
.GetNumberOfStems();
310 int NumberOfAffixes
= Sig
.Size();
311 int NumberOfWords
= NumberOfStems
*NumberOfAffixes
;
312 QVector
<double> m_WordCounts (NumberOfAffixes
* NumberOfStems
);
313 QVector
<double> m_StemCounts ( NumberOfStems
);
314 QVector
<double> m_AffixCounts( NumberOfAffixes
);
315 QVector
<double> m_WordFrequencies (NumberOfWords
);
316 QVector
<double> m_StemFrequencies (NumberOfStems
);
317 QVector
<double> m_AffixFrequencies (NumberOfAffixes
);
318 m_TotalCount
= Sig
.GetTotalCount();
320 m_StemPtrList
= new QList
<CStem
*>();
321 for ( stemno
= 0; stemno
< NumberOfStems
; stemno
++)
323 AppendStemPtr( Sig
.GetStem(stemno
));
324 m_StemCounts
[stemno
] = Sig
.GetStemCount(stemno
);
325 m_StemFrequencies
[stemno
] = Sig
.GetStemFrequency(stemno
);
327 if (m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
) {
328 m_SuffixPtrList
= new QList
<CSuffix
*>();
329 for ( affixno
= 0; affixno
< NumberOfAffixes
; affixno
++)
331 AppendSuffixPtr ( Sig
.GetSuffix(affixno
) );
332 m_AffixCounts
[affixno
] = Sig
.GetAffixCount(affixno
);
333 m_AffixFrequencies
[affixno
] = Sig
.GetAffixFrequency(affixno
);
336 if (m_AffixLocation
== WORD_INITIAL
|| m_AffixLocation
== STEM_INITIAL
) {
337 m_PrefixPtrList
= new QList
<CPrefix
*>();
338 for ( affixno
= 0; affixno
< NumberOfAffixes
; affixno
++)
340 AppendPrefixPtr ( Sig
.GetPrefix(affixno
) );
341 m_AffixCounts
[affixno
] = Sig
.GetAffixCount(affixno
);
342 m_AffixFrequencies
[affixno
] = Sig
.GetAffixFrequency(affixno
);
347 m_WordPtrList
= new QList
<CStem
*>();
348 for (stemno
= 0; stemno
< NumberOfStems
; stemno
++) {
349 for (affixno
= 0; affixno
< NumberOfAffixes
; affixno
++) {
350 SetWordCount(stemno
, affixno
, 0);
351 AppendWordPointer (Sig
.GetWord(stemno
, affixno
));
355 m_Robustness
= Sig
.GetRobustness();
357 m_SortStyle
= eAlphabetized
;
358 m_MentorList
= new QList
<CSignature
*>();
// Constructs a signature from a string surrogate: the CLParse base receives
// `ssSig`, then Collapse(ssSig, '.') is called on it. Allocates the five pointer
// lists, clears the generalizer, zeroes the description-length fields and takes
// the affix location from Lexicon when it is non-null.
// NOTE(review): m_SortStyle is assigned eAlphabetized twice (second one is
// redundant); when Lexicon is null no visible line assigns m_AffixLocation.
367 CSignature::CSignature(const CStringSurrogate
& ssSig
, CMiniLexicon
* Lexicon
) : CLParse(ssSig
, Lexicon
)
369 Collapse( ssSig
, '.');
372 m_StemPtrList
= new QList
<CStem
*>();
373 m_WordPtrList
= new QList
<CStem
*>();
374 m_MentorList
= new QList
<CSignature
*>();
375 m_SuffixPtrList
= new QList
<CSuffix
*>();
376 m_PrefixPtrList
= new QList
<CPrefix
*>();
377 m_SortStyle
= eAlphabetized
;
378 m_MyGeneralizer
= NULL
;
379 // Description Length
381 m_DLofMyStemPointers
= 0;
382 m_DLofMyAffixPointers
= 0;
383 m_LengthOfPointerToMe
= 0;
385 if( Lexicon
) m_AffixLocation
= Lexicon
->GetAffixLocation();
389 m_SortStyle
= eAlphabetized
;
393 CSignature::~CSignature()
396 if( m_StemPtrList
) delete m_StemPtrList
;
397 if( m_WordPtrList
) delete m_WordPtrList
;
398 if( m_MentorList
) delete m_MentorList
;
399 if( m_SuffixPtrList
) delete m_SuffixPtrList
;
400 if( m_PrefixPtrList
) delete m_PrefixPtrList
;
402 //===================================================================================================//
406 //===================================================================================================//
// Returns the display form of this signature using the delimiter configured
// under the "Sig_Delimiter" preference (looked up through m_pMyMini's document
// preferences); the first character of that string is passed to CParse::Display.
// NOTE(review): `sep` is not used on any visible line, and sd.at(0) requires a
// non-empty preference string - a fallback may exist on lines not shown here.
407 QString
CSignature::Display(QChar sep
, QMap
<QString
, QString
>* filter
) const
411 sd
= m_pMyMini
->GetDocument()->GetPreferences()
412 ->GetPreference("Sig_Delimiter");
416 return CParse::Display(sd
.at(0), filter
);
419 QString
CSignature::Display(QMap
<QString
, QString
>* filter
) const
420 { return CParse::Display(filter
); }
422 QString
CSignature::Display() const
423 { return CParse::Display('.'); }
425 //===================================================================================================//
429 //===================================================================================================//
// Takes over the parse structure of *pParse by calling CopyParseStructure().
// NOTE(review): pParse is dereferenced without a visible null check; other
// statements of this function may lie on lines not shown in this listing.
431 void CSignature::ConsumeParse( CParse
* pParse
)
435 CopyParseStructure( *pParse
);
// Placeholder: no implementation is visible, only the TODO below.
439 void CSignature::Suicide()
441 //TODO: fill this in;
443 void CSignature::SetMyGeneralizer (CSignature
* pSig
)
445 m_MyGeneralizer
= pSig
;
447 //===================================================================================================//
451 //===================================================================================================//
// Assignment from a signature pointer: copies the mini-lexicon, the CLParse
// part, the affix location, stem/word/affix pointers and counts, robustness and
// remark from *pSig. Returns void, so assignments cannot be chained.
// NOTE(review): the pointer lists are appended to without any visible clear(),
// so assigning to a non-empty signature would accumulate entries - confirm.
// NOTE(review): m_StemCounts is resized twice with the same size (redundant).
// NOTE(review): the prefix loop is bounded by this->GetNumberOfAffixes() while
// the suffix loop uses pSig->GetNumberOfAffixes(); since the prefix list is
// still being filled at that point the two can differ - likely should be pSig's.
// NOTE(review): pSig is dereferenced without a null or self-assignment check.
452 void CSignature::operator=(const CSignature
* pSig
)
454 m_pMyMini
= pSig
->GetMyMini();
455 CLParse::operator=(*pSig
);
456 m_AffixLocation
= pSig
->GetAffixLocation();
458 int NumberOfStems
= pSig
->GetNumberOfStems();
459 int NumberOfAffixes
= pSig
->GetNumberOfAffixes();
460 int NumberOfWords
= NumberOfStems
*NumberOfAffixes
;
461 m_StemCounts
.resize(NumberOfStems
);
462 m_WordCounts
.resize(NumberOfWords
);
463 m_AffixCounts
.resize(NumberOfAffixes
);
465 m_StemCounts
.resize(NumberOfStems
);
466 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++) {
467 m_StemPtrList
->append ( pSig
->GetStem(stemno
) );
468 m_StemCounts
[stemno
]=pSig
->GetStemCount(stemno
);
469 for (int affixno
= 0; affixno
< pSig
->GetNumberOfAffixes(); affixno
++)
471 m_WordPtrList
->append ( pSig
->GetWord(stemno
, affixno
));
472 SetWordCount(stemno
, affixno
, pSig
->GetWordCount(stemno
, affixno
));
476 if (m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
) {
477 for (int suffixno
= 0; suffixno
< pSig
->GetNumberOfAffixes(); suffixno
++)
479 m_SuffixPtrList
->append ( pSig
->GetSuffix(suffixno
) );
480 m_AffixCounts
[suffixno
] = pSig
->GetAffixCount(suffixno
);
483 for (int prefixno
= 0; prefixno
< GetNumberOfAffixes(); prefixno
++) {
484 m_PrefixPtrList
->append(pSig
->GetPrefix(prefixno
) );
485 m_AffixCounts
[prefixno
] = pSig
->GetAffixCount(prefixno
);
491 m_Robustness
= pSig
->GetRobustness();
493 m_Remark
= pSig
->GetRemark();
// Streams a textual form of *pSig: a newline and the signature's Display(),
// then the number of stems and the corpus count separated by a space, then the
// key of each stem whose key is not an empty CStringSurrogate.
// NOTE(review): the declaration of pStem, any separators written between stems
// and the return statement are not visible in this listing.
497 QTextStream
& operator<< (QTextStream
& stream
, CSignature
* pSig
)
501 stream
<< endl
<< pSig
->Display();
503 stream
<< pSig
-> GetNumberOfStems() << " " << pSig
->GetCorpusCount();
505 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++)
507 pStem
= pSig
->GetStem(stemno
);
508 if ( pStem
->GetKey() != CStringSurrogate() )
512 stream
<< pStem
->GetKey().Display();
524 // <<-------------------------------------------------------------------------------------------------------->>
// Adds `pStem` to this signature. Visible statements: test that the stem is not
// yet in m_StemPtrList, assert that its key is non-empty, append it, append
// each of the stem's words to m_WordPtrList, point the stem's suffix signature
// at this object, and refresh m_Robustness from GetRobustness().
// NOTE(review): the block structure (which statements sit inside the
// "not already present" test) and the declaration of pWord are not visible.
525 void CSignature::operator<< (CStem
* pStem
) //add to tail of list.
530 if ( m_StemPtrList
->indexOf ( pStem
) < 0 )
532 Q_ASSERT (pStem
->GetKeyLength() > 0);
533 m_StemPtrList
->append(pStem
);
536 Q_ASSERT ( m_PieceCount
<= m_LengthOfPieceVector
) ;
538 for (int wordno
= 0; wordno
< pStem
->GetWordPtrList()->size(); wordno
++)
540 pWord
= pStem
->GetWord(wordno
);
541 Q_ASSERT (pWord
->GetKeyLength() > 0);
542 m_WordPtrList
->append (pWord
);
544 pStem
->SetSuffixSignature (this);
547 m_Robustness
= GetRobustness();
550 //===================================================================================================//
552 // Accessors and setters
554 //===================================================================================================//
555 CSignature
* CSignature::GetMentor ( ) { return m_Mentor
; }
556 // <<-------------------------------------------------------------------------------------------------------->>
// Registers this signature with `pSig`: when pSig and its mentor list are
// non-null and the list does not already contain this object, `this` is
// appended to pSig's mentor list.
// NOTE(review): the assignment of m_Mentor itself is not visible in this
// listing - presumably on a line not shown; confirm.
557 void CSignature::SetMentor ( CSignature
* pSig
)
560 if( pSig
&& pSig
->GetMentorList() && pSig
->GetMentorList()->indexOf (this) < 0) {
561 pSig
->GetMentorList()->append( this );
// Number of affixes in this signature: the suffix-list count for STEM_FINAL /
// WORD_FINAL signatures, the prefix-list count for STEM_INITIAL / WORD_INITIAL.
// NOTE(review): the value returned for any other affix location is not visible
// in this listing.
566 int CSignature::GetNumberOfAffixes() const
569 if ( m_AffixLocation
== STEM_FINAL
|| m_AffixLocation
== WORD_FINAL
)
571 return m_SuffixPtrList
->count();
573 if ( m_AffixLocation
== STEM_INITIAL
|| m_AffixLocation
== WORD_INITIAL
)
575 return m_PrefixPtrList
->count();
581 void CSignature::AppendSuffixPtr (CSuffix
* pSuffix
) { m_SuffixPtrList
->append(pSuffix
);}
582 QList
<CSignature
*>* CSignature::GetMentorList( ) { return m_MentorList
; }
583 int CSignature::GetNumberOfStems() const { return m_StemPtrList
->count(); }
584 //int CSignature::GetNumberOfSuffixes () const { return m_SuffixPtrList->count(); }
585 void CSignature::SetRemark ( QString remark
) { m_Remark
= remark
; }
586 CPrefix
* CSignature::GetPrefix(int prefixno
) const { return m_PrefixPtrList
->at(prefixno
); }
587 QList
<CPrefix
*>* CSignature::GetPrefixPtrList() const { return m_PrefixPtrList
; }
588 QString
CSignature::GetRemark() const { return m_Remark
; }
589 QList
<CStem
*>* CSignature::GetStemPtrList() const { return m_StemPtrList
;}
590 CStem
* CSignature::GetStem(int stemno
) const { return m_StemPtrList
->at(stemno
); }
591 CSuffix
* CSignature::GetSuffix(int suffixno
) const { return m_SuffixPtrList
->at(suffixno
); }
592 QList
<CSuffix
*>* CSignature::GetSuffixPtrList() const { return m_SuffixPtrList
; }
593 int CSignature::GetTotalCount() const { return m_TotalCount
; }
594 double CSignature::GetCorpusCount() const { return corpus_count::GetCorpusCount();}
595 float CSignature::GetSortingQuantity() const { return (float) GetRobustness();}
597 bool CSignature::StemListContains(CStem
* pstem
) { return m_StemPtrList
->contains(pstem
); }
598 void CSignature::AppendStemPtr(CStem
* pStem
) const { m_StemPtrList
->append(pStem
);}
601 eAffixLocation
CSignature::GetAffixLocation() const { return m_AffixLocation
; }
602 // <<-------------------------------------------------------------------------------------------------------->>
// Returns the word stored for the (stemno, affixno) pair. Words are kept in
// m_WordPtrList in stem-major order: index = stemno * GetNumberOfAffixes() + affixno.
// Two guards precede the lookup: one for indices outside the stem/affix ranges
// and one for a computed index beyond the end of the word list.
// NOTE(review): what the guards do when they trigger is not visible in this
// listing (presumably they return NULL) - confirm.
603 CStem
* CSignature::GetWord(int stemno
, int affixno
) const
605 if (stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes())
607 if (stemno
* GetNumberOfAffixes() + affixno
>= m_WordPtrList
->size() )
609 return m_WordPtrList
->at(stemno
* GetNumberOfAffixes() + affixno
);
// Builds a CParse (`List`) containing the key of every stem in this signature;
// with no stems, `List` is returned as it stands at that point.
// NOTE(review): the declaration/initialisation of `List` and the final return
// are not visible in this listing. The original author flagged the early
// return below as a mistake without saying why.
611 CParse
CSignature::GetStems()
618 if ( m_StemPtrList
->count() == 0 ) { return List
; } // ********** This is clearly a mistake. Fix it.
619 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
621 List
.Append( GetStem(stemno
)->GetKey() );
625 // <<-------------------------------------------------------------------------------------------------------->>
634 // <<-------------------------------------------------------------------------------------------------------->>
635 double CSignature::GetStemFrequency(int stemno
) const {
636 if (stemno
< 0 || stemno
> GetNumberOfStems() ) return 0;
637 return m_StemFrequencies
[stemno
];
640 // <<-------------------------------------------------------------------------------------------------------->>
641 double CSignature::GetAffixFrequency(int affixno
) const {
642 if (affixno
< 0 || affixno
> GetNumberOfAffixes() ) {return 0; }
643 return m_AffixFrequencies
[affixno
];
645 // <<-------------------------------------------------------------------------------------------------------->>
646 double CSignature::GetStemCount(int stemno
) const {
647 if (stemno
< 0 || stemno
> GetNumberOfStems() ){ return 0; }
648 return m_StemCounts
[stemno
];
650 // <<-------------------------------------------------------------------------------------------------------->>
652 double CSignature::GetAffixCount(int affixno
) const
653 { if (affixno
< 0 || affixno
> GetNumberOfAffixes() ) return 0;
654 return m_AffixCounts
[affixno
];
656 // <<-------------------------------------------------------------------------------------------------------->>
657 double CSignature::GetWordCount(int wordno
)const {
658 if (wordno
< 0 || wordno
> GetNumberOfWords() ) { return 0;}
659 return m_WordCounts
[wordno
]; }
660 // <<-------------------------------------------------------------------------------------------------------->>
662 //===================================================================================================//
664 // Calculate frequencies and counts
666 //===================================================================================================//
// Walks every (suffix piece, stem) pair of this signature, looks the
// corresponding corpus word up in `Lexicon` with FindAWord(), and accumulates
// the corpus counts into TotalCorpusCount and a per-suffix SuffixCount array.
// Pieces are addressed 1..Size() here, hence the Size()+1 array; the array is
// released at the end.
// NOTE(review): no visible line stores TotalCorpusCount or SuffixCount anywhere
// before the array is deleted, so the function's observable effect is unclear
// from this listing.
// NOTE(review): a CSuffix is created with `new` on every outer iteration and no
// matching delete is visible - possible leak; confirm.
// NOTE(review): the declarations of pStem and pSuffix are not visible here.
667 void CSignature::CalculateFrequencies(CMiniLexicon
* Lexicon
)
669 CStringSurrogate Suffix
;
672 CCorpusWord
* pCorpusWord
;
673 Q_ASSERT( GetCorpusCount() > 0);
674 int TotalCorpusCount
= 0;
675 int* SuffixCount
= new int [ Size()+ 1 ];
676 for (int suffixno
= 1; suffixno
<= Size(); ++suffixno
)
677 { SuffixCount
[suffixno
] = 0; }
679 for (int suffixno
= 1; suffixno
<= Size(); suffixno
++)
681 Suffix
= GetPiece(suffixno
);
682 pSuffix
= new CSuffix(Suffix
);
684 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
686 pStem
= GetStem(stemno
);
687 pCorpusWord
= Lexicon
->FindAWord (pStem
, pSuffix
);
688 if( pCorpusWord
) // might not exist if we have collapsed signatures.
690 TotalCorpusCount
+= pCorpusWord
->GetCorpusCount();
691 SuffixCount
[suffixno
] += pCorpusWord
->GetCorpusCount();
696 delete [] SuffixCount
;
699 // <<-------------------------------------------------------------------------------------------------------->>
700 void CSignature::ListDisplay(Q3ListView
* List
,
701 QMap
<QString
, QString
>* filter
, bool ExpressDeletees
)
703 CSignature
sig(m_pMyMini
);
704 Express(sig
, ExpressDeletees
);
705 QString text
= sig
.Display('.', filter
);
707 static_cast<void>(new CSignatureListViewItem(
708 List
, text
, m_pMyMini
->GetIndex(), this, filter
));
711 // <<-------------------------------------------------------------------------------------------------------->>
712 void CSignature::FindCorpusCount( )
714 SetCorpusCount ( 0 );
715 for (int stemno
=0; stemno
< GetNumberOfStems(); stemno
++) {
716 for (int affixno
= 0; affixno
< GetNumberOfAffixes(); affixno
++)
717 IncrementCorpusCount ( GetWord(stemno
, affixno
)->GetCorpusCount() );
720 // <<-------------------------------------------------------------------------------------------------------->>
// Attaches `pStem` to this suffix signature. Visible steps: detach the stem
// from its previous suffix signature (if different from this one) and let that
// signature recalculate its pointers; locate or append the stem in
// m_StemPtrList; for every affix, look the stem+affix word up in the lexicon
// and append either the word (also pointing it at this signature) or NULL;
// point the stem's suffix signature at this object; add the stem's corpus
// count minus one to the corpus count; refresh m_Robustness.
// NOTE(review): the case labels of the switch, the conditions selecting between
// "append pWord" and "append NULL", and the declarations of stemno/pWord are
// not visible in this listing.
// NOTE(review): `bLookAtPreviousSig` and the local `stem` are not used on any
// visible line.
// NOTE(review): pStem->SetSuffixSignature(this) is also reached after the
// prefix branch - confirm that is intended for prefix signatures.
721 void CSignature::AttachToSuffixSig(CStem
* pStem
, bool bLookAtPreviousSig
) //add to tail of list.
724 int numberofaffixes
= GetNumberOfAffixes();
726 CSignature
* pOldSig
= pStem
->GetSuffixSignature();
727 QString stem
= pStem
->Display();
729 /* First, remove pStem from any other SuffixSignature it might be linked to.*/
730 if ( pOldSig
&& pOldSig
!= this ) {
731 pOldSig
->DetachStem( pStem
, eDo_Not_Call_Words
);
732 pOldSig
->RecalculateStemAndWordPointers();
735 stemno
= m_StemPtrList
->indexOf ( pStem
);
737 m_StemPtrList
->append( pStem
);
738 stemno
= GetNumberOfStems()-1;
741 switch( m_AffixLocation
){
744 for (int affixno
= 0; affixno
< numberofaffixes
; affixno
++)
746 pWord
= GetLexicon()->GetWordFromStemSuffix(pStem
, GetSuffix(affixno
));
749 AppendWordPointer( pWord
);
750 pWord
->SetSuffixSignature (this);
754 AppendWordPointer(NULL
);
760 for (int prefixno
= 0; prefixno
< numberofaffixes
; prefixno
++)
762 pWord
= GetLexicon()->GetWordFromStemPrefix(pStem
, GetPrefix(prefixno
));
765 AppendWordPointer( pWord
);
766 pWord
->SetPrefixSignature (this);
770 AppendWordPointer(NULL
);
776 pStem
->SetSuffixSignature( this );
777 IncrementCorpusCount( pStem
->GetCorpusCount()-1 );// first time CC is incremented
780 m_Robustness
= GetRobustness();
782 // <<-------------------------------------------------------------------------------------------------------->>
// Attaches `pStem` to this prefix signature. Visible steps: detach the stem
// from its previous prefix signature (if different from this one); append the
// stem if it is not yet listed; move each of the stem's words into
// m_WordPtrList and point them at this signature; point the stem's prefix
// signature at this object; add the stem's corpus count minus one to the
// corpus count; refresh m_Robustness.
// NOTE(review): after detaching, RecalculateStemAndWordPointers() is called on
// *this*, whereas the suffix counterpart calls it on pOldSig - likely an
// oversight; confirm.
// NOTE(review): `bLookAtPreviousSig` is not used on any visible line, and the
// declaration of pWord is not visible.
783 void CSignature::AttachToPrefixSig( CStem
* pStem
, bool bLookAtPreviousSig
) //add to tail of list.
786 CSignature
* pOldSig
= pStem
->GetPrefixSignature();
788 /* First, remove pStem from any other PrefixSignature it might be linked to.*/
789 if ( pOldSig
&& pOldSig
!= this ) {
790 pOldSig
->DetachStem( pStem
, eDo_Not_Call_Words
);
791 RecalculateStemAndWordPointers();
794 if( m_StemPtrList
->indexOf ( pStem
) < 0 ) {
795 AppendStemPtr( pStem
);
798 // move the Words from the old signature to this, the new one.
800 for (int wordno
= 0; wordno
< pStem
->GetNumberOfWords(); wordno
++) {
801 pWord
= pStem
->GetWord(wordno
);
802 m_WordPtrList
->append (pWord
);
803 pWord
->SetPrefixSignature (this);
808 pStem
->SetPrefixSignature( this );
809 IncrementCorpusCount( pStem
->GetCorpusCount()-1 );
810 m_Robustness
= GetRobustness();
813 // <<-------------------------------------------------------------------------------------------------------->>
// Robustness of this signature, computed when the cached m_Robustness is 0:
//   (Size() - 1) * total stem letters + (GetNumberOfStems() - 1) * affix letters
// where "affix letters" is GetKeyLength(), reduced by 4 when the signature
// contains the piece "NULL", and "stem letters" sums the key lengths of all stems.
// NOTE(review): this is a const method that assigns m_Robustness, so the member
// is presumably declared mutable - confirm.
// NOTE(review): because 0 doubles as the "not computed" marker, a signature
// whose robustness is genuinely 0 is recomputed on every call.
// NOTE(review): the declarations of StemLetters/pStem and the return statement
// are not visible in this listing.
814 double CSignature::GetRobustness() const
816 int SuffixLetters
= 0,
819 if (m_Robustness
== 0)
821 SuffixLetters
= GetKeyLength();
822 QString Null
= "NULL";
823 if ( Contains( CStringSurrogate(Null
.unicode(),0,Null
.length()) ) ) { SuffixLetters
-= 4; }
826 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
827 pStem
= GetStem(stemno
);
828 StemLetters
+= pStem
->GetKeyLength();
831 m_Robustness
= ( Size() - 1 ) * StemLetters
+ (GetNumberOfStems() - 1) * SuffixLetters
;
836 // <<-------------------------------------------------------------------------------------------------------->>
837 void CSignature::SetRobustness ( double R
) { m_Robustness
= R
; }
838 // <<-------------------------------------------------------------------------------------------------------->>
840 // the counts of each individual word analyzed by this signature.
841 //double* CSignature::GetWordCounts() const { return m_WordCounts;
843 // <<-------------------------------------------------------------------------------------------------------->>
844 double CSignature::GetWordCount(int stemno
, int affixno
) const
846 if ( stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes() ) return 0;
847 return m_WordCounts
[stemno
* GetNumberOfStems() + affixno
];
849 // <<-------------------------------------------------------------------------------------------------------->>
// Stores `value` as the count of the word formed by stem `stemno` and affix
// `affixno`, at flat index stemno * GetNumberOfAffixes() + affixno of m_WordCounts.
// NOTE(review): the statement controlled by the range check is not visible in
// this listing (presumably an early return) - confirm.
// NOTE(review): the index is not checked against m_WordCounts' current size.
850 void CSignature::SetWordCount (int stemno
, int affixno
, double value
)
852 if ( stemno
< 0 || affixno
< 0 || stemno
>= GetNumberOfStems() || affixno
>= GetNumberOfAffixes() )
854 m_WordCounts
[stemno
* GetNumberOfAffixes() + affixno
] = value
;
859 // <<-------------------------------------------------------------------------------------------------------->>
// Resets and resizes the word/stem/affix count vectors to match the current
// numbers of stems and affixes, zeroes the stem and affix counts, reads each
// word's corpus count, and - when m_TotalCount is positive - resizes the three
// frequency vectors and fills them as count / m_TotalCount.
// NOTE(review): the four statements that would store and accumulate `count`
// are commented out, so on the visible lines the counts stay at their reset
// values and m_TotalCount is not updated here - confirm this is intended.
// NOTE(review): GetWord()'s result is dereferenced without a null test.
// NOTE(review): the declarations of pWord, count, wordno and of the `affixno`
// used by the inner frequency loop are not visible in this listing.
861 void CSignature::CalculateWordCounts()
864 int numberofstems
= GetNumberOfStems();
865 int numberofaffixes
= GetNumberOfAffixes();
870 m_WordCounts
.clear();
871 m_WordCounts
.resize(numberofstems
*numberofaffixes
);
872 m_StemCounts
.clear();
873 m_StemCounts
.resize(numberofstems
);
874 m_AffixCounts
.clear();
875 m_AffixCounts
.resize(numberofaffixes
);
877 for (int affixno
= 0; affixno
< numberofaffixes
; affixno
++) { m_AffixCounts
[affixno
] = 0; }
878 for (int stemno
= 0; stemno
< numberofstems
; stemno
++) { m_StemCounts
[stemno
] = 0; }
882 for (int stemno
= 0; stemno
< numberofstems
; stemno
++)
884 for ( int affixno
= 0; affixno
< numberofaffixes
; affixno
++)
886 pWord
= GetWord(stemno
, affixno
);
887 count
= pWord
->GetCorpusCount();
888 // SetWordCount (stemno, affixno, count);
889 // m_StemCounts[stemno] = m_StemCounts[stemno] + count;
890 // m_AffixCounts[affixno] = m_AffixCounts[affixno] + count;
891 // m_TotalCount += count;
895 if (m_TotalCount <= 0) return;
897 m_WordFrequencies.resize(numberofstems*numberofaffixes);
898 m_StemFrequencies.resize(numberofstems);
899 m_AffixFrequencies.resize(numberofaffixes);
901 for ( int stemno = 0; stemno < numberofstems; stemno++)
903 m_StemFrequencies[stemno] = m_StemCounts[stemno]/m_TotalCount;
904 for ( affixno = 0; affixno < numberofaffixes; affixno++)
906 wordno = stemno * numberofaffixes + affixno;
907 m_WordFrequencies[wordno] = GetWordCount(stemno, affixno) / m_TotalCount;
911 for (int affixno = 0; affixno < numberofaffixes; affixno++){
912 m_AffixFrequencies[affixno] = m_AffixCounts[affixno] / m_TotalCount;
918 //=================================================================================================/
920 // TODO: make sure COST function is consistent with older versions and working right
// Computes a cost figure for this signature relative to `Lexicon`. For each
// affix piece (1..Size()) the affix is looked up in the lexicon's suffix or
// prefix collection according to m_AffixLocation, and a per-affix cost based on
// base2log(NumberOfWords / use-count) is accumulated; for each stem a cost and
// a saving based on its key length and base2log(26) per letter are
// accumulated. The result combines AffixCost, StemCost, AffixSavings,
// StemSavings and SignatureCost.
// NOTE(review): the declarations and initial values of the accumulators, the
// else-branches and the return statement are not visible in this listing.
// NOTE(review): StemCost is a running total, yet it is added to SignatureCost on
// every stem iteration, so earlier stems are counted repeatedly - confirm.
// NOTE(review): the NumberOfWords divisions may be integer divisions depending
// on declarations not shown here - confirm the operand types.
921 double CSignature::FindCost(CMiniLexicon
* Lexicon
)
923 //=================================================================================================/
929 Sum over all of its stems :
931 log ( CorpusSize / Stem-count ) ( cost )
932 length ( stem ) * cost of a letter ( savings )
934 Sum over all of its suffixes:
936 log ( CorpusSize / suffix-count ) ( cost )
937 length ( suffix ) * cost of a letter ( savings )
947 CostOfALetter
= base2log (26),
949 NumberOfWords
= Lexicon
->GetWords()->GetCount();
953 for (int affixno
= 1; affixno
<= Size(); affixno
++)
955 if( m_AffixLocation
== WORD_FINAL
|| m_AffixLocation
== STEM_FINAL
)
957 pAffix
= *Lexicon
->GetSuffixes() ^= GetPiece(affixno
);
961 pAffix
= *Lexicon
->GetPrefixes() ^= GetPiece(affixno
);
964 if ( pAffix
) // it already exists
966 ThisAffixCost
= base2log ( NumberOfWords
/ pAffix
->GetUseCount() );
970 ThisAffixCost
= base2log ( NumberOfWords
/GetNumberOfStems() );
971 ThisAffixCost
+= GetPiece(affixno
).GetLength() * CostOfALetter
;
973 AffixCost
+= ThisAffixCost
;
975 AffixSavings
+= GetPiece(affixno
).GetLength() * CostOfALetter
;
977 SignatureCost
+= ThisAffixCost
;
981 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
983 pStem
= m_StemPtrList
->at(stemno
);
984 StemCost
+= base2log ( NumberOfWords
/ Size() ); // Size is the number of words that use stem, of course.
985 StemCost
+= pStem
->GetKeyLength() * CostOfALetter
;
986 StemSavings
+= pStem
->GetKeyLength() * CostOfALetter
* Size(); // save for each time stem appears, with each suffix
987 SignatureCost
+= StemCost
;
990 Cost
= AffixCost
+ StemCost
- AffixSavings
- StemSavings
+ SignatureCost
;
996 // <<-------------------------------------------------------------------------------------------------------->>
// Writes a human-readable report of this signature to `outf`: a ruled header
// with the signature's display form (using the mini-lexicon's output filter),
// the stem count, corpus count, remark (spaces replaced by underscores), affix
// count and word-list length; then the stems sorted with stemlessthan (largest
// corpus count first) with their counts; then every (stem, affix) word with its
// corpus count. CalculateWordCounts() is called before the tables are written.
// NOTE(review): the declarations of pStem, pWord, maxlength and `string`, and
// any line breaks written between table rows, are not visible in this listing.
// NOTE(review): in the word table GetWord()'s result is dereferenced without a
// visible null test.
998 void CSignature::OutputSignature( QTextStream
& outf
)
1006 outf
<< " ------------------------------------------------------------------------------------------ " << endl
;
1007 outf
<< Display( '.', m_pMyMini
->GetOutFilter() );
1008 outf
<< endl
<< " ------------------------------------------------------------------------------------------ " << endl
;
1015 outf
<< "Number of stems: ";
1016 outf
<< QString("%1").arg( (int) GetNumberOfStems() );
1018 outf
<< " Corpus count: ";
1019 outf
<< QString("%1").arg( GetCorpusCount() );
1023 outf
<< GetRemark().replace( QChar(' '), "_" );
1026 outf
<< "Number of affixes: ";
1027 outf
<< GetNumberOfAffixes();
1028 outf
<< " Word Pointer List length: ";
1029 outf
<< m_WordPtrList
->count();
1035 CalculateWordCounts();
1039 outf
.setFieldAlignment( QTextStream::AlignLeft
);
1040 QList
< QPair
<CStem
*, int> > pstems
;
1041 for (int stemno
=0; stemno
< GetNumberOfStems(); stemno
++ )
1043 pStem
= GetStem(stemno
);
1044 pstems
.append( qMakePair(pStem
, pStem
->GetCorpusCount() ) );
1045 if (pStem
->GetKeyLength() > maxlength
) { maxlength
= pStem
->GetKeyLength();}
1047 qSort(pstems
.begin(), pstems
.end(), stemlessthan
);
1049 outf
<< "Sorted by stem frequency: " << endl
<< endl
;
1050 outf
<< "# Rank | Stem | Words .... " << endl
;
1051 outf
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1054 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
1058 pStem
= pstems
[stemno
].first
;
1061 outf
. width( maxlength
+ 5);
1062 outf
<< pStem
->Display();
1064 outf
<< pstems
[stemno
].second
;
1067 outf
<< endl
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1068 outf
<< endl
<< endl
<<"Display all words with counts: " << endl
;
1069 outf
<< "# ------------------------------------------------------------------------------------------ " << endl
;
1071 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
1073 for ( int affixno
= 0; affixno
< GetNumberOfAffixes(); affixno
++)
1075 pWord
= GetWord(stemno
, affixno
);
1078 outf
.setFieldWidth (maxlength
+ 5); outf
<< pWord
->Display();
1079 outf
.setFieldWidth (5) ; outf
<< string
.setNum( pWord
->GetCorpusCount() );
1084 outf
<< endl
<< endl
;
1090 /* The purpose of this function is to take a signature of the form A.SUFFIX
1091 and make it NULL.SUFFIX (the pAlternateSig), and move that letter A back onto its stems.
1094 // <<-------------------------------------------------------------------------------------------------------->>
// RemoveLetter: rebuilds this signature so that ssLetter stops being an affix
// of its own.
//  - The piece equal to ssLetter is appended to NewSig as NULL; the other
//    pieces are rebuilt as <letter>piece and registered in the suffix
//    collection of Lexicon (presumably in an else branch that this listing
//    does not show).
//  - Each stem of this signature gets ssLetter appended to its key, or is
//    merged into the already existing longer stem when there is one.
//  - Each word has its stem/suffix boundary shifted by the letter length and
//    is pointed at the translated suffix.
//  - Stems of pAlternateSig that end in ssLetter are moved to this signature,
//    and pAlternateSig is removed from Lexicon once it has no stems left.
// NOTE(review): this listing omits several source lines (local declarations,
// braces, else keywords), so the exact nesting must be confirmed against the
// complete file.
1095 void CSignature::RemoveLetter (CStringSurrogate
& ssLetter
, CMiniLexicon
* Lexicon
, CSignature
* pAlternateSig
)
1104 QString OldKey
= Display();
1105 CStringSurrogate ssSuffix
,
1108 CSignature
NewSig ( WORD_FINAL
, Lexicon
);
1109 int LetterLength
= ssLetter
.GetLength();
1111 CSignature
*qSig
= NULL
,
// Maps the display form of each old suffix to the suffix object created for
// its bracketed replacement; consulted again in the word loops further down.
1117 QMap
<QString
,CSuffix
*> SuffixPtrTranslation
;
1119 /* Create the NewSig */
1120 for (int affixno
= 1; affixno
<= Size(); affixno
++)
1122 ssSuffix
= GetPiece(affixno
);
// Alphabetize NewSig unless it is already alphabetized. The test is written
// with != so that it no longer depends on the numeric value of eAlphabetized
// (the earlier form negated the sort style before comparing it).
1123 if(NewSig
.GetSortStyle() != eAlphabetized
) NewSig
.Alphabetize();
1124 if ( ssSuffix
== ssLetter
)
// Same corrected sort-style test as above.
1126 if(NewSig
.GetSortStyle() != eAlphabetized
) NewSig
.Alphabetize();
1127 NewSig
.Append ( CStringSurrogate(Null
.unicode(),0,Null
.length()) );
1131 QString lt_brak
= "<", rt_brak
= ">";
// Build the bracketed suffix: < + letter + > + old suffix.
1133 PSuffix
= CStringSurrogate(lt_brak
.unicode(),0,1);
1134 PSuffix
+= ssLetter
;
1135 PSuffix
+= CStringSurrogate(rt_brak
.unicode(),0,1);
1136 PSuffix
+= ssSuffix
;
1138 pSuffix
= *Lexicon
->GetSuffixes() << PSuffix
;
1140 Suffix
= "<" + ssLetter
.Display() + ">" + ssSuffix
.Display();
1141 SuffixPtrTranslation
[ ssSuffix
.Display() ] = pSuffix
; // based on old suffix
1142 // SuffixStringTranslation[ ssSuffix.Display() ] = Suffix;
1144 NewSig
.Append ( PSuffix
.GetKey() );
1148 /* Change the KEY of this signature */
1151 QString remark
= GetRemark() + " +allomorphy";
1152 SetRemark ( remark
);
1154 //-----------------------------------------------------------//
1155 // Change the signature, the stems, the words -- and the suffixes.
1156 //-----------------------------------//
1157 /* Deal with the stems */
1158 //-----------------------------------//
1161 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
1163 CStem
* pStem
= m_StemPtrList
->at(stemno
);
1164 ssStem
= pStem
->GetKey();
1165 PNewStem
= ssStem
+ ssLetter
;
// Look up whether the lengthened stem is already known to the lexicon.
1166 qStem
= *Lexicon
->GetStems() ^= PNewStem
;
1168 if (qStem
) // -- if the larger one already existed
1170 pOlderSig
= *Lexicon
->GetSignatures() ^= qStem
->GetSuffixList();
1172 // this removes both stem and word from signature:
1173 pOlderSig
-> DetachStem ( qStem
, eCall_Words
); // we might want to eliminate this sig if it has no more stems
1175 qStem
-> GetSuffixList()->MergeAndAlphabetizeParse( CParse(NewSig
) );
1177 qSig
= *Lexicon
->GetSignatures() << qStem
->GetSuffixList();
1179 // attaches both stems and words to qSig
1180 qSig
-> AttachToSuffixSig(qStem
, false);
1183 else // make the old stem into this new one
1185 pStem
-> RepairSuffixList ( Lexicon
);
1186 Lexicon
-> GetStems()-> SetKey( pStem
, PNewStem
);
1187 pStem
-> SetKey( PNewStem
);
// The word loop below attaches every word to the LAST stem of this signature.
// NOTE(review): confirm that using a single stem for all words is intended.
1191 Q_ASSERT(m_StemPtrList
->size() > 0);
1192 CStem
* pStem
= m_StemPtrList
->at(m_StemPtrList
->size() - 1);
1194 //---------------------------------------------//
1195 /* Deal with the WORDs of this signature */
1196 //---------------------------------------------//
1198 for (int wordno
= 0; wordno
< m_WordPtrList
->size(); wordno
++)
1200 pWord
= m_WordPtrList
->at(wordno
);
1201 pNewSuffix
= SuffixPtrTranslation
[ pWord
->GetSuffix().Display() ];
1202 pWord
-> ShiftStemSuffixBoundary ( LetterLength
);
1204 pWord
-> SetSuffixPtr ( pNewSuffix
);
1205 pWord
-> AttachWordAndSuffixalStem ( pStem
);
1206 pWord
-> SetSuffixSignature ( this );
1210 //------------------------------------------------------------//
1212 //------------------------------------------------------------//
1213 /* Shift stems from AlternateSig to the NewSig, but NOT
1214 if the stem ends with Letter; if it does, we'll
1215 keep the old signature with that stem.
1218 This will replace some or all of pAlternateSig --
1219 "some" when there are any stems that don't allow removal of the Letter.
1220 For example, NULL.ing will not disappear when <e>ing.NULL is created,
1221 because the stem "be" still requires NULL.ing --
1224 // Deal with stems in AlternateSig....
// NOTE(review): DetachStem is called on pAlternateSig while its stems are
// being indexed by stemno; if that call shortens the list, the following stem
// would be skipped. Confirm against the complete source.
1226 for (int stemno
= 0; stemno
< pAlternateSig
->GetNumberOfStems(); stemno
++)
1228 pStem
= pAlternateSig
->GetStem(stemno
);
1229 ssStem
= pStem
->GetKey();
1230 if ( ssStem
.Right(LetterLength
) == ssLetter
)
1233 pAlternateSig
->DetachStem( pStem
, eCall_Words
);
1234 AttachToSuffixSig( pStem
, false );
1236 // Deal with Words in Alternate signature
// NOTE(review): this loop fetches entries with GetStem although the heading
// speaks of words. Confirm which list is meant.
1238 for (int stemno
= 0; stemno
< pAlternateSig
->GetNumberOfStems(); stemno
++)
1240 pWord
= pAlternateSig
->GetStem(stemno
);
1241 pNewSuffix
= SuffixPtrTranslation
[ pWord
->GetSuffix().Display() ];
1243 pWord
-> SetSuffixPtr ( pNewSuffix
);
1244 pWord
-> AttachWordAndSuffixalStem ( pStem
);
1245 pWord
->SetSuffixSignature ( this );
1248 //------------------------------------------------------------//
1250 /* Get rid of the Alternate Sig ("NULL.ing" ) */
1252 if ( pAlternateSig
->GetNumberOfStems() == 0 )
1254 Lexicon
->GetSignatures()->Remove( pAlternateSig
);
1259 // <<------------------------------------------------------------------------>>
// For every piece of this signature, builds the string Letter + piece, with
// the NULL piece treated as the empty string. The lookup of that string in
// the suffix collection is stubbed out with if(0) and a TODO, so the
// candidate is never actually tested here.
// NOTE(review): the return statements are not part of this listing.
1260 bool CSignature::EachSuffixCanHaveThisLetterPrefixedToIt ( const QString
& Letter
)
1263 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1264 Suffix
= GetPiece(affixno
).Display();
1265 if (Suffix
== "NULL" ) { Suffix
= ""; }
1266 Suffix
= Letter
+ Suffix
;
// Disabled check: the condition is the constant 0.
1267 if(0)// TODO: if ( ! (*Lexicon->GetSuffixes() ^= Suffix ) )
1275 // <<------------------------------------------------------------------------>>
// Not implemented: the first statement throws a local not_implemented object,
// so nothing after the throw is executed. The statements that follow sketch
// the intended behavior: call ShiftStemSuffixBoundary(-1) on every word and
// replace every stem key by itself minus one character (via Left).
1276 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance
,
1277 const QString
& Piece
)
1279 struct not_implemented
{ };
1280 throw not_implemented();
1282 // XXX. suppresses “unused parameter” warnings
1283 static_cast<void>(Distance
);
1284 static_cast<void>(Piece
);
// Unreachable while the throw above is in place.
1286 foreach (CStem
* word
, *m_WordPtrList
) {
1287 word
->ShiftStemSuffixBoundary(-1);
1288 Q_ASSERT(word
->GetStemLoc() != 0);
1291 foreach (CStem
* stem
, *m_StemPtrList
) {
1292 CStringSurrogate stem_text
= stem
->GetKey();
1294 stem
->SetKey(stem_text
.Left(stem_text
.GetLength() - 1));
1296 // XXX. Check to see if the new stem already exists.
1297 // Lexicon->GetStems()->GetHash()->RemoveKey ( Stem );
1298 // Lexicon->GetStems()->GetHash()->SetAt( NewStem, pStem );
1299 // Lexicon->GetStems()->SetKey( pStem, NewStem );
1302 // XXX. fix the signature
1303 // AddLetter ( 1, Piece );
1305 // Lexicon->AddToScreen ( Display() );
1308 // Variant in which the shifted string varies from stem to stem.
// Not implemented: throws a local not_implemented object before doing any
// work, so the word loop below is never reached. Distance is only referenced
// to silence the unused-parameter warning.
1309 void CSignature::ShiftStemSuffixCutToTheLeft(int Distance
)
1311 // XXX. suppresses “unused parameter” warning
1312 static_cast<void>(Distance
);
1313 struct not_implemented
{ };
1314 throw not_implemented();
1316 // first, fix the words;
1317 foreach (CStem
* word
, *m_WordPtrList
) {
1318 word
->ShiftStemSuffixBoundary(-1);
1319 Q_ASSERT(word
->GetStemLoc() != 0);
1322 // XXX. fix the signature
1323 // AddLetter ( 1, Piece );
1325 // Lexicon->AddToScreen ( Display() );
// Wraps Letter in a CStringSurrogate that spans its whole length and hands it
// to PrefixToAllPieces.
1328 void CSignature::AddLetter(const QString
& Letter
)
1330 PrefixToAllPieces ( CStringSurrogate(Letter
.unicode(),0,Letter
.length() ) );
1334 /// Looks at the final ngrams of the stems, and calculates its entropy
// For each stem the n-character edge is taken with Left(n) when the affix
// location is initial and with Right(n) otherwise. The entropy is then summed
// over the distinct ngrams as log2(f) / f with f = StemCount / ngram count,
// which equals p * log2(1/p) for p = ngram count / StemCount.
// NOTE(review): the statement executed for a stem whose key length is <= n,
// the insertion into Ngrams and the return are missing from this listing;
// CheckOut treats a negative result as meaning a stem was too short.
1335 double CSignature::ComputeFinalNgramEntropyOfStems(int n
)
1337 TCollection
<CLParse
> Ngrams
;
1338 foreach (CStem
* pStem
, *m_StemPtrList
) {
1339 if (pStem
->GetKeyLength() <= n
)
1343 CStringSurrogate ssPiece
= pStem
->GetKey();
1344 ssPiece
= is_initial(GetAffixLocation()) ?
1345 ssPiece
.Left(n
) : ssPiece
.Right(n
);
1349 double Entropy
= 0.0;
1350 const double StemCount
= GetNumberOfStems();
1351 const int ngram_count
= Ngrams
.GetCount();
1352 for (int i
= 0; i
< ngram_count
; ++i
) {
// fraction is the inverse of the relative frequency of this ngram.
1353 const double fraction
= StemCount
/ Ngrams
[i
]->GetCorpusCount();
1354 Entropy
+= log2(fraction
) / fraction
;
1359 //===================================================================================================//
1361 // CHECK OUT: major function
1363 //===================================================================================================//
1364 /// Test to see whether the break with its stems is a good one.
// Outline, as far as this listing shows (several source lines are omitted):
//  1. Logs the stems of this signature in table rows of num_columns.
//  2. For n = 1..LengthToConsiderShifting computes the entropy of the
//     n-letter stem edge; when it is not >= EntropyThreshold, n is recorded
//     as LargestSizeChunkToPullOffStem and LowEntropyFlag is set.
//  3. Computes the description length of the current analysis as affix
//     pointers + prorated affix listings + stem pointers.
//  4. For each shift length from LargestSizeChunkToPullOffStem down to 1,
//     sums the description length of the signatures obtained by moving each
//     distinct edge piece from the stems onto the affixes, keeping the
//     cheapest total in WinningDL.
//  5. Returns WinningLengthOfStemToShift when WinningDL differs from
//     CurrentDL. The other return statements are not visible here.
1365 int CSignature::CheckOut(CMiniLexicon
* Lexicon
)
1367 using linguistica::implicit_cast
;
1368 // Throughout, “DL” stands for “description length”.
1369 Lexicon
->LogFileSmallTitle( Display() );
1370 if (Lexicon
->LogFileOn()) {
1371 // dump stem list to log file.
1372 Lexicon
->LogFileStartTable();
1373 Lexicon
->LogFileStartRow();
1374 const int num_columns
= 5;
1377 CParse Stems
= GetStems();
1378 for (int stemno
= 1; stemno
<= GetNumberOfStems(); ++stemno
) {
1379 if (stemno
% num_columns
== 0) {
1380 Lexicon
->LogFileEndRow(); Lexicon
->LogFileStartRow();
1382 Lexicon
->LogFile( Stems
[stemno
].Display());
1384 Lexicon
->LogFileEndRow(); Lexicon
->LogFileEndTable();
1385 } // end of logfile on
1386 Lexicon
->LogFileHeader("Number of letters","Entropy", "Resolution?" );
1387 bool LowEntropyFlag
= false;
1388 int LargestSizeChunkToPullOffStem
= 0;
1389 // Use entropy to see how many letters to consider shifting
1390 // XXX. Make this user-changeable.
1391 const double EntropyThreshold
= 1.5;
1392 const int LengthToConsiderShifting
= 4;
1393 for (int n
= 1; n
<= LengthToConsiderShifting
; ++n
) {
1394 const double Entropy
= ComputeFinalNgramEntropyOfStems(n
);
1397 // Negative entropy:
1398 // stem too short to consider shortening.
1399 Lexicon
->LogFile("", "", "No reanalysis");
1403 if (Entropy
>= EntropyThreshold
) {
1404 Lexicon
->LogFile ("", "", "Entropy too large.");
1408 // set of n-suffixes of stems has low entropy:
1409 // maybe stems have a common suffix that should be
1410 // incorporated into the signature.
1411 LowEntropyFlag
= true;
1412 LargestSizeChunkToPullOffStem
= n
;
1413 Lexicon
->LogFile(n
, Entropy
, "Entropy sufficiently small.");
1414 } //end of loop on n
1415 Lexicon
->LogFileEndTable();
1416 if (!LowEntropyFlag
)
1417 // Not enough stems share common endings to restructure,
1418 // so leave this signature alone.
1421 const bool analyzingSuffixes
= !is_initial(GetAffixLocation());
1423 const double TotalNumberOfAnalyzedWords
=
1424 Lexicon
->GetSignatures()->GetTotalNumberOfWords();
1425 const double LogTotalNumberOfAnalyzedWords
=
1426 base2log(TotalNumberOfAnalyzedWords
);
// Pointer length to this signature: log2(total words) minus
// log2(number of pieces * number of stems).
1427 const double LengthOfPointerToThisSig
=
1428 LogTotalNumberOfAnalyzedWords
-
1429 base2log(Size() * GetNumberOfStems());
1431 // Description length of the original analysis
1434 // DL of this signature:
1436 // a. Length of pointers to its suffixes; var: LengthOfPointersToAllAffixesOfSig
1437 // b. Prorated responsibility for phonological content of suffixes
1438 // var: TotalResponsibilityForAffixListings
1439 // c. List of pointers from each stem to this signature
1440 // var: StemPointersToThisSig;
1441 // d. List of pointers from each word to its suffix
1443 // Compute DL of 'original' analysis.
1444 Lexicon
->LogFileSmallTitle ("Description length of current signature");
1445 Lexicon
->LogFileHeader("Affix", "Use count", "Pointer to this affix"); ;
1447 double LengthOfPointersToAllAffixesOfSig
= 0.0;
1448 double TotalResponsibilityForAffixListings
= 0.0;
1449 // for each suffix (resp. prefix) in this signature:
1450 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1451 QString Affix
= GetPiece(affixno
).Display();
1452 CAffix
* pAffix
= analyzingSuffixes
1453 ? implicit_cast
<CAffix
*>(
1454 *Lexicon
->GetSuffixes() ^= Affix
)
1455 : implicit_cast
<CAffix
*>(
1456 *Lexicon
->GetPrefixes() ^= Affix
);
1458 // Length of pointers to affixes
1460 const double LengthOfPointerToThisAffix
=
1461 LogTotalNumberOfAnalyzedWords
-
1462 base2log(pAffix
->GetUseCount());
1463 LengthOfPointersToAllAffixesOfSig
+=
1464 LengthOfPointerToThisAffix
;
1466 Lexicon
->LogFile(Affix
, pAffix
->GetUseCount(), LengthOfPointerToThisAffix
);
1468 // use count of affix; length of pointer to this affix.
1469 // Assign partial responsibility for this signature's
1470 // suffixes' entries.
1472 const double LocalProportion
=
1473 double(GetNumberOfStems()) / pAffix
->GetUseCount();
// Affix listing cost: share of the affix * its length * log2(26) bits.
1474 const double ResponsibilityForThisAffixListing
=
1475 LocalProportion
* Affix
.length() * base2log(26);
1476 TotalResponsibilityForAffixListings
+=
1477 ResponsibilityForThisAffixListing
; // in *bits*
1478 }// end of affixno loop
1480 Lexicon
->LogFileEndTable();
1481 Lexicon
->LogFileStartTable();
1482 Lexicon
->LogFile("Part 1: Length of pointer to affixes", LengthOfPointersToAllAffixesOfSig
);
1483 Lexicon
->LogFile("Part 2: Prorated responsibility for phonology of affixes:", TotalResponsibilityForAffixListings
);
1486 const double StemPointersToThisSig
=
1487 GetNumberOfStems() * LengthOfPointerToThisSig
;
1490 const double total_dl
=
1491 LengthOfPointersToAllAffixesOfSig
+
1492 TotalResponsibilityForAffixListings
+
1493 StemPointersToThisSig
;
1494 Lexicon
->LogFile("Part 3: Stem poionters to this sig:", StemPointersToThisSig
);
1495 Lexicon
->LogFile("Length of 1 pointer to this sig: ", LengthOfPointerToThisSig
);
1496 Lexicon
->LogFile("Total", total_dl
);
1497 Lexicon
->LogFileEndTable();
1498 CurrentDL
= total_dl
;
// WinningDL starts at the current cost and is only ever lowered, so a value
// different from CurrentDL at the end means an alternative won.
1500 double WinningDL
= CurrentDL
;
1501 int WinningLengthOfStemToShift
= 0;
1503 // We might shift only those stems for which the EndPiece
1504 // occurs in more than 45% of the stems of this sig (that
1505 // leaves open the case of two closely related letters
1506 // comprising almost all of the cases).
1507 // But for now, we're not doing that.
1509 // The outer loop here is for the case where the entropy test
1510 // tells us that 2 or more letters can be shifted
1511 // (e.g., sig on.ve can be shifted either to ion.ive or
1512 // tion.tive), and we want to evaluate both.
1514 // Major loop through alternatives to the current signature
1516 // loop through different lengths to shift:
1517 for (int NumberOfLettersShifted
= LargestSizeChunkToPullOffStem
;
1518 NumberOfLettersShifted
> 0;
1519 --NumberOfLettersShifted
) {
// Collect the distinct edge pieces of this length over all stems.
1521 TCollection
<CLParse
> EndPieces
;
1522 foreach (CStem
* pStem
, *m_StemPtrList
) {
1523 if (pStem
->GetKeyLength() <= NumberOfLettersShifted
)
1526 CStringSurrogate stem_text
= pStem
->GetKey();
1527 CStringSurrogate ssPiece
= analyzingSuffixes
1528 ? stem_text
.Right(NumberOfLettersShifted
)
1529 : stem_text
.Left(NumberOfLettersShifted
);
1530 EndPieces
<< ssPiece
;
1533 // XXX. The function is supple enough to move material
1534 // from the stem to the affix in some cases but not in others.
1536 double AllNewSigsAnalysisDL
= 0.0;
1537 double TotalDecreaseInDLDueToShorterStems
= 0.0;
1538 // each of these is a distinct piece being, perhaps,
1539 // transferred from stem(s) to affixes
1540 // for each string of this length that would have to be shifted:
1542 for (int pieceno
= 0; pieceno
< EndPieces
.GetCount(); ++pieceno
) {
1543 CLParse
* pPiece
= EndPieces
.GetAt(pieceno
);
1545 // make a copy to play with.
1548 if (analyzingSuffixes
)
1549 Sig
.PrefixToAllPieces2(pPiece
->GetKey());
1551 Sig
.SuffixToAllPieces2(pPiece
->GetKey());
1553 // DL of this signature:
1555 // a. Length of pointers to its suffixes;
1556 // var: LengthOfPointersToAllAffixesOfSig
1557 // b. Prorated responsibility for phonological
1558 // content of suffixes
1559 // var: TotalResponsibilityForAffixListings
1560 // c. List of pointers from each stem to this
1562 // var: PointersToThisSig;
1563 // d. Savings because stems already existed
1564 // var: SavingsBecauseStemAlreadyExisted
1565 // e. Savings because stems are shorter
1566 // var: TotalDecreaseInDLDueToShorterStems :
1567 // once for each *length* being shifted from
1569 // f. List of pointers from each word to its
1571 // XXX. not implemented.
1573 double LengthOfPointersToAllAffixesOfSig
= 0.0;
1574 double TotalResponsibilityForAffixListings
= 0.0;
1575 if (*Lexicon
->GetSignatures() ^= Sig
) {
1576 // new signature already exists
1577 Lexicon
->LogFileSmallTitle("Alternative analysis already existed", Sig
.Display('-'));
1578 // XXX. address this case!
1581 Lexicon
->LogFileSmallTitle("Conjectured signature: ", Sig
.Display('-'));
1583 // iterate through suffixes of the signature
1584 Lexicon
->LogFileHeader("Suffix", "Previous count", "New count", "Pointer length to this affix", "Responsibility for this affix (phonology) in bits:", "New DL for this affix");
1585 double ThisNewSigDL
= 0.0;
1586 // for each suffix (resp prefix) in the new sig:
1587 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1588 CStringSurrogate ssAffix
=
1589 Sig
.GetPiece(affixno
);
1591 CAffix
* pAffix
= analyzingSuffixes
1592 ? implicit_cast
<CAffix
*>(
1593 *Lexicon
->GetSuffixes() ^= ssAffix
)
1594 : implicit_cast
<CAffix
*>(
1595 *Lexicon
->GetPrefixes() ^= ssAffix
);
// First variant: the affix is costed with its existing use count plus the
// stems of this signature (presumably the branch where pAffix is non-null).
1598 const double ResponsibilityForThisAffixListing
=
1599 double(ssAffix
.GetLength()) * base2log(26) *
1600 GetNumberOfStems() /
1601 (double(GetNumberOfStems()) +
1602 pAffix
->GetUseCount());
1603 const double LengthOfPointerToThisAffix
=
1604 LogTotalNumberOfAnalyzedWords
-
1605 base2log(pAffix
->GetUseCount() +
1606 GetNumberOfStems());
1608 TotalResponsibilityForAffixListings
+=
1609 ResponsibilityForThisAffixListing
;
1610 LengthOfPointersToAllAffixesOfSig
+=
1611 LengthOfPointerToThisAffix
;
1613 sum
= ResponsibilityForThisAffixListing
+
1614 LengthOfPointerToThisAffix
;
1615 Lexicon
->LogFile (ssAffix
.Display(), pAffix
->GetUseCount(), GetNumberOfStems() + pAffix
->GetUseCount(), LengthOfPointerToThisAffix
, ResponsibilityForThisAffixListing
, sum
);
// Second variant: no existing use count enters the formulas, so this
// signature carries the full listing cost of the affix.
1619 const double ResponsibilityForThisAffixListing
=
1620 double(ssAffix
.GetLength()) * base2log(26);
1621 const double LengthOfPointerToThisAffix
=
1622 LogTotalNumberOfAnalyzedWords
-
1623 base2log(GetNumberOfStems());
1625 LengthOfPointersToAllAffixesOfSig
+=
1626 LengthOfPointerToThisAffix
;
1627 TotalResponsibilityForAffixListings
+=
1628 ResponsibilityForThisAffixListing
;
1629 sum
= ResponsibilityForThisAffixListing
+
1630 LengthOfPointerToThisAffix
;
1631 Lexicon
->LogFile(ssAffix
.Display(), 0, GetNumberOfStems(), LengthOfPointerToThisAffix
, ResponsibilityForThisAffixListing
, sum
);
1633 ThisNewSigDL
+= sum
;
1635 Lexicon
->LogFile("Total", 0, 0, LengthOfPointersToAllAffixesOfSig
, TotalResponsibilityForAffixListings
, ThisNewSigDL
);
1638 // Length of the pointers to the sig from its stems:
1639 double SavingsBecauseStemAlreadyExisted
= 0.0;
1640 double StemPointersToThisSig
;
// IterateThroughStems fills in the stem-related terms through its reference
// parameters.
1641 IterateThroughStems(NumberOfLettersShifted
,
1644 TotalDecreaseInDLDueToShorterStems
,
1645 LogTotalNumberOfAnalyzedWords
,
1646 StemPointersToThisSig
,
1647 SavingsBecauseStemAlreadyExisted
,
1649 const double ThisNewSigDL
=
1650 LengthOfPointersToAllAffixesOfSig
+
1651 TotalResponsibilityForAffixListings
+
1652 StemPointersToThisSig
+
1653 -SavingsBecauseStemAlreadyExisted
+
1654 -TotalDecreaseInDLDueToShorterStems
;
1655 AllNewSigsAnalysisDL
+= ThisNewSigDL
;
1656 Lexicon
->LogFile("Part 1: Length of pointer to affixes: ", LengthOfPointersToAllAffixesOfSig
);
1657 Lexicon
->LogFile("Part 2: Prorated responsibility for phonology of affixes: ", TotalResponsibilityForAffixListings
);
1658 Lexicon
->LogFile("Part 3: Stem pointers to this sig:", StemPointersToThisSig
);
1659 Lexicon
->LogFile("Length of 1 poitner to this sig: ", LengthOfPointerToThisSig
);
1660 Lexicon
->LogFile("Part 4: Total savings from stems that had already existed", SavingsBecauseStemAlreadyExisted
);
1661 Lexicon
->LogFile("Part 5: Total decrease in DL due to shorter stems: ", TotalDecreaseInDLDueToShorterStems
);
1662 Lexicon
->LogFile("Total DL: ", ThisNewSigDL
);
1664 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1666 QString("If we add %1 letters, total TD is %2")
1667 .arg(NumberOfLettersShifted
).arg(AllNewSigsAnalysisDL
) <<
1668 endl
<< "******" << endl
<<
// Keep the cheapest alternative seen so far.
1671 if (AllNewSigsAnalysisDL
< WinningDL
) {
1672 WinningDL
= AllNewSigsAnalysisDL
;
1673 WinningLengthOfStemToShift
= NumberOfLettersShifted
;
1678 if (WinningDL
!= CurrentDL
) {
1679 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1681 "Change signature from \"%1\" to \"%2\"")
1682 .arg(Display(), WinningSig
.Display('.'))) <<
1684 Lexicon
->AddToScreen(
1686 .arg(Display('.'), WinningSig
.Display('.')));
1687 return WinningLengthOfStemToShift
;
1689 if (Lexicon
->LogFileOn()) *Lexicon
->GetLogFile() <<
1691 "%1: Conclusion: Keep original signature.")
1697 // <<-------------------------------------------------------------------------------------------------------->>
// Helper of CheckOut. Walks m_StemPtrList for one candidate piece (pPiece,
// whose parameter declaration is missing from this listing) and reports
// through the reference parameters:
//   TotalDecreaseInDLDueToShorterStems - reset to 0 here, then increased by
//       (matching stems - shortened stems that pre-existed)
//       * NumberOfLettersShifted * log2(26);
//   SavingsBecauseStemAlreadyExisted   - reset to 0 here, then increased by
//       length * log2(26) for each shortened stem that Lexicon already holds
//       as a stem or as a word;
//   StemPointersToThisSig              - matching stems * pointer length,
//       where pointer length = LogTotalNumberOfAnalyzedWords
//       - log2(Size() * matching stems).
// NOTE(review): several lines (braces, else, some parameters) are omitted
// from this listing, so the nesting shown here must be confirmed.
1698 void CSignature::IterateThroughStems( int NumberOfLettersShifted
,
1699 CMiniLexicon
* Lexicon
,
1701 double& TotalDecreaseInDLDueToShorterStems
,
1702 double LogTotalNumberOfAnalyzedWords
,
1703 double& StemPointersToThisSig
,
1704 double& SavingsBecauseStemAlreadyExisted
,
1705 bool analyzingSuffixes
)
1711 int HowManyStemsForThisSig
= 0; //check that
1712 int NumberOfShortenedStemsThatPreExisted
= 0;
1713 double ThisSavingBecauseStemAlreadyExisted
= 0;
1714 double DecreaseInDLDueToShorterStems
= 0;
1715 double LengthOfPointerToThisSig
= 0;
// Both accumulators are cleared on entry, so values passed in by the caller
// are discarded.
1718 TotalDecreaseInDLDueToShorterStems
= 0;
1719 SavingsBecauseStemAlreadyExisted
= 0;
1721 Lexicon
->LogFile (pPiece
->Display() );
1722 Lexicon
->LogFileHeader( "Current stem", "Proposed stem", "Savings from preexisting stem");
1725 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
1727 pStem
= m_StemPtrList
->at(stemno
);
1728 ThisSavingBecauseStemAlreadyExisted
=0;
1729 int StemLength
= pStem
->GetKeyLength();
1730 ssNewStem
= pStem
->GetKey().Left(
1731 StemLength
- NumberOfLettersShifted
);
1733 if ( analyzingSuffixes
) // Suffixes
// Suffix case: the stem counts when its right edge equals the piece; the
// proposed stem is the key without its last NumberOfLettersShifted letters.
1735 if ( pStem
->GetKey().Right(NumberOfLettersShifted
).Display() == pPiece
->Display() )
1737 HowManyStemsForThisSig
++;
1738 Lexicon
->LogFile (pStem
->Display(), ssNewStem
.Display());
1742 Lexicon
->LogFile(pStem
->Display(), ssNewStem
.Display());
1745 ssNewStem
= pStem
->GetKey().Left( pStem
->GetKeyLength() - NumberOfLettersShifted
);
// Prefix case: the stem counts when its left edge equals the piece; the
// proposed stem is the key without its first NumberOfLettersShifted letters.
1749 if ( pStem
->GetKey().Left(NumberOfLettersShifted
).Display() == pPiece
->Display() )
1751 HowManyStemsForThisSig
++;
1757 ssNewStem
= pStem
->GetKey().Right( pStem
->GetKeyLength() - NumberOfLettersShifted
);
1762 if ( Lexicon
->GetStems()->Contains( ssNewStem
) || // ** Was: "GetStems_Suffixed
1763 Lexicon
->GetWords()->Contains( ssNewStem
) )
1765 NumberOfShortenedStemsThatPreExisted
++;
1766 ThisSavingBecauseStemAlreadyExisted
= ssNewStem
.GetLength() * base2log (26);
1767 SavingsBecauseStemAlreadyExisted
+= ThisSavingBecauseStemAlreadyExisted
;
1769 // ** Add the cost of having a pointer to the stem ******
// NOTE(review): this logging condition compares the right edge of the stem
// even when analyzingSuffixes is false; the prefix branch above compares the
// left edge. Confirm whether that is intended.
1773 if ( Lexicon
->LogFileOn() &&
1774 ( pStem
->GetKey().Right(NumberOfLettersShifted
).Display() == pPiece
->Display() ) )
1777 if ( ThisSavingBecauseStemAlreadyExisted
> 0)
1779 Lexicon
->LogFile("ThisSavingBecauseStemAlreadyExisted");
1782 Lexicon
->LogFile("none (did not exist)");
1788 DecreaseInDLDueToShorterStems
= ( HowManyStemsForThisSig
- NumberOfShortenedStemsThatPreExisted
) *
1789 NumberOfLettersShifted
* base2log (26);
1790 TotalDecreaseInDLDueToShorterStems
+= DecreaseInDLDueToShorterStems
;
// NOTE(review): when no stem matched, the argument of base2log below is 0.
// Confirm that callers never reach this with HowManyStemsForThisSig == 0.
1793 LengthOfPointerToThisSig
= LogTotalNumberOfAnalyzedWords
- base2log ( Size() * HowManyStemsForThisSig
) ;
1794 StemPointersToThisSig
= HowManyStemsForThisSig
* ( LengthOfPointerToThisSig
) ;
1795 if ( Lexicon
-> LogFileOn() )
1797 *Lexicon
->GetLogFile() << // FILL THIS IN --
1801 MakeTableHeader("Current stem") <<
1802 MakeTableHeader("Proposed stem") <<
1803 MakeTableHeader("Savings from preexisting stem") <<
// Scans pieces 1..m_PieceCount and tests each for a length below 1.
// NOTE(review): the statements taken for an empty piece and the final return
// are missing from this listing; presumably false and true respectively.
1810 bool CSignature::IsValid()
1811 // tests that pieces of the signature are all non-null
1812 { for (int affixno
= 1; affixno
<= m_PieceCount
; affixno
++) {
1813 if ( GetPiece(affixno
).GetLength() < 1 ) {
1819 // <<-------------------------------------------------------------------------------------------------------->>
// Removes pStem from m_StemPtrList. When the list is non-empty, contains the
// stem and the removal succeeds, the corpus count of this signature is
// lowered by the corpus count of the stem. Unless Parameter equals
// eDo_Not_Call_Words, every word in the word list of the stem is also removed
// from m_WordPtrList.
// NOTE(review): braces are missing from this listing, so it is not visible
// whether the word removal is nested inside the successful-removal branch.
1820 void CSignature::DetachStem(CStem
* pStem
, detachment_parameter Parameter
)
1822 if( !m_StemPtrList
->isEmpty() &&
1823 m_StemPtrList
->indexOf( pStem
) >= 0 &&
1824 m_StemPtrList
->remove( pStem
) )
1826 IncrementCorpusCount( -1 * pStem
->GetCorpusCount() );
1828 if( Parameter
!= eDo_Not_Call_Words
)
1831 for (int wordno
= 0; wordno
< pStem
->GetNumberOfWords(); wordno
++)
1832 { pWord
= pStem
->GetWordPtrList()->at(wordno
);
1833 m_WordPtrList
->removeOne( pWord
);
1838 // <<-------------------------------------------------------------------------------------------------------->>
// Not implemented: throws a local not_implemented object as its first
// statement. The removal of pWord from m_WordPtrList that follows is
// therefore never executed.
1839 void CSignature::DetachWord(CStem
* pWord
, enum detachment_parameter param
)
1841 struct not_implemented
{ };
1842 throw not_implemented();
// Unreachable while the throw above is in place.
1844 if( !m_WordPtrList
->empty() && m_WordPtrList
->indexOf( pWord
) >= 0 )
1845 m_WordPtrList
->removeOne(pWord
); //
1847 // Suppress a warning.
1848 static_cast<void>(param
);
1850 // <<-------------------------------------------------------------------------------------------------------->>
// Moves every stem of source into this signature: each stem gets this
// signature as its suffix list, is appended to the stem pointer list, and its
// corpus count is added to the count of this signature. Afterwards the stem
// pointer list of source is cleared; the corpus count of source is left
// unchanged (see the XXX remark at the end).
1851 void CSignature::TakeAllStems(CSignature
* source
)
1853 //QList<CStem*>& source_stems = *source->GetStemPtrList();
1855 for (int stemno
= 0; stemno
< source
->GetNumberOfStems(); stemno
++)
1857 pStem
=source
->GetStem(stemno
);;
1858 pStem
->SetSuffixList(this);
1859 AppendStemPtr(pStem
);
1860 IncrementCorpusCount(pStem
->GetCorpusCount());
1862 // Remove items from source.
1863 //Q_ASSERT(!source_stems.autoDelete());
1864 //source_stems.clear();
1865 source
->ClearStemPtrList();
1867 // XXX. Decrement source corpus count in turn?
1868 // Hard to tell, since there are no call sites.
1870 // <<-------------------------------------------------------------------------------------------------------->>
// Appends pWord to m_WordPtrList and adds its corpus count to the corpus
// count of this signature.
1871 void CSignature::AddWord (CStem
* pWord
)
1873 m_WordPtrList
->append (pWord
);
1874 IncrementCorpusCount (pWord
->GetCorpusCount() );
// Empties m_StemPtrList by calling clear() on it; nothing else is updated.
1877 void CSignature::ClearStemPtrList() { m_StemPtrList
->clear(); }
// Appends pWord to m_WordPtrList. Unlike AddWord, the corpus count is not
// touched here.
1878 void CSignature::AppendWordPointer(CStem
* pWord
) { m_WordPtrList
->append(pWord
); }
// Appends pPrefix to m_PrefixPtrList.
1879 void CSignature::AppendPrefixPtr(CPrefix
* pPrefix
) { m_PrefixPtrList
->append (pPrefix
);}
// Returns the number of entries in m_WordPtrList.
1880 int CSignature::GetNumberOfWords() const
1882 return m_WordPtrList
->count();
1885 // <<-------------------------------------------------------------------------------------------------------->>
1886 CParse
CSignature::CreateADeletingSignature( CParse
& Deletee
, CMiniLexicon
* Lexicon
)
1888 CStringSurrogate ssSuffix
;
1895 QString Null
= "NULL", lt_brak
= "<", rt_brak
= "<";
1898 Q_ASSERT (Deletee
.Size() == 1);
1900 for (int affixno
= 1; affixno
<= Size(); affixno
++)
1902 ssSuffix
= GetPiece(affixno
);
1903 if(NewSig
.GetSortStyle() != eAlphabetized
) NewSig
.Alphabetize();
1904 if ( ssSuffix
== Deletee
)
1906 NewSig
.Append ( CStringSurrogate(Null
.unicode(),0,Null
.length() ) );
1910 PSuffix
= CStringSurrogate(lt_brak
.unicode(),0,1);
1912 PSuffix
+= CStringSurrogate(rt_brak
.unicode(),0,1);
1913 PSuffix
.ClearParseStructure();
1914 PSuffix
+= ssSuffix
;
1915 NewSig
.Append ( PSuffix
.GetKey() );
1917 pSuffix
= *Lexicon
->GetSuffixes() << PSuffix
;
1919 QString line
= "<" + Deletee
.Display() + ">" + ssSuffix
.Display();
1920 Suffix
= CStringSurrogate( line
.unicode(),0,line
.length());
1922 NewSig
.Append (Suffix
.GetKey());
1923 // Lexicon->SetSuffixTranslation(this, ssSuffix, Suffix);
1930 // <<-------------------------------------------------------------------------------------------------------->>
// Removes pStem from m_StemPtrList and returns the result of that remove
// call. The corpus count and the word list are not adjusted here.
1931 bool CSignature::RemoveStem(CStem
* pStem
)
1933 return m_StemPtrList
->remove( pStem
);
1935 // <<-------------------------------------------------------------------------------------------------------->>
// Removes pWord from m_WordPtrList and returns the result of that remove
// call. The corpus count is not adjusted here.
1938 bool CSignature::RemoveWord(CStem
* pWord
)
1940 return m_WordPtrList
->remove( pWord
);
1942 // <<-------------------------------------------------------------------------------------------------------->>
1943 // copy out affixes, with null affix replaced with "NULL",
1944 // possibly with deletees marked with angle brackets
// Fills Output with one entry per piece of this signature. The suffix or the
// prefix collection of the owning signature collection is chosen from the
// affix location. Output is cleared first; a NULL piece is appended as
// TheStringNULL, and any other piece is looked up in the chosen collection
// and expanded through Express(Temp, bDisplayDeletees) of that affix.
// NOTE(review): the declaration of Temp, the call that receives the expanded
// affix and the return statement are missing from this listing.
1945 CSignature
& CSignature::Express(CSignature
& Output
, bool bDisplayDeletees
)
1947 CSuffixCollection
* Suffixes
= 0;
1948 CPrefixCollection
* Prefixes
= 0;
1949 if (!is_initial(GetAffixLocation()))
1950 Suffixes
= GetSignatureCollection()->GetMySuffixes();
1952 Prefixes
= GetSignatureCollection()->GetMyPrefixes();
1954 Output
.ClearParse();
1956 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1957 CStringSurrogate affix_text
= GetPiece(affixno
);
1959 if (affix_text
.IsNULL()) {
1960 Output
.Append(TheStringNULL
);
1963 if (!is_initial(m_AffixLocation
)) {
// The lookup result is only checked with Q_ASSERT before it is dereferenced.
1964 CSuffix
* suffix
= *Suffixes
^= affix_text
;
1965 Q_ASSERT(suffix
!= 0);
1969 suffix
->Express(Temp
, bDisplayDeletees
));
1971 CPrefix
* prefix
= *Prefixes
^= affix_text
;
1972 Q_ASSERT(prefix
!= 0);
1976 prefix
->Express(Temp
, bDisplayDeletees
));
1981 // <<-------------------------------------------------------------------------------------------------------->>
1982 /// concatenate affixes, separated by -.
// String variant: builds Outstring piece by piece, inserting a hyphen before
// every piece except the first. A NULL piece contributes TheStringNULL; any
// other piece is looked up in the prefix or suffix collection (chosen from
// the affix location) and contributes the Display() of its expanded form.
// NOTE(review): the declarations of Outstring and Temp and the return
// statement are missing from this listing.
1983 QString
CSignature::Express(bool bDisplayDeletees
)
1985 CSuffixCollection
* Suffixes
= 0;
1986 CPrefixCollection
* Prefixes
= 0;
1987 if (!is_initial(GetAffixLocation()))
1988 Suffixes
= GetSignatureCollection()->GetMySuffixes();
1990 Prefixes
= GetSignatureCollection()->GetMyPrefixes();
1993 for (int affixno
= 1; affixno
<= Size(); ++affixno
) {
1994 CStringSurrogate affix_text
= GetPiece(affixno
);
1996 if (affix_text
.IsNULL()) {
1997 if (!Outstring
.isEmpty())
1998 Outstring
.append('-');
1999 Outstring
.append(TheStringNULL
);
2003 if (is_initial(m_AffixLocation
)) {
2004 CPrefix
* prefix
= *Prefixes
^= affix_text
;
2005 Q_ASSERT(prefix
!= 0);
2006 if (!Outstring
.isEmpty())
2007 Outstring
.append('-');
2010 Outstring
.append(prefix
->Express(Temp
,
2011 bDisplayDeletees
).Display());
2013 CSuffix
* suffix
= *Suffixes
^= affix_text
;
2014 Q_ASSERT(suffix
!= 0);
2015 if (!Outstring
.isEmpty())
2016 Outstring
.append('-');
2019 Outstring
.append(suffix
->Express(Temp
,
2020 bDisplayDeletees
).Display());
2026 // <<-------------------------------------------------------------------------------------------------------->>
2028 // this should probably be replaced by ComputeDLofModel
2030 double CSignature::ComputeDL( int char_count )
2035 CStringSurrogate Affix;
2037 bool CORPUS_BASED_AFFIX_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedAffixCount", 0 );
2038 bool CORPUS_BASED_STEM_COUNT = m_pMyMini->GetIntParameter( "SignatureDL\\CorpusBasedStemCount", 1 );
2040 double stems_dl = 0.0,
2043 uint stem_total = 0,
2046 if( CORPUS_BASED_STEM_COUNT )
2048 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2050 stems_dl += ( (double) -1 ) * base2log( (double) pStem->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2055 for( pStem = m_StemPtrList->first(); pStem; pStem = m_StemPtrList->next() )
2057 stems_dl = ( (double) -1 ) * base2log( (double) pStem->GetWordPtrList()->count() / (double) m_pMyMini->GetWords()->GetCount() );
2061 bool analyzedSuffixes = TRUE;
2062 if( GetAffixLocation() == STEM_INITIAL || GetAffixLocation() == WORD_INITIAL ) analyzedSuffixes = FALSE;
2065 if( !CORPUS_BASED_AFFIX_COUNT )
2067 for( i = 1; i <= m_PieceCount; i++ )
2069 Affix = GetPiece(i);
2071 if( analyzedSuffixes )
2073 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2077 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2080 if( pAffix ) affix_total += pAffix->GetCorpusCount();
2084 for( i = 1; i <= m_PieceCount; i++ )
2086 Affix = GetPiece(i);
2088 if( analyzedSuffixes )
2090 pAffix = *m_pMyMini->GetSuffixes() ^= Affix;
2094 pAffix = *m_pMyMini->GetPrefixes() ^= Affix;
2099 if( CORPUS_BASED_AFFIX_COUNT ) affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) m_pMyMini->GetCorpusCount() );
2100 else affixes_dl += ( (double) -1 ) * base2log( (double) pAffix->GetCorpusCount() / (double) affix_total );
2104 return stems_dl + affixes_dl;
2107 // <<-------------------------------------------------------------------------------------------------------->>
2108 //====================================================================//
2109 // Description Length //
2110 //====================================================================//
// Returns m_DLofMyAffixPointers, computing it first when the member is still
// 0: the sum of GetLengthOfPointerToMe() over the suffix pointer list, or
// over the prefix pointer list when the affix location is STEM_INITIAL or
// WORD_INITIAL. A computed total of exactly 0 is recomputed on the next call
// because 0 doubles as the not-yet-computed marker.
2111 double CSignature::GetDLofMyAffixPointers( )
2113 if (m_DLofMyAffixPointers
== 0)
2115 bool analyzedSuffixes
= TRUE
;
2118 if( GetAffixLocation() == STEM_INITIAL
|| GetAffixLocation() == WORD_INITIAL
) analyzedSuffixes
= FALSE
;
2119 if (analyzedSuffixes
)
2121 for (int suffixno
= 0; suffixno
< GetSuffixPtrList()->size(); suffixno
++)
2122 { pSuffix
= GetSuffixPtrList()->at(suffixno
);
2123 m_DLofMyAffixPointers
+= pSuffix
->GetLengthOfPointerToMe ();
2128 for (int prefixno
= 0; prefixno
< GetPrefixPtrList()->size(); prefixno
++)
2130 pPrefix
= GetPrefixPtrList()->at(prefixno
);
2131 m_DLofMyAffixPointers
+= pPrefix
->GetLengthOfPointerToMe ();
2135 return m_DLofMyAffixPointers
;
2137 // <<-------------------------------------------------------------------------------------------------------->>
// Returns m_DLofMyStemPointers, computing it first when the member is still
// 0 as the sum of GetLengthOfPointerToMe() over all stems of this signature.
2138 double CSignature::GetDLofMyStemPointers()
2140 if (m_DLofMyStemPointers
== 0)
2143 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
2145 pStem
= GetStem(stemno
);
2146 m_DLofMyStemPointers
+= pStem
->GetLengthOfPointerToMe ();
2149 return m_DLofMyStemPointers
;
2151 // <<-------------------------------------------------------------------------------------------------------->>
// Returns the model description length of this signature, defined here as
// (stem pointer total) + (affix pointer total).
// The int parameter is unnamed and ignored (see the inline comment in the
// signature).
// Both totals are obtained through the Get...() accessors, which only
// recompute while the cached member is 0; a non-zero cached value is
// therefore returned unchanged rather than refreshed.
2152 double CSignature::ComputeDLofModel(int /* char_count, not used */)
2154 // XXX. take SignatureDL\CorpusBased{Stem,Affix}Count parameters
2157 m_DLofMyStemPointers
= GetDLofMyStemPointers();
2158 m_DLofMyAffixPointers
= GetDLofMyAffixPointers();
2159 return m_DLofMyStemPointers
+ m_DLofMyAffixPointers
;
2161 // <<-------------------------------------------------------------------------------------------------------->>
// Recomputes and returns m_DLofMyCorpus for the words in *m_WordPtrList.
// For each word the cost is
//     corpus count of the word * (stem pointer length + affix pointer length)
// where stem, affix and word are looked up in this signature's mini-lexicon
// (m_pMyMini) with the collections' `^=` lookup operator.
// Unlike the pointer-length getters, the member is reset to 0.0 first, so
// every call recomputes from scratch.
// NOTE(review): brace-only lines and the condition guarding the diagnostic
// block are not visible in this excerpt; what follows the diagnostics
// (skip the word, or fall through) cannot be told from here.
2162 double CSignature::ComputeDLofMyCorpus()
2164 using linguistica::implicit_cast
;
2169 m_DLofMyCorpus
= 0.0;
2170 foreach (CStem
* pWord
, *m_WordPtrList
) {
// Find the stem entry that corresponds to this word's stem text.
2171 CStringSurrogate stem_text
= pWord
->GetStem();
2172 CStem
* stem
= *m_pMyMini
->GetStems() ^= stem_text
;
// Diagnostics written to std::cout; judging by the message text this is
// presumably reached only when the stem lookup returned null.
2177 std::cout
<< "NULL stem -- in CSignature::ComputeDLofMyCorpus() "<< std::endl
;
2178 std::cout
<< " word: "<<pWord
->Display().toStdString()<< std::endl
;
2179 std::cout
<< " stem: "<< stem_text
.Display().toStdString()<<std::endl
;
2180 CStringSurrogate afx_str
2181 = (is_initial(m_AffixLocation
) ? pWord
->GetPrefix() : pWord
->GetSuffix());
2182 std::cout
<< " affix:"<< afx_str
.Display().toStdString() << std::endl
;
2183 std::cout
<< std::endl
;
// Pick the word's prefix or suffix depending on where this signature's
// affixes attach; an empty affix is replaced by TheStringNULL before lookup.
2188 CStringSurrogate affix_text
= is_initial(m_AffixLocation
)
2189 ? pWord
->GetPrefix()
2190 : pWord
->GetSuffix();
2191 if (affix_text
.GetLength() == 0)
2192 affix_text
= TheStringNULL
;
// implicit_cast gives both branches of the conditional the common type CAffix*.
2194 CAffix
* affix
= is_initial(m_AffixLocation
)
2195 ? implicit_cast
<CAffix
*>(
2196 *m_pMyMini
->GetPrefixes() ^= affix_text
)
2197 : implicit_cast
<CAffix
*>(
2198 *m_pMyMini
->GetSuffixes() ^= affix_text
);
// NOTE(review): affix and word are dereferenced below with no null check
// visible in this excerpt.
2200 CStem
* word
= *m_pMyMini
->GetWords() ^= pWord
;
2201 const double ThisWordDL
=
2202 stem
->GetLengthOfPointerToMe() +
2203 affix
->GetLengthOfPointerToMe();
2204 m_DLofMyCorpus
+= word
->GetCorpusCount() * ThisWordDL
;
2206 return m_DLofMyCorpus
;
2208 // <<-------------------------------------------------------------------------------------------------------->>
2211 /// Get the corpus counts of each suffix with this stem
// Fills a freshly allocated int array with, for each suffix piece i (pieces
// are 1-based, the array is 0-based), the corpus count of the word formed by
// the stem's display text followed by that suffix.
// NOTE(review): this function is presumably disabled (commented out or
// #if'd out) in the original file; the markers, the brace lines, the handling
// of the "NULL" suffix and the return statement are not visible here.
// NOTE(review): `output` is released with scalar `delete`, but the buffer
// created on the next line comes from `new int[...]`; if a previously returned
// buffer is ever passed back in, `delete[]` is the matching form.
// NOTE(review): the array is sized with GetNumberOfSuffixes() while the loop
// runs to GetSuffixList()->Size() -- presumably the same number; confirm.
2212 int* GetSuffixCounts(CStem* stem, int* output)
2214 if (output) delete output; // error if this occurs.
2215 output = new int[ stem->GetNumberOfSuffixes() ];
2217 for (int i = 1; i <= stem->GetSuffixList()->Size(); ++i) {
2218 QString Suffix = stem->GetSuffixList()->GetPiece(i).Display();
2219 if (Suffix == "NULL")
2221 QString Word = stem->Display() + Suffix;
2222 CStem* pWord = *stem->GetMyMini()->GetWords() ^=
2223 CStringSurrogate(Word);
// NOTE(review): no null check on pWord is visible before this dereference.
2225 output[i-1] = pWord->GetCorpusCount();
2231 //the output is a vector of integers, whose length is
2232 // the number of stems times the number of suffixes. Pass it
2233 // an int pointer that points to NULL; it will delete the memory
2234 // that this function creates.
// Layout of the result: row-major by stem, i.e. the count for stem s and
// affix a is stored at output[s * GetNumberOfAffixes() + a].
// NOTE(review): this function is unfinished -- it contains the free-text
// marker "NOT FINISHED YET" and the broken identifier "pSt em", neither of
// which compiles. It is presumably disabled in the original file; the
// disabling markers, the declarations of temp / pStem and the return
// statement are not visible in this excerpt.
// NOTE(review): as in GetSuffixCounts, `output` is freed with scalar `delete`
// although it is allocated with `new int[...]`.
2235 int* CSignature::GetIndividualCountsForEachStem (int* output )
2237 int affixno, stemno;
2241 if (output) delete output; //if this occurs, it's an error.
2242 output = new int [GetNumberOfStems() * GetNumberOfAffixes() ];
2244 CMiniLexicon* pMiniLexicon = GetLexicon();
2245 NOT FINISHED YET _--- use GETaWord -- JG
2246 for (stemno = 0; stemno < m_StemPtrList->size(); stemno++)
2248 pSt em = m_StemPtrList->at(stemno);
// temp is handed back to GetSuffixCounts on every iteration, which frees the
// previous buffer and allocates a new one.
2249 temp = GetSuffixCounts(pStem, temp);
2250 for (affixno = 0; affixno < GetNumberOfAffixes(); affixno++)
2252 output[stemno * GetNumberOfAffixes() + affixno] = temp[affixno];
2261 //===================================================================================================//
2263 // Description length
2265 //===================================================================================================//
// Returns StemTotal + SuffixTotal, where
//   StemTotal   = sum of GetLengthOfPointerToMe_2() over m_StemPtrList, and
//   SuffixTotal = sum of GetLengthOfPointerToMe() over the suffix entries
//                 found for each piece 1..GetNumberOfAffixes() of this
//                 signature.
// Only the mini-lexicon's suffix collection is consulted; there is no prefix
// branch in the visible code.
// NOTE(review): the declarations of pStem, ssSuffix and pSuffix are not
// visible in this excerpt, and pSuffix is dereferenced without a visible null
// check on the lookup result.
2266 double CSignature::GetSumOfDLofInternalPointers()
2269 double StemTotal
= 0, SuffixTotal
= 0;
2273 CSuffixCollection
& Suffixes
= *m_pMyMini
->GetSuffixes();
2274 for (int stemno
= 0; stemno
< m_StemPtrList
->size(); stemno
++)
2276 pStem
= m_StemPtrList
->at(stemno
);
2277 StemTotal
+= pStem
->GetLengthOfPointerToMe_2 ();
// Signature pieces are addressed 1-based.
2280 for (int affixno
= 1; affixno
<= GetNumberOfAffixes(); affixno
++)
2282 ssSuffix
= GetPiece(affixno
);
2283 pSuffix
= Suffixes
^= ssSuffix
;
2284 SuffixTotal
+= pSuffix
->GetLengthOfPointerToMe();
2286 return StemTotal
+ SuffixTotal
;
2288 // <<-------------------------------------------------------------------------------------------------------->>
// Setter: stores L in m_LengthOfPointerToMe.
2290 void CSignature::SetLengthOfPointerToMe(double L
)
2292 m_LengthOfPointerToMe
= L
;
2296 // <<-------------------------------------------------------------------------------------------------------->>
// Appends the given parse to this signature's m_SatelliteAffixes.
// The parameter is named `suffix`, but nothing in the visible code restricts
// it to suffixes.
2298 void CSignature::AppendSatelliteAffix(CParse
& suffix
)
2301 m_SatelliteAffixes
.Append(suffix
);
2304 //===================================================================================================//
2308 //===================================================================================================//
// Intended to decide whether this signature generalizes *pSig by aligning the
// affixes of the two signatures (see the numbered plan and the worked tables
// below).
// As written, the first statements define a local tag type and throw it, so
// callers get a `not_implemented` exception; with no condition visible around
// the throw, everything after it is an unreachable draft.
// NOTE(review): many lines of the draft (brace lines, `return`s, `continue`s,
// the declarations of `match` and `ThisRow`, loop increments) are not visible
// in this excerpt, so the comments below describe only what can be seen.
2309 bool CSignature::Generalizes(CSignature
* pSig
)
2311 struct not_implemented
{ };
2312 throw not_implemented();
2314 // 1. Check they have the same length; find which one is longer.
2315 // 2. Go from longest to shortest pieces of the longer signature:
2316 // look for unambiguous correspondents in the other signature, and
2317 // put those pairs of corresponding affixes in some structure.
2318 // 3. After unambiguous cases, deal with ambiguous cases, if any exist.
2319 // 4. Find alignment
2321 // ed |NULL | NULL | ed |
2322 // ing|NULL | NULL | ing |
2323 // es |e | NULL | s |
2324 // e |e | NULL | NULL|
2327 // ed |e | <e> | ed |
2328 // ing|e | <e> | ing |
2329 // es |e | NULL | s |
2330 // e |e | NULL | NULL|
2333 // ien |ien | NULL | NULL |
2334 // ienne |ienn | NULL | e |
2335 // iens |ien | NULL | s |
2336 // iennes |ienn | NULL | es |
2338 // ien |ien | NULL | NULL |
2339 // ienne |ien | n | e |
2340 // iens |ien | NULL | s |
2341 // iennes |ien | n | es |
2343 CSignature
* LongerSig
, *ShorterSig
;
// Step 1: the two signatures must have the same number of affixes; the one
// with the longer key becomes LongerSig. What happens on a size mismatch or
// when dif == 0 is on lines not visible here.
2352 if (Size() != pSig
->Size())
2355 const int dif
= GetKeyLength() - pSig
->GetKeyLength();
2357 LongerSig
= this; ShorterSig
= pSig
;
2358 } else if (dif
== 0) {
2361 LongerSig
= pSig
; ShorterSig
= this;
// Upper bound (exclusive) for the loop counter m in the two collection loops.
2364 const int MAXAFFIXSIZE
= 10;
2366 QStringList ShorterSigPieces
;
2369 // Copy the affixes of ShorterSig,
2370 // from shortest to longest
2371 // onto the list ShorterSigPieces.
2372 if (ShorterSig
->ContainsNULL())
2373 ShorterSigPieces
.append(TheStringNULL
);
2374 for (int m
= 1; m
< MAXAFFIXSIZE
&&
2375 ShorterSigPieces
.count() < ShorterSig
->Size();
// In the test below m is used both as the piece index and as the length that
// piece is compared against.
2377 // XXX. this test makes no sense
2378 if (ShorterSig
->ThisPieceLength(m
) == m
)
2379 ShorterSigPieces
.prepend(
2380 ShorterSig
->GetPiece(m
).Display());
2382 Q_ASSERT(ShorterSigPieces
.count() == ShorterSig
->Size());
2385 QStringList LongerSigPieces
;
2387 // Copy the affixes of LongerSig,
2388 // from shortest to longest
2389 // onto the list LongerSigPieces.
2390 if (LongerSig
->ContainsNULL())
2391 LongerSigPieces
.append(TheStringNULL
);
2392 for (int m
= 1; m
< MAXAFFIXSIZE
&&
2393 LongerSigPieces
.count() < LongerSig
->Size();
2395 if (LongerSig
->ThisPieceLength(m
) == m
)
2396 LongerSigPieces
.prepend(
2397 LongerSig
->GetPiece(m
).Display());
2398 Q_ASSERT(LongerSigPieces
.count() == LongerSig
->Size());
// Steps 2-3: for every affix of the shorter signature, scan the affixes of
// the longer one for those that end in it (right-hand substring comparison).
2401 CStringSurrogate ssIng
, ssTing
;
2402 foreach (QString shortersig_piece
, ShorterSigPieces
) {
2404 CStringSurrogate
short_affix(shortersig_piece
);
2406 foreach (QString longersig_piece
, LongerSigPieces
) {
2408 CStringSurrogate
long_affix(longersig_piece
);
2409 if (long_affix
.IsNULL())
2411 if (short_affix
!= long_affix
.Right(
2412 short_affix
.GetLength()))
// A match is unambiguous when no earlier match was recorded (`match` is
// declared on a line not visible here).
2414 bool unambiguous_match
= !match
;
2418 if (!unambiguous_match
)
// Record the aligned row; Extension is the part of the long affix that
// precedes the shared ending.
2423 long_affix
.Display();
2424 ThisRow
.ShortAffix
=
2425 short_affix
.Display();
2426 ThisRow
.Extension
= long_affix
.Left(
2427 long_affix
.GetLength() -
2428 short_affix
.GetLength())
2430 // XXX. use ThisRow...
2431 static_cast<void>(ThisRow
);
2436 // <<-------------------------------------------------------------------------------------------------------->>
2437 // <<-------------------------------------------------------------------------------------------------------->>
// Re-cuts the words covered by this signature so that each one is split into
// exactly the stem and affix this signature pairs.
// For every stem and every non-NULL affix piece, the corresponding word is
// looked up in the lexicon's word collection, cut at
// (word key length - stem key length), and its stem/prefix locations are set.
// The first pair of loops handles affix-initial signatures
// (is_initial(GetAffixLocation())), the second pair the suffixal case.
// NOTE(review): brace lines, the `else`, the declaration of `stem`, the
// statements executed for NULL affixes (presumably `continue`) and the lines
// between the `Size() > 1` test and the LogFile call are not visible in this
// excerpt.
2438 void CSignature::CutMyWordsAsIDeclare()
2441 if ( is_initial (GetAffixLocation()) )
2443 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
2444 stem
= GetStem(stemno
);
2446 // For each prefix in signature:
2447 for (int prefixno
= 1; prefixno
<= Size(); ++prefixno
) {
2448 CStringSurrogate prefix
= GetPiece(prefixno
);
2450 prefix
.SetBackwards(false);
2451 if (prefix
.IsNULL())
2452 // NULL + stem prefix needs no cut
2455 // get correspond word
2456 CParse word_text
= prefix
+ stem
->GetKey();
2457 CStem
* word
= *GetLexicon()->GetWords() ^= word_text
;
// Q_ASSERT is the only guard visible here before word is dereferenced.
2458 Q_ASSERT(word
!= 0);
// The word already has more than one piece, i.e. it was cut before.
2460 if (word
->Size() > 1 )
2463 GetLexicon()->LogFile ("", "", word
->GetKey().Display());
// word = prefix + stem, so this difference is the prefix length.
2466 const int cut_point
= word
->GetKeyLength() - stem
->GetKeyLength();
2467 word
->CutRightBeforeHere(cut_point
);
2468 word
->SetStemLoc(2);
2469 word
->SetPrefixLoc(1);
2470 //m_pLexicon->UpdateWord(word);
// Suffixal case (presumably the else-branch of the is_initial test).
2476 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++) {
2477 stem
= GetStem(stemno
);
2479 // For each affix in signature:
2480 for (int suffixno
= 1; suffixno
<= Size(); ++suffixno
) {
2481 CStringSurrogate suffix
= GetPiece(suffixno
);
2483 suffix
.SetBackwards(false);
2484 if (suffix
.IsNULL())
2485 // stem + NULL suffix needs no cut
2488 // get correspond word
2489 CParse word_text
= stem
->GetKey() + suffix
;
2490 CStem
* word
= *GetLexicon()->GetWords() ^= word_text
;
2491 Q_ASSERT(word
!= 0);
2493 if (word
->Size() > 1 )
2496 GetLexicon()->LogFile ("", "", word
->GetKey().Display());
// NOTE(review): word = stem + suffix here, so this difference is the suffix
// length, whereas the stem/suffix boundary sits at the stem length. Whether
// that is intended depends on how CutRightBeforeHere interprets its argument
// -- verify; it looks copied from the prefix branch.
2499 const int cut_point
= word
->GetKeyLength() - stem
->GetKeyLength();
2500 word
->CutRightBeforeHere(cut_point
);
2501 word
->SetStemLoc(1);
2502 //m_pLexicon->UpdateWord(word);
// Writes this signature to `outf` as a set of xfst-style definitions:
//   - "# ..." header comments (index, display form, mentor-list size,
//     robustness),
//   - "define STEM<count> [ {stem} | {stem} ... ];" built from this
//     signature's stems plus the stems of every signature in GetMentorList(),
//   - "define SUF<count> [ {suffix} ... ];",
//   - "define SIG<count> STEM<count> SUF<count>;" and "push SIG<count>",
//   - and finally every stem/suffix combination as "## ..." comment lines.
// `count` is only used as the numeric tag in those names.
// NOTE(review): brace lines, `else` lines, the declaration of `stems`, the
// `return` that presumably follows "Has mentor: skipping", and the separators
// emitted between suffixes are not visible in this excerpt.
2508 void CSignature::OutputSignatureXfst( QTextStream
& outf
, int count
)
2516 outf
<< "# " << count
<< ": " << Display('.', m_pMyMini
->GetOutFilter()) << endl
;
2517 if (this->GetMentorList()->count() > 0)
2518 outf
<< "# MentorList() size: " << this->GetMentorList()->count() << endl
;
2520 outf
<< "# No MentorList() items" << endl
;
2522 outf
<< "# robustness: " << m_Robustness
<< endl
;
// A signature that has a mentor is not written out itself.
2524 if( GetMentor()!=NULL
)
2526 outf
<< "# Has mentor: skipping" << endl
;
2530 outf
<< "define STEM" << count
<< " "; // << " \\" << endl;
// Collect this signature's own stems (plain Display(), no output filter).
2534 for (int i
= 0; i
< this->GetNumberOfStems(); i
++)
2536 stems
.append( this->GetStem(i
)->Display() );
// NOTE(review): the following "expanded suffix" section (down to the next
// "add stems from child sigs" comment) is presumably commented out in the
// original file; the comment markers are not visible in this excerpt.
2539 // add stems from child sigs
2541 for (int z = 0; z < this->GetMentorList()->size(); z++)
2543 CSignature * qSig = this->GetMentorList()->at(z);
2545 QStringList qSufList;
2546 for (int i = 0; i < qSig->GetNumberOfAffixes(); i++)
2547 qSufList.append(qSig->GetSuffix(i)->Display());
2549 //generate new words here:
2550 for (int i = 0; i < this->GetNumberOfAffixes(); i++)
2553 CSuffix* pSuf = this->GetSuffix(i);
2554 QString sufStr = pSuf->Display( 0 );//, m_pMyMini->GetOutFilter() );
2555 if ( !qSufList.contains(sufStr) )
2557 outf<< "#### Suffix to be expanded: "<< sufStr << endl;
2558 for (int j = 0; j < qSig->GetNumberOfStems(); j++)
2560 QString stemStr = qSig->GetStem(j)->Display();
2561 if (sufStr.compare("NULL") == 0)
2562 outf << "### "<< stemStr << endl;
2564 outf << "### "<< stemStr << " " << sufStr << endl;
2570 // add stems from child sigs
// Stems of mentored signatures are displayed through the mini-lexicon's
// output filter, unlike this signature's own stems above.
2571 for (int z
= 0; z
< this->GetMentorList()->size(); z
++)
2573 CSignature
* qSig
= this->GetMentorList()->at(z
);
2574 for (int i
= 0; i
< qSig
->GetNumberOfStems(); i
++)
2576 stems
.append( qSig
->GetStem(i
)->Display( 0, m_pMyMini
->GetOutFilter() ) );
// Emit the stem alternation: first stem after "[", the rest after "|".
// NOTE(review): *strIt is read right after begin() with no visible check that
// `stems` is non-empty, and the step that advances past the first stem
// before the loop is not visible here.
2583 QStringList::Iterator strIt
= stems
.begin();
2584 outf
<< "[ {" << *strIt
<< "} ";
2588 for( ; strIt
!= stems
.end(); ++strIt
)
2595 outf
<< "| {" << *strIt
<< "} ";
2599 outf
<< "]; "<<endl
;
2600 outf
<< "define SUF" << count
<< " [ ";
2601 QStringList suffixes
;
// Emit the suffix alternation; "NULL" gets special treatment on lines not
// visible in this excerpt.
2604 for (int i
= 0; i
< this->GetNumberOfAffixes(); i
++)
2606 CSuffix
* pSuffix
= this->GetSuffix(i
);
2611 QString str
= pSuffix
->Display( 0 );
2612 if (str
.compare("NULL") == 0)
2615 outf
<< " {" << str
<< "} ";
2618 outf
<< "];" << endl
;
2620 outf
<< "define SIG" << count
<< " STEM" << count
<< " SUF"<< count
<< ";" << endl
;
2622 outf
<< "push SIG"<< count
<< endl
;
2624 /* TEMP SOLN: now write cross product in comments */
// One "## <stem><suffix>" line per combination; the NULL suffix prints the
// bare stem.
2625 for ( QStringList::Iterator strIt
= stems
.begin() ; strIt
!= stems
.end(); ++strIt
)
2627 //QList<CSuffix*>::iterator suffix_it = m_SuffixPtrList->begin();
2629 //while ( (pSuffix = *suffix_it) != 0 )
2631 for (int i
= 0; i
< this->GetNumberOfAffixes(); i
++)
2633 CSuffix
* pSuffix
= this->GetSuffix(i
);
2634 QString str
= pSuffix
->Display( 0 );//, m_pMyMini->GetOutFilter() );
2635 if (str
.compare("NULL") == 0)
2636 outf
<< "## "<< *strIt
<< endl
;
2638 outf
<< "## "<< *strIt
<< str
<< endl
;
2645 //--------------------------------------------------------------------------//
// For every stem of this signature, rebuilds the surface form of each
// stem/affix combination (an affix displayed as "NULL" contributes the empty
// string), looks that word up in the lexicon's word collection and passes the
// result to AppendWordPointer().
// The switch on m_AffixLocation selects between the suffix loop
// (word = stem + suffix) and the prefix loop (word = prefix + stem).
// NOTE(review): the `case` labels, `break`s and brace lines are not visible in
// this excerpt, and despite the function name no stem-pointer update is
// visible either. The lookup result is appended without a visible null check.
2646 void CSignature::RecalculateStemAndWordPointers()
2647 //--------------------------------------------------------------------------//
2650 for (int stemno
= 0; stemno
< GetNumberOfStems(); stemno
++)
2652 QString stem
= GetStem(stemno
)->Display();
2653 switch (m_AffixLocation
)
// Suffixal signatures: word = stem + suffix.
2657 for (int suffixno
= 0; suffixno
< GetNumberOfAffixes(); suffixno
++)
2659 QString suffix
= GetSuffix(suffixno
)->Display();
2660 if (suffix
== "NULL") suffix
= "";
2661 QString word
= stem
+ suffix
;
2662 CStem
* pWord
= *GetLexicon()->GetWords() ^= word
;
2663 AppendWordPointer( pWord
);
// Prefixal signatures: word = prefix + stem.
2668 for (int prefixno
= 0; prefixno
< GetNumberOfAffixes(); prefixno
++)
2670 QString prefix
= GetPrefix(prefixno
)->Display();
2671 if (prefix
== "NULL") prefix
= "";
2672 QString word
= prefix
+ stem
;
2673 CStem
* pWord
= *GetLexicon()->GetWords() ^= word
;
2674 AppendWordPointer(pWord
);
2677 } // end of stemno loop
2679 //--------------------------------------------------------------------------//