1 // Implementation of CSignatureCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "SignatureCollection.h"
12 #include "ui/Status.h"
13 #include "MiniLexicon.h"
15 #include "Allomorphy.h"
16 #include "Signature.h"
20 #include "PrefixCollection.h"
21 #include "SuffixCollection.h"
22 #include "StemCollection.h"
25 class CSignatureAlignment
;
26 // <<-------------------------------------------------------------------------------------------------------->>
// Default constructor.
// Labels the collection "Signatures" and resets the cached description
// length of the pointers to its members to zero.
27 CSignatureCollection::CSignatureCollection()
31 m_MemberName
= "Signatures";
34 m_DLofPointersToMyMembers
=0;
// Constructs a signature collection attached to a mini-lexicon.
// Takes the owning lexicon and the signature type (affix location) from
// m_pMiniLex, labels the collection "Signatures" and zeroes the cached
// description length of pointers to members.
// NOTE(review): presumably m_pMiniLex is initialised from Lex before these
// statements run -- confirm.
36 CSignatureCollection::CSignatureCollection( CMiniLexicon
* Lex
)
// m_pLexicon is only taken from the mini-lexicon when one is present.
39 if( m_pMiniLex
) m_pLexicon
= m_pMiniLex
->GetLexicon();
40 m_MemberName
= "Signatures";
// NOTE(review): m_pMiniLex is dereferenced unconditionally here although the
// statement above treats it as possibly null -- confirm a null mini-lexicon
// cannot reach this constructor.
41 m_SignatureType
= m_pMiniLex
->GetAffixLocation();
44 m_DLofPointersToMyMembers
=0;
// Constructs a signature collection attached to a mini-lexicon and a suffix
// collection.
//   Lex      - mini-lexicon (NOTE(review): presumably stored in m_pMiniLex
//              before these statements run -- confirm)
//   suffixes - suffix collection remembered in MySuffixes
//   AfLoc    - affix location stored as the signature type
47 CSignatureCollection::CSignatureCollection (CMiniLexicon
* Lex
, CSuffixCollection
* suffixes
, eAffixLocation AfLoc
)
// m_pLexicon is only taken from the mini-lexicon when one is present.
50 if( m_pMiniLex
) m_pLexicon
= m_pMiniLex
->GetLexicon();
51 m_MemberName
= "Signatures";
52 MySuffixes
= suffixes
;
53 m_SignatureType
= AfLoc
;
55 m_DLofPointersToMyMembers
=0;
// Constructs a signature collection attached to a mini-lexicon and a prefix
// collection.
//   Lex      - mini-lexicon (NOTE(review): presumably stored in m_pMiniLex
//              before these statements run -- confirm)
//   Prefixes - prefix collection remembered in MyPrefixes
//   AfLoc    - affix location stored as the signature type
58 CSignatureCollection::CSignatureCollection (CMiniLexicon
* Lex
, CPrefixCollection
* Prefixes
, eAffixLocation AfLoc
)
// m_pLexicon is only taken from the mini-lexicon when one is present.
61 if( m_pMiniLex
) m_pLexicon
= m_pMiniLex
->GetLexicon();
62 m_MemberName
= "Signatures";
63 MyPrefixes
= Prefixes
;
64 m_SignatureType
= AfLoc
;
66 m_DLofPointersToMyMembers
=0;
// Constructs a signature collection that only knows its signature type.
//   SigType - affix location stored as the signature type
// Also labels the collection "Signatures" and zeroes the cached description
// length of pointers to members.
69 CSignatureCollection::CSignatureCollection (eAffixLocation SigType
)
73 m_SignatureType
= SigType
;
74 m_MemberName
= "Signatures";
77 m_DLofPointersToMyMembers
=0;
80 // <<-------------------------------------------------------------------------------------------------------->>
// Destructor.
// Walks every member signature and, depending on m_SignatureType, asks the
// lexicon to forget it through RemovePrefixSig() or RemoveSuffixSig().
// NOTE(review): m_pLexicon is dereferenced in the statements below; confirm it
// is checked for null first, since the constructors only set it when a
// mini-lexicon is present.
81 CSignatureCollection::~CSignatureCollection()
87 for( int signo
= 0; signo
< GetCount(); signo
++ )
92 switch( m_SignatureType
)
96 m_pLexicon
->RemovePrefixSig( pSig
);
101 m_pLexicon
->RemoveSuffixSig( pSig
);
108 //==============================================================================================//
113 //==============================================================================================//
// Lookup by string.
//   Signature - textual form of the signature to look for
// Returns the stored CSignature whose key equals Signature, or NULL when the
// string is empty or the lookup takes the else branch below.
115 CSignature
* CSignatureCollection::operator^= (QString Signature
) //" Lookup"
// An empty key can never match.
117 if( Signature
.length() < 1 ) return NULL
;
// Wrap the whole string in a surrogate and search the collection for it.
119 CNode
*pNode
= Find1 ( CStringSurrogate( Signature
.unicode(),0,Signature
.length() ) );
122 return (CSignature
*) pNode
->Get_T_Pointer();
124 else { return NULL
; }
// Lookup by parse (reference overload).
//   Parse - parse whose '.'-separated display form is used as the key
// Returns the stored CSignature with that key, or NULL when the parse is
// empty or the lookup takes the else branch below.
127 CSignature
* CSignatureCollection::operator^= (CParse
& Parse
) //" Lookup"
129 // Return NULL if parse is empty.
130 if( Parse
.GetKeyLength() < 1 || Parse
.Size() < 1 ) return NULL
;
// The key is the parse rendered with '.' between its pieces.
134 QString display
= Parse
.Display('.');
135 CNode
*pNode
= Find1 ( CStringSurrogate( display
.unicode(),0,display
.length() ) );
138 return (CSignature
*) pNode
->Get_T_Pointer();
140 else { return NULL
; }
143 // <<-------------------------------------------------------------------------------------------------------->>
// Lookup by parse (pointer overload).
//   pParse - parse to look up; it is alphabetized in place before its
//            '.'-separated display form is used as the key
// Returns the stored CSignature with that key, or NULL when the parse is
// empty or the lookup takes the else branch below.
// NOTE(review): pParse is dereferenced without a null check.
144 CSignature
* CSignatureCollection::operator^= (CParse
* pParse
) //" Lookup"
146 // Return NULL if parse is empty.
147 if( pParse
->GetKeyLength() < 1 || pParse
->Size() < 1 ) return NULL
;
// Side effect on the caller's object: the parse is reordered.
149 pParse
->Alphabetize();
151 QString display
= pParse
->Display('.');
152 CNode
*pNode
= Find1 ( CStringSurrogate( display
.unicode(),0,display
.length() ) );
155 return (CSignature
*) pNode
->Get_T_Pointer();
157 else { return NULL
; }
159 // <<-------------------------------------------------------------------------------------------------------->>
// Lookup by string surrogate.
//   Signature - key to search for, passed to Find1() unchanged
// Returns the stored CSignature with that key, or NULL when the key is empty
// or the lookup takes the else branch below.
160 CSignature
* CSignatureCollection::operator^= (CStringSurrogate
& Signature
) //" Lookup"
162 // Return NULL if Signature is empty.
163 if( Signature
.GetLength() < 1 ) return NULL
;
165 CNode
*pNode
= Find1 ( Signature
);
168 return (CSignature
*) pNode
->Get_T_Pointer();
170 else { return NULL
; }
174 // <<-------------------------------------------------------------------------------------------------------->>
// Insert by parse.
//   pParse - parse describing the signature; alphabetized in place, then its
//            '.'-separated display form becomes the key
// Inserts the key, creates a new CSignature from pParse for it or fetches the
// signature already stored at that node, links the signature to its prefix or
// suffix objects, registers it with the lexicon, bumps the corpus counts and
// invalidates the sort and hash caches.
// NOTE(review): the conditions selecting between the "new signature" and
// "existing signature" paths and the final return are not among the
// statements below -- confirm against the full function.
175 CSignature
* CSignatureCollection::operator<< (CParse
* pParse
)
183 pParse
->Alphabetize(); // Jan 2009 JG
185 QString display
= pParse
->Display('.');
186 CParse SpelledOutSig
= CStringSurrogate(display
.unicode(),0,display
.length());
// Result is passed by address to Insert().
// NOTE(review): presumably it reports whether the key was newly added.
189 pTerminal
= Insert (SpelledOutSig
.GetKey(), &Result
);
// New-signature path: allocate the signature and hang it on the node.
192 pSig
= new CSignature( pParse
, m_pMiniLex
);
193 pTerminal
->SetPointer( pSig
);
// Attach one affix pointer per piece of the parse (pieces are 1-based).
196 switch (m_SignatureType
)
200 for (int affixno
= 1; affixno
<= pParse
->Size(); affixno
++)
202 pPrefix
= *m_pMiniLex
->GetPrefixes() ^= pParse
->GetPiece(affixno
);
205 pSig
->AppendPrefixPtr( pPrefix
);
212 for (int affixno
= 1; affixno
<= pParse
->Size(); affixno
++)
214 pSuffix
= *m_pMiniLex
->GetSuffixes() ^= pParse
->GetPiece(affixno
);
217 pSig
->AppendSuffixPtr( pSuffix
);
// Existing-signature path: reuse what the node already points to.
225 pSig
=(CSignature
*) pTerminal
->Get_T_Pointer();
// Register the signature with the lexicon under the matching affix kind.
230 switch( m_SignatureType
)
234 m_pLexicon
->InsertPrefixSig( pSig
);
239 m_pLexicon
->InsertSuffixSig( pSig
);
// Count one more occurrence on both the collection and the signature.
243 IncrementCorpusCount(1);
244 pSig
->IncrementCorpusCount(1);
// Cached ordering and hash are stale after an insertion.
246 m_SortValidFlag
= FALSE
;
247 m_HashHasChangedFlag
= TRUE
;
249 pSig
->SetLexicon( m_pMiniLex
);
250 pSig
->SetSignatureCollection ( this );
253 pSig
->SetAffixLocation ( m_SignatureType
);
// Insert by signature.
//   Sig - signature to add; its '.'-separated display form is the key and a
//         copy of it (new CSignature(*Sig)) is what gets stored
// Mirrors the CParse* overload: inserts the key, stores a copy of Sig or
// fetches the signature already at that node, links affix pointers, registers
// the signature with the lexicon, bumps corpus counts and invalidates the sort
// and hash caches.
// NOTE(review): the conditions selecting between the "new" and "existing"
// paths and the final return are not among the statements below -- confirm
// against the full function. Sig is dereferenced without a null check.
260 CSignature
* CSignatureCollection::operator<< (CSignature
* Sig
)
262 CSignature
* pSig
= NULL
;
// Display('.') is evaluated twice here, once for the characters and once for
// the length.
268 CParse SpelledOutSig
= CStringSurrogate(Sig
->Display('.').unicode(),0,Sig
->Display('.').length());
271 pTerminal
= Insert (SpelledOutSig
.GetKey(), &Result
);// CAUSED PROBLEM!!!!!!
// New-signature path: store a copy of the caller's signature on the node.
275 pSig
= new CSignature(*Sig
);
276 pTerminal
->SetPointer (pSig
);
// Attach one affix pointer per piece of Sig (pieces are 1-based).
279 switch (m_SignatureType
)
284 for ( affixno
= 1; affixno
<= Sig
->Size(); affixno
++)
286 CPrefix
* pPrefix
= *m_pMiniLex
->GetPrefixes() ^= Sig
->GetPiece(affixno
);
288 pSig
->AppendPrefixPtr( pPrefix
);
297 for ( affixno
= 1; affixno
<= Sig
->Size(); affixno
++)
299 CSuffix
* pSuffix
= *m_pMiniLex
->GetSuffixes() ^= Sig
->GetPiece(affixno
);
301 pSig
->AppendSuffixPtr( pSuffix
);
// Existing-signature path: reuse what the node already points to.
310 pSig
=(CSignature
*) pTerminal
->Get_T_Pointer();
// Register the signature with the lexicon under the matching affix kind.
316 switch( m_SignatureType
) {
319 m_pLexicon
->InsertPrefixSig( pSig
);
324 m_pLexicon
->InsertSuffixSig( pSig
);
327 IncrementCorpusCount(1);
328 pSig
->IncrementCorpusCount(1);
// Cached ordering and hash are stale after an insertion.
329 m_SortValidFlag
= FALSE
;
330 m_HashHasChangedFlag
= TRUE
;
331 pSig
->SetLexicon( m_pMiniLex
);
332 pSig
->SetSignatureCollection ( this );
334 pSig
->SetAffixLocation ( m_SignatureType
);
339 //==============================================================================================//
344 //==============================================================================================//
345 void CSignatureCollection::SetMyPrefixes(CPrefixCollection
* pAC
){ MyPrefixes
= pAC
;}
346 void CSignatureCollection::SetMySuffixes(CSuffixCollection
* pAC
){ MySuffixes
= pAC
;}
// Works out, for display purposes, which signature is the "mentor" of which.
// For each signature in sort order with at least two affixes, the signatures
// that precede it in sort order are scanned; one that Contains() it is set as
// its mentor, otherwise the mentor is set to NULL. Afterwards the collection
// is flagged as validly sorted in SIG_MENTORS style.
// The nested scan is quadratic in the number of signatures.
349 void CSignatureCollection::FindDisplayOrdering()
351 int Size
= GetCount();
358 for (int signo
= 0; signo
< Size
; signo
++)
360 pSig
= GetAtSort(signo
); // We're looking for pSig's mentor, if it has one
// Signatures with fewer than two affixes are skipped.
361 if (pSig
->Size() < 2) continue;
// Only signatures earlier in the sort order are candidates.
362 for (int signo2
= 0; signo2
< signo
; signo2
++)
364 qSig
= GetAtSort(signo2
);
365 if ( qSig
->Contains(pSig
) )
367 pSig
->SetMentor (qSig
);
370 else pSig
->SetMentor( NULL
);
374 m_SortStyle
= SIG_MENTORS
;
375 m_SortValidFlag
= TRUE
;
// Fills a list view with the signatures of this collection.
//   pView  - list view to populate; its existing columns are removed first
//   filter - passed through to every list item that is created
// Rebuilds the seven columns, runs FindDisplayOrdering(), then walks the
// signatures from last to first in sort order, creating a top-level item per
// signature and child items for the entries of its mentor list. Progress is
// reported through the lexicon's status display.
378 void CSignatureCollection::ListDisplay(
379 Q3ListView
* pView
, QMap
<QString
, QString
>* filter
)
381 CLexicon
& lex
= *m_pLexicon
;
382 linguistica::ui::status_user_agent
& status
= lex
.status_display();
384 // XXX. make these adjustable by user.
385 int MinimumNumberOfStemsForDisplay
= 2;
386 int MinimumNumberOfAffixesForDisplay
= 2;
// NOTE(review): the stem threshold is lowered to 1 here; confirm whether this
// assignment is conditional.
389 MinimumNumberOfStemsForDisplay
= 1;
// Column index 6 is the "Robustness" column added below.
391 pView
->setSorting(6);
393 // Remove all previous columns
394 while (pView
->columns() != 0)
395 pView
->removeColumn(0);
398 // Add Column headers
399 pView
->addColumn("Signatures");
400 pView
->addColumn("Exemplar");
401 pView
->addColumn("Descr. Length", 100);
402 pView
->addColumn("Corpus Count", 100);
403 pView
->addColumn("Stem Count", 100);
404 pView
->addColumn("Source");
405 pView
->addColumn("Robustness");
407 pView
->setColumnAlignment(0, Qt::AlignLeft
);
408 pView
->setColumnAlignment(1, Qt::AlignCenter
);
409 pView
->setColumnAlignment(2, Qt::AlignRight
);
410 pView
->setColumnAlignment(3, Qt::AlignCenter
);
411 pView
->setColumnAlignment(4, Qt::AlignCenter
);
412 pView
->setColumnAlignment(5, Qt::AlignCenter
);
413 pView
->setColumnAlignment(6, Qt::AlignCenter
);
415 status
.major_operation
= "Creating signature list for display";
416 status
.progress
.clear();
417 FindDisplayOrdering();
// NOTE(review): for an empty collection this denominator is -1, and for a
// single signature it is 0 -- confirm set_denominator() tolerates both.
418 status
.progress
.set_denominator(GetCount()-1);
// Iterate from the end of the sort order towards the front.
419 for (int signo
= GetCount()-1; signo
>=0 ; signo
--) {
420 CSignature
* pSig
= GetAtSort(signo
);
421 status
.progress
= GetCount()-1 - signo
;
// Three filters follow: signature has a mentor, too few stems, too few
// affixes.
// NOTE(review): presumably each one skips the signature -- confirm.
422 if (pSig
->GetMentor())
424 if (pSig
->GetNumberOfStems() < MinimumNumberOfStemsForDisplay
)
426 if (pSig
->Size() < MinimumNumberOfAffixesForDisplay
)
// Top-level item for this signature.
// NOTE(review): the item is created with pView as parent; presumably the
// view takes ownership -- confirm.
429 CSignatureListViewItem
* item
= new CSignatureListViewItem(
430 pView
, pSig
->Express(), m_pMiniLex
->GetIndex(), pSig
, filter
);
// Child items, one per entry of the signature's mentor list.
431 if (pSig
->GetMentorList()) {
432 for (int signo2
= 0; signo2
< pSig
->GetMentorList()->size(); signo2
++) {
433 CSignature
* qSig
= pSig
->GetMentorList()->at(signo2
);
434 if (qSig
->GetNumberOfStems() < MinimumNumberOfStemsForDisplay
)
// The pointer is deliberately discarded; the item is created with `item` as
// its parent.
436 static_cast<void>(new CSignatureListViewItem(
437 item
, qSig
->Display(), m_pMiniLex
->GetIndex(), qSig
, filter
));
442 status
.progress
.clear();
443 status
.major_operation
.clear();
// Fills a list view with the borrowed-signature display of every member.
//   pView  - list view to populate; its existing columns are removed first
//   filter - passed through to each signature's BorrowedSigsDisplay()
// Sets up the "Signatures" and "Source" columns, then delegates to each
// signature in turn while reporting progress through the lexicon's status
// display.
446 void CSignatureCollection::BorrowedSigsDisplay(
447 Q3ListView
* pView
, QMap
<QString
, QString
>* filter
)
449 CLexicon
& lex
= *m_pLexicon
;
450 linguistica::ui::status_user_agent
& status
= lex
.status_display();
452 // Remove all previous columns
453 while (pView
->columns() != 0)
454 pView
->removeColumn(0);
456 // Add Column headers
457 pView
->addColumn("Signatures");
458 pView
->addColumn("Source");
461 status
.major_operation
= "Creating signature list for display";
462 status
.progress
.clear();
463 status
.progress
.set_denominator(GetCount());
464 for (int signo
= 0; signo
< (int)GetCount(); signo
++) {
465 GetAt(signo
)->BorrowedSigsDisplay(pView
, filter
);
466 status
.progress
= signo
;
468 status
.progress
.clear();
469 status
.major_operation
.clear();
472 ////////////////////////////////////////////////////
473 ////////////////////////////////////////////////////
479 ////////////////////////////////////////////////////
480 ////////////////////////////////////////////////////
// Writes all signatures to a text file.
//   FileName - path of the file to create or overwrite
// When the file can be opened for writing, emits a "# Signature Count" header
// with the number of signatures in Unicode encoding, then lets every
// signature, in sort order, write itself through OutputSignature().
// Nothing visible here reports a failure to open the file.
483 void CSignatureCollection::OutputSignatures( QString FileName
)
485 QFile
file( FileName
);
487 if( file
.open( QIODevice::WriteOnly
) )
489 QTextStream
outf( &file
);
490 outf
.setEncoding( QTextStream::Unicode
);
492 outf
<< "# Signature Count" << endl
;
493 outf
<< "# ---------------" << endl
;
494 outf
<< " " << GetCount() << " signatures" << endl
<< endl
;
498 for (int i
= 0; i
< (int)GetCount(); i
++)
500 GetAtSort(i
)->OutputSignature( outf
);
// Writes all signatures to a file as an xfst script.
//   FileName - path of the file to create or overwrite
// When the file can be opened for writing, emits a '#'-commented header with
// the file name and signature count, lets every signature, in sort order,
// write itself through OutputSignatureXfst() with a 1-based index, and ends
// with the "union net" and "print words" commands.
// Unlike OutputSignatures(), no Unicode encoding is set on the stream.
507 void CSignatureCollection::OutputXfst( QString FileName
)
509 QFile
file( FileName
);
511 if( file
.open( IO_WriteOnly
) )
513 QTextStream
outf( &file
); //Should be ascii file, not unicode
515 outf
<< "# " << endl
;
516 outf
<< "# File: " << FileName
<< endl
;
517 outf
<< "# Signature count: " << GetCount() << endl
;
518 outf
<< "# " << endl
;
519 // Sort( CORPUSCOUNT );
521 for (int i
= 0; i
< (int)GetCount(); i
++)
// The second argument is the 1-based position of the signature.
523 GetAtSort(i
)->OutputSignatureXfst( outf
, i
+1 );
527 outf
<< "union net" << endl
<< endl
;
528 outf
<< "print words" << endl
<< endl
;
536 ////////////////////////////////////////////////////
537 ////////////////////////////////////////////////////
543 ////////////////////////////////////////////////////
544 ////////////////////////////////////////////////////
546 /*void CSignatureCollection::LimitedOutput (QString Filename)
548 QFile file( Filename );
550 if( file.open( IO_WriteOnly ) )
552 QTextStream outf( &file );
553 int TotalWordCount = 0;
558 int NumEntries = GetCount();
560 outf.setf(2); // Set fields left justified
561 outf << "# Index Signature StemCount AffixCount log(StemCount)*log(AffixCount)" << endl << endl;
564 for( i = 0; i < NumEntries; i++ )
568 outf << ++counter << " ";
569 outf << pSig -> Display( '.', m_pLexicon->GetOutFilter() ) << " ";
570 outf << pSig -> GetStems().Size() << " ";
571 outf << pSig->GetNumberOfAffixes()
572 << " " << log( pSig->GetStems().Size() ) * log ( pSig->GetNumberOfAffixes() )
575 TotalWordCount += pSig->GetStemPtrList()->count() * pSig->GetNumberOfAffixes();
578 outf << endl << "Total number of words covered: " << TotalWordCount;
/// Exception type thrown by the signature-file parsing helpers in this file
/// when the input is not in the expected format. Carries no data.
586 struct cannot_parse_input
: virtual std::exception
{ };
588 /// skip blank lines and comments
/// Keeps reading until it has a line that is neither empty nor starts with
/// '#'.
/// NOTE(review): presumably that line is returned, and end of input is
/// handled before the loop condition is re-tested -- confirm; otherwise an
/// exhausted stream would never leave the loop.
589 QString
get_line(QTextStream
& in
)
594 } while (buf
.isEmpty() || buf
[0] == '#');
598 /// swallow end of line, throwing an exception if that involves
600 void check_end_of_line(QTextStream
& in
)
602 QString remainder
= in
.readLine();
603 if (!remainder
.isEmpty())
604 throw cannot_parse_input();
/// Converts a string to an int with QString::toInt(), which reports success
/// through the `ok` flag passed by address.
/// NOTE(review): presumably cannot_parse_input is thrown when `ok` is false
/// and `result` is returned otherwise -- confirm.
607 int string_to_int(QString s
)
610 int result
= s
.toInt(&ok
);
612 throw cannot_parse_input();
// First pass over a signature file: creates the signatures, without stems.
//   Filename - path of the signature file to read
//   SigType  - affix location given to every signature that is created
// Reads the signature count, replaces m_PointerArray with a fresh array of
// that size, then for each signature reads a header line (signature, stem
// count, corpus count) and a remark line, builds the CSignature and inserts
// it. Stem tokens are skipped here; ReadSignatureFileBis() reads them later.
// A cannot_parse_input thrown anywhere inside is caught at the bottom and
// reported on std::cerr only.
617 void CSignatureCollection::ReadSignatureFile(QString Filename
,
618 enum eAffixLocation SigType
) { try
620 QFile
file(Filename
);
621 if (!file
.open(QIODevice::ReadOnly
))
624 QTextStream
inf(&file
);
// First meaningful line of the file: number of signatures that follow.
626 const int signature_count
= string_to_int(
627 get_line(inf
).trimmed());
// NOTE(review): signature_count comes straight from the file and is used as
// an array size; confirm negative or huge values are rejected. The old array
// is also freed before the new one is allocated, so a failure in `new` would
// leave m_PointerArray dangling.
629 delete[] m_PointerArray
;
630 m_PointerArray
= new CSignature
*[signature_count
];
632 for (int count
= 1; count
<= signature_count
; ++count
) {
633 QString sig_header
= get_line(inf
).trimmed();
636 // SP+ signature SP+ stem count SP+ corpus count SP+
637 QTextStream
line_in(&sig_header
, QIODevice::ReadOnly
);
638 QString sig_graphemes
, stem_count_text
,
640 line_in
>> sig_graphemes
>>
641 stem_count_text
>> corpus_count_text
;
// Anything after the three fields makes the header invalid.
642 check_end_of_line(line_in
);
644 const QString sig_text
= Filter(m_pLexicon
->GetInFilter(),
646 const int stem_count
= string_to_int(stem_count_text
);
647 const int corpus_count
= string_to_int(corpus_count_text
);
649 // line 2: signature origin
650 QString remark
= get_line(inf
).trimmed();
// Underscores in the stored remark stand for spaces.
651 remark
.replace(QChar('_'), QChar(' '));
// auto_ptr owns the signature until it is handed to the collection below, so
// a parse exception in between does not leak it.
653 std::auto_ptr
<CSignature
> sig(new CSignature(
654 SigType
, m_pMiniLex
));
655 sig
->IngestSignature(sig_text
);
656 sig
->SetCorpusCount(corpus_count
);
657 sig
->SetRemark(remark
);
658 sig
->SetSignatureCollection(this);
660 for (int i
= 0; i
< stem_count
; ++i
) {
663 // We haven’t read the Stems.txt file
664 // yet, so just swallow each stem here.
665 // The stems will be read from Signatures.txt
666 // when it is read again in
667 // ReadSignatureFileBis.
// Insert the key, record the signature in the pointer array slot of the
// newest member, then transfer ownership to the node.
// NOTE(review): assumes Insert() always grows GetCount() by one; a duplicate
// signature in the file may break that -- confirm.
670 CNode
* terminal
= Insert(sig_text
);
671 m_PointerArray
[GetCount() - 1] = sig
.get();
672 terminal
->SetPointer(sig
.release());
// NOTE(review): the exception is caught by value; catching by const
// reference is the usual form.
674 } catch (cannot_parse_input
) {
675 // XXX. report to user
676 std::cerr
<< "Signature.txt: cannot parse" << std::endl
;
// Second pass over a signature file: attaches stems to the signatures that
// ReadSignatureFile() created.
//   Filename - path of the same signature file
// For each signature header, looks the signature up in this collection, reads
// its stems, looks each one up in the mini-lexicon's stem collection and
// appends it to the signature. Then, for every stem/affix pair of the
// signature, adds the stem to the matching prefix (initial affix locations)
// or suffix (otherwise). A cannot_parse_input thrown anywhere inside is
// caught at the bottom and reported on std::cerr only.
680 void CSignatureCollection::ReadSignatureFileBis(QString Filename
) { try
682 CStemCollection
* stems_ptr
= m_pMiniLex
->GetStems();
685 CStemCollection
& stems
= *stems_ptr
;
687 QFile
file(Filename
);
688 if (!file
.open(QIODevice::ReadOnly
))
690 QTextStream
inf(&file
);
692 const int signature_count
= string_to_int(
693 get_line(inf
).trimmed());
695 for (int signo
= 0; signo
< signature_count
; ++signo
) {
696 // see ReadSignatureFile().
697 QString sig_header
= get_line(inf
).trimmed();
698 QTextStream
line_in(&sig_header
, QIODevice::ReadOnly
);
699 QString sig_graphemes
, stem_count_text
,
701 line_in
>> sig_graphemes
>>
702 stem_count_text
>> corpus_count_text
;
703 check_end_of_line(line_in
);
705 const QString sig_text
= Filter(m_pLexicon
->GetInFilter(),
707 const int stem_count
= string_to_int(stem_count_text
);
// Find the signature created during the first pass.
// NOTE(review): the lookup can return NULL; confirm that case is handled
// before sig is dereferenced below.
710 sig_parse
.IngestSignature(sig_text
);
711 CSignature
* sig
= *this ^= sig_parse
;
// Stems are whitespace-separated tokens read from the main stream.
714 for (int stemno
= 0; stemno
< stem_count
; ++stemno
) {
715 QString stem_graphemes
;
716 inf
>> stem_graphemes
;
718 const QString stem_text
= Filter(
719 m_pLexicon
->GetInFilter(), stem_graphemes
);
721 CStem
* stem
= stems
^= stem_text
;
723 // XXX. stem missing from Stems.txt
726 sig
->AppendStemPtr(stem
);
// Prefix signatures: register every stem with every prefix of the signature.
729 if (is_initial(sig
->GetAffixLocation())) {
730 for (int stemno
= 0; stemno
< sig
->GetNumberOfStems(); stemno
++)
732 CStem
* stem
= sig
->GetStem(stemno
);
733 for (int affixno
= 1; affixno
<= sig
->Size(); ++affixno
) {
735 *m_pMiniLex
->GetPrefixes() ^=
736 sig
->GetPiece(affixno
);
// NOTE(review): presumably thrown when the prefix lookup fails -- confirm.
738 throw cannot_parse_input();
739 affix
->AddStem(stem
);
// Suffix signatures: the same, against the suffix collection.
743 for (int stemno
= 0; stemno
< sig
->GetNumberOfStems(); stemno
++)
745 CStem
* stem
= sig
->GetStem(stemno
);
746 for (int affixno
= 1; affixno
<= sig
->Size(); ++affixno
) {
748 *m_pMiniLex
->GetSuffixes() ^=
749 sig
->GetPiece(affixno
);
// NOTE(review): presumably thrown when the suffix lookup fails -- confirm.
751 throw cannot_parse_input();
752 affix
->AddStem(stem
);
// NOTE(review): the exception is caught by value; catching by const
// reference is the usual form.
758 } catch (cannot_parse_input
) {
759 // XXX. report to user
760 std::cerr
<< "Signature.txt: cannot re-parse" << std::endl
;
// Propagates robustness values between signatures.
// For every signature after the first in sort order, scans the signatures
// that precede it; when one of them Contains() it, the later signature takes
// over that signature's robustness. Progress is reported through the
// lexicon's status display.
// NOTE(review): confirm whether the inner scan stops at the first containing
// signature; if it does not, the last match wins.
764 void CSignatureCollection::CheckRobustness()
766 CLexicon
& lex
= *m_pLexicon
;
767 linguistica::ui::status_user_agent
& status
= lex
.status_display();
769 status
.major_operation
= "Checking sig robustness";
770 status
.progress
.clear();
772 status
.progress
.set_denominator(GetCount());
// Starts at 1: the first signature has no predecessors to compare against.
773 for (int signo
= 1; signo
< (int)GetCount(); signo
++) {
774 CSignature
* pSig
= GetAtSort(signo
);
775 status
.progress
= signo
;
776 for (int signo2
= 0; signo2
< signo
; signo2
++) {
777 CSignature
* qSig
= GetAtSort(signo2
);
778 if (qSig
->Contains(pSig
)) {
779 pSig
->SetRobustness(qSig
->GetRobustness());
784 status
.progress
.clear();
786 // XXX. not an operation
787 status
.major_operation
= "Robustness checking complete.";
// Sums, over all signatures, the number of stems times the number of affixes
// (Size()).
// NOTE(review): presumably Total starts at zero and is the return value --
// confirm.
790 int CSignatureCollection::GetTotalNumberOfWords()
793 for (int signo
= 0; signo
< (int)GetCount(); signo
++)
795 Total
+= GetAt(signo
)->GetNumberOfStems() * GetAt(signo
)->Size();
// Counts the stems shared by two suffixes.
//   pSuffix1, pSuffix2 - the suffixes to compare
// Adds up the stem counts of every signature that contains both suffixes.
// NOTE(review): presumably count starts at zero and is the return value --
// confirm.
801 int CSignatureCollection::TheseTwoSuffixesShareHowManyStems(CSuffix
* pSuffix1
, CSuffix
* pSuffix2
)
805 for (int signo
= 0; signo
< (int)GetCount(); signo
++)
808 if ( pSig
->Contains (pSuffix1
) && pSig
->Contains (pSuffix2
) )
810 count
+= pSig
->GetNumberOfStems();
// Scans the collection for signatures that have no stems or no corpus count.
// NOTE(review): per the comment below such signatures are removed right away
// rather than marked for deletion; if so, confirm the loop index is adjusted
// after a removal so that no element is skipped.
816 void CSignatureCollection::CleanUp()
820 for (int signo
= 0; signo
< (int) GetCount(); signo
++)
823 if ( pSig
->GetNumberOfStems() <= 0 || pSig
->GetCorpusCount() <= 0 ) // -cs- 20040906 : added the second argument
825 // -cs- 20040602 : DeleteMarkedMembers wasn't actually finding any of the
826 // members to be deleted, so I changed it to remove them automatically,
827 // this fixed our word display bug (words weren't connected to their signature
// Adds an existing signature object to the collection.
//   pSignature - signature to add
// Delegates to TCollection<CSignature>::AddPointer(), then registers the
// signature with the lexicon through InsertPrefixSig() or InsertSuffixSig()
// according to m_SignatureType.
833 void CSignatureCollection::AddPointer( CSignature
* pSignature
)
835 TCollection
<CSignature
>::AddPointer( pSignature
);
839 switch( m_SignatureType
)
843 m_pLexicon
->InsertPrefixSig( pSignature
);
848 m_pLexicon
->InsertSuffixSig( pSignature
);
// Adds a signature given as a parse.
//   Signature - parse handed to TCollection<CSignature>::AddToCollection()
// The signature obtained from the base class is then registered with the
// lexicon through InsertPrefixSig() or InsertSuffixSig() according to
// m_SignatureType.
// NOTE(review): presumably pSignature is returned -- confirm.
854 CSignature
* CSignatureCollection::AddToCollection( CParse
& Signature
)
856 CSignature
* pSignature
= TCollection
<CSignature
>::AddToCollection( Signature
);
860 switch( m_SignatureType
)
864 m_pLexicon
->InsertPrefixSig( pSignature
);
869 m_pLexicon
->InsertSuffixSig( pSignature
);
// Adds a signature given as a string surrogate.
//   Signature - key handed to TCollection<CSignature>::AddToCollection()
// The signature obtained from the base class is then registered with the
// lexicon through InsertPrefixSig() or InsertSuffixSig() according to
// m_SignatureType.
// NOTE(review): presumably pSignature is returned -- confirm.
877 CSignature
* CSignatureCollection::AddToCollection( CStringSurrogate
& Signature
)
879 CSignature
* pSignature
= TCollection
<CSignature
>::AddToCollection( Signature
);
883 switch( m_SignatureType
)
887 m_pLexicon
->InsertPrefixSig( pSignature
);
892 m_pLexicon
->InsertSuffixSig( pSignature
);
// Empties the collection.
// Unregisters every member signature from the lexicon (prefix or suffix side
// according to m_SignatureType), then calls TCollection<CSignature>::Empty().
// NOTE(review): the Remove*Sig() calls are the argument of Q_ASSERT. Q_ASSERT
// expands to nothing when QT_NO_DEBUG is defined, so in release builds the
// signatures would not be unregistered; the destructor and DeleteMarkedMembers()
// call the same functions outside any assert. Consider storing the result in
// a local and asserting on that.
900 void CSignatureCollection::Empty()
904 CSignature
* pSignature
;
906 for( int signo
= 0; signo
< GetCount(); signo
++ )
908 pSignature
= GetAt(signo
);
910 switch( m_SignatureType
)
914 Q_ASSERT( m_pLexicon
->RemovePrefixSig( pSignature
) );
919 Q_ASSERT( m_pLexicon
->RemoveSuffixSig( pSignature
) );
924 TCollection
<CSignature
>::Empty();
// Removes every member from the collection.
// Unregisters every member signature from the lexicon (prefix or suffix side
// according to m_SignatureType), then calls
// TCollection<CSignature>::RemoveAll().
// NOTE(review): the Remove*Sig() calls are the argument of Q_ASSERT. Q_ASSERT
// expands to nothing when QT_NO_DEBUG is defined, so in release builds the
// signatures would not be unregistered. Consider storing the result in a
// local and asserting on that.
928 void CSignatureCollection::RemoveAll()
932 CSignature
* pSignature
;
934 for( int signo
= 0; signo
< GetCount(); signo
++ )
936 pSignature
= GetAt(signo
);
938 switch( m_SignatureType
)
942 Q_ASSERT( m_pLexicon
->RemovePrefixSig( pSignature
) );
947 Q_ASSERT( m_pLexicon
->RemoveSuffixSig( pSignature
) );
952 TCollection
<CSignature
>::RemoveAll();
// Removes one signature from the collection.
//   pSignature - signature to remove
// Unregisters it from the lexicon (prefix or suffix side according to
// m_SignatureType) and returns the result of
// TCollection<CSignature>::Remove().
// NOTE(review): the Remove*Sig() calls are the argument of Q_ASSERT. Q_ASSERT
// expands to nothing when QT_NO_DEBUG is defined, so in release builds the
// signature would not be unregistered. Consider storing the result in a local
// and asserting on that.
956 bool CSignatureCollection::Remove( CSignature
* pSignature
)
961 switch( m_SignatureType
)
965 Q_ASSERT( m_pLexicon
->RemovePrefixSig( pSignature
) );
970 Q_ASSERT( m_pLexicon
->RemoveSuffixSig( pSignature
) );
975 return TCollection
<CSignature
>::Remove( pSignature
);
// Removes one member signature, given by pointer.
//   pSignature - signature to remove
// Unregisters it from the lexicon (prefix or suffix side according to
// m_SignatureType) and returns the result of
// TCollection<CSignature>::RemoveMember().
// NOTE(review): the Remove*Sig() calls are the argument of Q_ASSERT. Q_ASSERT
// expands to nothing when QT_NO_DEBUG is defined, so in release builds the
// signature would not be unregistered. Consider storing the result in a local
// and asserting on that.
979 bool CSignatureCollection::RemoveMember( CSignature
* pSignature
)
984 switch( m_SignatureType
)
988 Q_ASSERT( m_pLexicon
->RemovePrefixSig( pSignature
) );
993 Q_ASSERT( m_pLexicon
->RemoveSuffixSig( pSignature
) );
997 return TCollection
<CSignature
>::RemoveMember( pSignature
);
// Removes one member signature, given by key.
//   Signature - key of the signature to remove
// Looks the signature up, unregisters it from the lexicon (prefix or suffix
// side according to m_SignatureType) and returns the result of
// TCollection<CSignature>::RemoveMember().
// NOTE(review): the result of Find1() is dereferenced without a check, while
// the operator^= lookups in this file have a NULL-returning branch after
// Find1() -- confirm the key is guaranteed to be present.
// NOTE(review): the suffix removal is the argument of Q_ASSERT, which expands
// to nothing when QT_NO_DEBUG is defined; the prefix removal is a plain call.
1001 bool CSignatureCollection::RemoveMember( CStringSurrogate
& Signature
)
1003 CSignature
* pSignature
= (CSignature
*)Find1( Signature
)->Get_T_Pointer();
1007 switch( m_SignatureType
)
1011 m_pLexicon
->RemovePrefixSig( pSignature
) ;
1016 Q_ASSERT( m_pLexicon
->RemoveSuffixSig( pSignature
) );
1020 return TCollection
<CSignature
>::RemoveMember( Signature
);
// Removes one member signature, given by key, with an extra flag.
//   Signature - key of the signature to remove
//   b         - forwarded unchanged to the base-class RemoveMember()
// Looks the signature up, unregisters it from the lexicon (prefix or suffix
// side according to m_SignatureType) and returns the result of
// TCollection<CSignature>::RemoveMember().
// NOTE(review): the result of Find1() is dereferenced without a check, while
// the operator^= lookups in this file have a NULL-returning branch after
// Find1() -- confirm the key is guaranteed to be present.
1024 bool CSignatureCollection::RemoveMember( CStringSurrogate
& Signature
, bool b
)
1026 CSignature
* pSignature
= (CSignature
*)Find1( Signature
)->Get_T_Pointer();
1030 switch( m_SignatureType
)
1034 m_pLexicon
->RemovePrefixSig( pSignature
) ;
1039 m_pLexicon
->RemoveSuffixSig( pSignature
) ;
1043 return TCollection
<CSignature
>::RemoveMember( Signature
, b
);
// Deletes the members that have been marked for deletion.
// Does nothing when there is no deletion array. Otherwise every member whose
// m_DeletionArray entry is 1 is first unregistered from the lexicon (prefix
// or suffix side according to m_SignatureType); the actual deletion is then
// left to TCollection<CSignature>::DeleteMarkedMembers().
1047 void CSignatureCollection::DeleteMarkedMembers()
1049 if ( m_DeletionArray
== NULL
) { return; }
// m_DeletionArray and m_PointerArray are indexed with the same member index.
1051 int count
= GetCount();
1052 for (int signo
= 0; signo
< count
; signo
++)
1054 if ( m_DeletionArray
[signo
] == 1 )
1058 switch( m_SignatureType
)
1062 m_pLexicon
->RemovePrefixSig( m_PointerArray
[signo
] );
1067 m_pLexicon
->RemoveSuffixSig( m_PointerArray
[signo
] );
1073 TCollection
<CSignature
>::DeleteMarkedMembers();
// Visits every signature of the collection in index order.
// NOTE(review): presumably each signature is asked to compute the individual
// counts of its stems; the per-signature call is not among the statements
// below -- confirm.
1077 void CSignatureCollection::GetIndividualCountsForEachStem ()
1080 for (int signo
= 0; signo
< GetCount(); signo
++)
1082 pSig
= GetAt(signo
);
1087 double CSignatureCollection::ComputeDLofInternalPointersOfEachMember(
1088 enum eMDL_STYLE
/*unused*/)
1090 m_SumOfDLofPointersInternalToEachMember
= 0;
1091 for (int signo
= 0; signo
< GetCount(); ++signo
)
1093 CSignature
* sig
= GetAt(signo
);
1094 m_SumOfDLofPointersInternalToEachMember
+=
1095 sig
->GetSumOfDLofInternalPointers();
1097 return m_SumOfDLofPointersInternalToEachMember
;
1100 // MDL JG August 2006
1101 double CSignatureCollection::ComputeLengthOfPointersToEachOfMyMembers (eMDL_STYLE style
)
1103 double Denominator
= 0;
1105 int m_DLofPointersToMyMembers
= 0;
1108 if (style
== CorpusCount
)
1110 for (int signo
= 0; signo
< GetCount(); signo
++)
1112 Denominator
+= GetAt(signo
)->GetCorpusCount();
1114 for (int signo
= 0; signo
< GetCount(); signo
++)
1116 ptr
= base2log ( Denominator
/ GetAt(signo
)->GetCorpusCount() );
1117 GetAt(signo
)->SetLengthOfPointerToMe (ptr
) ;
1118 m_DLofPointersToMyMembers
+= ptr
;
1122 else if (style
== GrammarCount
)
1124 for (int signo
= 0; signo
< GetCount(); signo
++)
1126 Denominator
+= GetAt(signo
)->GetNumberOfStems() * GetAt(signo
)->GetNumberOfAffixes();
1128 for (int signo
= 0; signo
< GetCount(); signo
++)
1130 ptr
= base2log ( Denominator
/ GetAt(signo
)->GetCorpusCount() );
1131 GetAt(signo
)->SetLengthOfPointerToMe (ptr
) ;
1132 m_DLofPointersToMyMembers
+= ptr
;
1135 return m_DLofPointersToMyMembers
;
1137 ///----------------------------------------------------------->>>>>
1138 ///----------------------------------------------------------->>>>>
1139 // CompareSignaturePairsForTotalOverlap()
1140 ///----------------------------------------------------------->>>>>
// Compares every pair of signatures that have the same number of affixes (at
// least two each) and, when the alignment of the pair reports that one is a
// suffix of the other, logs the second signature and recuts the longer one
// through RecutLongerSigToMatchTheShorter().
// Results are written to the mini-lexicon's log file as a table with one row
// per first signature of a pair. The pairwise scan is quadratic in the
// number of signatures.
1141 void CSignatureCollection::CompareSignaturePairsForTotalOverlap()
1143 CSignatureAlignment
* pSigAlignment
;
1144 CMiniLexicon
& lex
= *GetMiniLexicon();
1145 lex
.LogFileLargeTitle("Comparing pairs of stems for total overlap");
1146 CSignature
*pSig
, *qSig
=NULL
;
1148 GetMiniLexicon()->LogFileStartTable();
1150 for (int signo
=0; signo
< GetCount(); signo
++)
1152 pSig
= GetAtSort(signo
);
// Signatures with fewer than two affixes are not compared.
1153 if (pSig
->Size() < 2 ) {continue;}
1154 //if (pSig->GetNumberOfStems() < 5) continue;
1155 lex
.LogFileStartRow();
1156 lex
.LogFileSimpleString(pSig
->Display());
// Only signatures later in the sort order, so each pair is seen once.
1157 for (int signo2
= signo
+1; signo2
< GetCount(); signo2
++)
1159 qSig
= GetAtSort(signo2
);
1160 if (qSig
->Size() < 2 ) {continue;}
1161 if (pSig
->Size() != qSig
->Size() ) continue;
// The alignment object lives only for this pair and is deleted below.
1162 pSigAlignment
= new CSignatureAlignment (pSig
, qSig
);
1163 if ( pSigAlignment
->FindWhetherOneIsSuffixOfTheOther() )
1166 lex
.LogFileSimpleString(qSig
->Display());
1167 RecutLongerSigToMatchTheShorter (pSigAlignment
);
1170 delete pSigAlignment
;
1172 lex
.LogFileEndRow();
1174 lex
.LogFileEndTable();
// Recuts every word of the longer signature of an alignment.
//   pSigAlignment - alignment of two signatures; supplies the longer
//                   signature and the difference string between the two
// For each stem/affix combination of the longer signature, the corresponding
// word is fetched and MoveThisManyLettersLeftwardFromArg1() is called on it
// with 1 and the length of the alignment's difference.
// NOTE(review): the result of GetWord() is used without a null check --
// confirm every stem/affix combination has a word.
1177 void CSignatureCollection::RecutLongerSigToMatchTheShorter(CSignatureAlignment
* pSigAlignment
)
1179 CSignature
* pSig
= pSigAlignment
->LongerSig();
1180 for (int stemno
= 0; stemno
< pSig
->GetNumberOfStems(); stemno
++)
1182 for (int suffixno
= 0; suffixno
< pSig
->GetNumberOfAffixes(); suffixno
++)
1184 CStem
* pWord
= pSig
->GetWord(stemno
, suffixno
);
1185 pWord
->MoveThisManyLettersLeftwardFromArg1(1,pSigAlignment
->difference().length());
1194 ///----------------------------------------------------------->>>>>
1196 ///----------------------------------------------------------->>>>>
// Searches pairs of signatures for allomorphy.
// Considers every pair of signatures that each have at least
// MinimumNumberOfStems stems and at least two affixes, and that have the same
// number of affixes. For each pair the best alignment is computed; when the
// log file is on and the alignment covers as many affixes as the signatures
// have, it is written to the log. Output is wrapped in a log-file table
// titled "Allomorphy". The pairwise scan is quadratic in the number of
// signatures.
1197 void CSignatureCollection::FindAllomorphy()
1199 CSignatureAlignment
* pSigAlignment
;
1200 CSignature
*pSig
, *qSig
=NULL
;
1201 int MinimumNumberOfStems
= 5;
1203 CMiniLexicon
& lex
= *GetMiniLexicon();
1204 lex
.LogFileLargeTitle("Allomorphy");
1205 lex
.LogFileStartTable();
1207 for (int signo
=0; signo
< GetCount(); signo
++)
1209 pSig
= GetAtSort(signo
);
1210 if (pSig
->GetNumberOfStems() < MinimumNumberOfStems
) {continue;}
1211 int size
= pSig
->Size();
1212 if ( size
< 2 ) {continue;}
// Only signatures later in the sort order, so each pair is seen once.
1214 for (int signo2
= signo
+1; signo2
< GetCount(); signo2
++)
1216 qSig
= GetAtSort(signo2
);
1217 if (qSig
->GetNumberOfStems() < MinimumNumberOfStems
) {continue;}
1218 if (qSig
->Size() < 2 ) {continue;}
1219 if ( size
!= qSig
->Size() ) continue;
// The alignment object lives only for this pair and is deleted below.
1220 pSigAlignment
= new CSignatureAlignment (pSig
, qSig
);
1221 pSigAlignment
->FindBestAlignment();
// Only alignments that pair up every affix are logged.
1222 if (GetMiniLexicon()->LogFileOn() && pSigAlignment
->GetAffixAlignments()->count() == size
)
1224 pSigAlignment
->Display( *GetMiniLexicon()->GetLogFile());
1226 delete pSigAlignment
;
1229 lex
.LogFileEndTable();