CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / SignatureCollection.cpp
blob864394f2825a7de790c4a19ab16aa44a35e71c56
1 // Implementation of CSignatureCollection methods
2 // Copyright © 2009 The University of Chicago
3 #include "SignatureCollection.h"
5 #include <exception>
6 #include <iostream>
7 #include <memory>
8 #include <QTextStream>
9 #include <QMessageBox>
10 #include <QIODevice>
11 #include <QFile>
12 #include "ui/Status.h"
13 #include "MiniLexicon.h"
14 #include "Lexicon.h"
15 #include "Allomorphy.h"
16 #include "Signature.h"
17 #include "Prefix.h"
18 #include "Suffix.h"
19 #include "Stem.h"
20 #include "PrefixCollection.h"
21 #include "SuffixCollection.h"
22 #include "StemCollection.h"
23 #include "HTML.h"
24 #include "log2.h"
25 class CSignatureAlignment;
26 // <<-------------------------------------------------------------------------------------------------------->>
27 CSignatureCollection::CSignatureCollection()
29 m_pMiniLex = NULL;
30 m_pLexicon = NULL;
31 m_MemberName = "Signatures";
32 MySuffixes = NULL;
33 MyPrefixes = NULL;
34 m_DLofPointersToMyMembers =0;
36 CSignatureCollection::CSignatureCollection( CMiniLexicon* Lex )
38 m_pMiniLex = Lex;
39 if( m_pMiniLex ) m_pLexicon = m_pMiniLex->GetLexicon();
40 m_MemberName = "Signatures";
41 m_SignatureType = m_pMiniLex->GetAffixLocation();
42 MySuffixes = NULL;
43 MyPrefixes = NULL;
44 m_DLofPointersToMyMembers =0;
47 CSignatureCollection::CSignatureCollection (CMiniLexicon* Lex, CSuffixCollection* suffixes, eAffixLocation AfLoc)
49 m_pMiniLex = Lex;
50 if( m_pMiniLex ) m_pLexicon = m_pMiniLex->GetLexicon();
51 m_MemberName = "Signatures";
52 MySuffixes = suffixes;
53 m_SignatureType = AfLoc;
54 MyPrefixes = NULL;
55 m_DLofPointersToMyMembers =0;
58 CSignatureCollection::CSignatureCollection (CMiniLexicon* Lex, CPrefixCollection* Prefixes, eAffixLocation AfLoc)
60 m_pMiniLex = Lex;
61 if( m_pMiniLex ) m_pLexicon = m_pMiniLex->GetLexicon();
62 m_MemberName = "Signatures";
63 MyPrefixes = Prefixes;
64 m_SignatureType = AfLoc;
65 MySuffixes = NULL;
66 m_DLofPointersToMyMembers =0;
69 CSignatureCollection::CSignatureCollection (eAffixLocation SigType)
71 m_pMiniLex = NULL;
72 m_pLexicon = NULL;
73 m_SignatureType = SigType;
74 m_MemberName = "Signatures";
75 MySuffixes = NULL;
76 MyPrefixes = NULL;
77 m_DLofPointersToMyMembers =0;
80 // <<-------------------------------------------------------------------------------------------------------->>
81 CSignatureCollection::~CSignatureCollection()
83 if( m_pLexicon )
85 CSignature* pSig;
87 for( int signo = 0; signo < GetCount(); signo++ )
89 pSig = GetAt(signo);
90 if( m_pLexicon )
92 switch( m_SignatureType )
94 case STEM_INITIAL:
95 case WORD_INITIAL:
96 m_pLexicon->RemovePrefixSig( pSig );
97 break;
98 case WORD_FINAL:
99 case STEM_FINAL:
100 default:
101 m_pLexicon->RemoveSuffixSig( pSig );
108 //==============================================================================================//
111 // Operators
113 //==============================================================================================//
115 CSignature* CSignatureCollection::operator^= (QString Signature) //" Lookup"
117 if( Signature.length() < 1 ) return NULL;
119 CNode *pNode = Find1 ( CStringSurrogate( Signature.unicode(),0,Signature.length() ) );
120 if ( pNode )
122 return (CSignature*) pNode->Get_T_Pointer();
124 else { return NULL; }
127 CSignature* CSignatureCollection::operator^= (CParse& Parse) //" Lookup"
129 // Return NULL if parse is empty.
130 if( Parse.GetKeyLength() < 1 || Parse.Size() < 1 ) return NULL;
132 Parse.Alphabetize();
134 QString display = Parse.Display('.');
135 CNode *pNode = Find1 ( CStringSurrogate( display.unicode(),0,display.length() ) );
136 if ( pNode )
138 return (CSignature*) pNode->Get_T_Pointer();
140 else { return NULL; }
143 // <<-------------------------------------------------------------------------------------------------------->>
144 CSignature* CSignatureCollection::operator^= (CParse* pParse) //" Lookup"
146 // Return NULL if parse is empty.
147 if( pParse->GetKeyLength() < 1 || pParse->Size() < 1 ) return NULL;
149 pParse->Alphabetize();
151 QString display = pParse->Display('.');
152 CNode *pNode = Find1 ( CStringSurrogate( display.unicode(),0,display.length() ) );
153 if ( pNode )
155 return (CSignature*) pNode->Get_T_Pointer();
157 else { return NULL; }
159 // <<-------------------------------------------------------------------------------------------------------->>
160 CSignature* CSignatureCollection::operator^= (CStringSurrogate& Signature) //" Lookup"
162 // Return NULL if Signature is empty.
163 if( Signature.GetLength() < 1 ) return NULL;
165 CNode *pNode = Find1 ( Signature );
166 if ( pNode )
168 return (CSignature*) pNode->Get_T_Pointer();
170 else { return NULL; }
174 // <<-------------------------------------------------------------------------------------------------------->>
175 CSignature* CSignatureCollection::operator<< (CParse* pParse)
177 CSignature* pSig;
178 CNode* pTerminal;
179 int Result;
180 CSuffix* pSuffix;
181 CPrefix* pPrefix;
183 pParse->Alphabetize(); // Jan 2009 JG
185 QString display = pParse->Display('.');
186 CParse SpelledOutSig = CStringSurrogate(display.unicode(),0,display.length());
189 pTerminal = Insert (SpelledOutSig.GetKey(), &Result);
190 if ( Result == 1)
192 pSig = new CSignature( pParse, m_pMiniLex );
193 pTerminal->SetPointer( pSig );
194 if (m_pMiniLex)
196 switch (m_SignatureType)
198 case (WORD_INITIAL):
199 case (STEM_INITIAL):
200 for (int affixno = 1; affixno <= pParse->Size(); affixno++)
202 pPrefix = *m_pMiniLex->GetPrefixes() ^= pParse->GetPiece(affixno);
203 if(pPrefix)
205 pSig->AppendPrefixPtr( pPrefix );
208 break;
209 case (WORD_FINAL):
210 case (STEM_FINAL):
211 default:
212 for (int affixno = 1; affixno <= pParse->Size(); affixno++)
214 pSuffix = *m_pMiniLex->GetSuffixes() ^= pParse->GetPiece(affixno);
215 if(pSuffix)
217 pSig->AppendSuffixPtr( pSuffix );
223 else
225 pSig =(CSignature*) pTerminal->Get_T_Pointer();
228 if( m_pLexicon )
230 switch( m_SignatureType )
232 case STEM_INITIAL:
233 case WORD_INITIAL:
234 m_pLexicon->InsertPrefixSig( pSig );
235 break;
236 case WORD_FINAL:
237 case STEM_FINAL:
238 default:
239 m_pLexicon->InsertSuffixSig( pSig );
243 IncrementCorpusCount(1);
244 pSig->IncrementCorpusCount(1);
246 m_SortValidFlag = FALSE;
247 m_HashHasChangedFlag = TRUE;
249 pSig->SetLexicon( m_pMiniLex );
250 pSig->SetSignatureCollection ( this );
252 //todo
253 pSig->SetAffixLocation ( m_SignatureType );
255 return pSig;
260 CSignature* CSignatureCollection::operator<< (CSignature* Sig)
262 CSignature* pSig = NULL;
263 CNode* pTerminal;
264 int Result = 0;
265 int affixno;
267 Sig ->Alphabetize();
268 CParse SpelledOutSig = CStringSurrogate(Sig->Display('.').unicode(),0,Sig->Display('.').length());
271 pTerminal = Insert (SpelledOutSig.GetKey(), &Result);// CAUSED PROBLEM!!!!!!
273 if ( Result == 1)
275 pSig = new CSignature(*Sig);
276 pTerminal->SetPointer (pSig);
277 if (m_pMiniLex)
279 switch (m_SignatureType)
281 case WORD_INITIAL:
282 case STEM_INITIAL:
284 for ( affixno = 1; affixno <= Sig->Size(); affixno++)
286 CPrefix* pPrefix = *m_pMiniLex->GetPrefixes() ^= Sig->GetPiece(affixno);
287 if(pPrefix) {
288 pSig->AppendPrefixPtr( pPrefix );
292 break;
293 case WORD_FINAL:
294 case STEM_FINAL:
295 default:
297 for ( affixno = 1; affixno <= Sig->Size(); affixno++)
299 CSuffix* pSuffix = *m_pMiniLex->GetSuffixes() ^= Sig->GetPiece(affixno);
300 if(pSuffix) {
301 pSig->AppendSuffixPtr( pSuffix );
308 else
310 pSig =(CSignature*) pTerminal->Get_T_Pointer();
314 if( m_pLexicon )
316 switch( m_SignatureType ) {
317 case STEM_INITIAL:
318 case WORD_INITIAL:
319 m_pLexicon->InsertPrefixSig( pSig );
320 break;
321 case WORD_FINAL:
322 case STEM_FINAL:
323 default:
324 m_pLexicon->InsertSuffixSig( pSig );
327 IncrementCorpusCount(1);
328 pSig ->IncrementCorpusCount(1);
329 m_SortValidFlag = FALSE;
330 m_HashHasChangedFlag = TRUE;
331 pSig->SetLexicon( m_pMiniLex );
332 pSig->SetSignatureCollection ( this );
333 //todo
334 pSig ->SetAffixLocation ( m_SignatureType );
335 return pSig;
339 //==============================================================================================//
344 //==============================================================================================//
345 void CSignatureCollection::SetMyPrefixes(CPrefixCollection* pAC){ MyPrefixes = pAC;}
346 void CSignatureCollection::SetMySuffixes(CSuffixCollection* pAC){ MySuffixes = pAC;}
349 void CSignatureCollection::FindDisplayOrdering()
351 int Size = GetCount();
352 CSignature* pSig,
353 *qSig;
356 Sort (SIGS);
358 for (int signo = 0; signo < Size; signo++)
360 pSig = GetAtSort(signo); // We're looking for pSig's mentor, if it has one
361 if (pSig->Size() < 2) continue;
362 for (int signo2 = 0; signo2 < signo; signo2++)
364 qSig = GetAtSort(signo2);
365 if ( qSig->Contains(pSig) )
367 pSig->SetMentor (qSig);
368 break;
370 else pSig->SetMentor( NULL );
374 m_SortStyle = SIG_MENTORS;
375 m_SortValidFlag = TRUE;
378 void CSignatureCollection::ListDisplay(
379 Q3ListView* pView, QMap<QString, QString>* filter)
381 CLexicon& lex = *m_pLexicon;
382 linguistica::ui::status_user_agent& status = lex.status_display();
384 // XXX. make these adjustable by user.
385 int MinimumNumberOfStemsForDisplay = 2;
386 int MinimumNumberOfAffixesForDisplay = 2;
388 if (GetCount() < 20)
389 MinimumNumberOfStemsForDisplay = 1;
391 pView->setSorting(6);
393 // Remove all previous columns
394 while (pView->columns() != 0)
395 pView->removeColumn(0);
396 pView->clear();
398 // Add Column headers
399 pView->addColumn("Signatures");
400 pView->addColumn("Exemplar");
401 pView->addColumn("Descr. Length", 100);
402 pView->addColumn("Corpus Count", 100);
403 pView->addColumn("Stem Count", 100);
404 pView->addColumn("Source");
405 pView->addColumn("Robustness");
407 pView->setColumnAlignment(0, Qt::AlignLeft);
408 pView->setColumnAlignment(1, Qt::AlignCenter);
409 pView->setColumnAlignment(2, Qt::AlignRight);
410 pView->setColumnAlignment(3, Qt::AlignCenter);
411 pView->setColumnAlignment(4, Qt::AlignCenter);
412 pView->setColumnAlignment(5, Qt::AlignCenter);
413 pView->setColumnAlignment(6, Qt::AlignCenter);
415 status.major_operation = "Creating signature list for display";
416 status.progress.clear();
417 FindDisplayOrdering();
418 status.progress.set_denominator(GetCount()-1);
419 for (int signo = GetCount()-1; signo >=0 ; signo--) {
420 CSignature* pSig = GetAtSort(signo);
421 status.progress = GetCount()-1 - signo;
422 if (pSig->GetMentor())
423 continue;
424 if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay)
425 continue;
426 if (pSig->Size() < MinimumNumberOfAffixesForDisplay)
427 continue;
429 CSignatureListViewItem* item = new CSignatureListViewItem(
430 pView, pSig->Express(), m_pMiniLex->GetIndex(), pSig, filter);
431 if (pSig->GetMentorList()) {
432 for (int signo2 = 0; signo2 < pSig->GetMentorList()->size(); signo2++) {
433 CSignature* qSig = pSig->GetMentorList()->at(signo2);
434 if (qSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay)
435 continue;
436 static_cast<void>(new CSignatureListViewItem(
437 item, qSig->Display(), m_pMiniLex->GetIndex(), qSig, filter));
439 item->setOpen(true);
442 status.progress.clear();
443 status.major_operation.clear();
446 void CSignatureCollection::BorrowedSigsDisplay(
447 Q3ListView* pView, QMap<QString, QString>* filter)
449 CLexicon& lex = *m_pLexicon;
450 linguistica::ui::status_user_agent& status = lex.status_display();
452 // Remove all previous columns
453 while (pView->columns() != 0)
454 pView->removeColumn(0);
456 // Add Column headers
457 pView->addColumn("Signatures");
458 pView->addColumn("Source");
460 // Display each item
461 status.major_operation = "Creating signature list for display";
462 status.progress.clear();
463 status.progress.set_denominator(GetCount());
464 for (int signo = 0; signo < (int)GetCount(); signo++) {
465 GetAt(signo)->BorrowedSigsDisplay(pView, filter);
466 status.progress = signo;
468 status.progress.clear();
469 status.major_operation.clear();
472 ////////////////////////////////////////////////////
473 ////////////////////////////////////////////////////
476 // Verbose Output
479 ////////////////////////////////////////////////////
480 ////////////////////////////////////////////////////
483 void CSignatureCollection::OutputSignatures( QString FileName )
485 QFile file( FileName );
487 if( file.open( QIODevice::WriteOnly ) )
489 QTextStream outf( &file );
490 outf.setEncoding( QTextStream::Unicode );
492 outf << "# Signature Count" << endl;
493 outf << "# ---------------" << endl;
494 outf << " " << GetCount() << " signatures" << endl << endl;
497 Sort( CORPUSCOUNT );
498 for (int i = 0; i < (int)GetCount(); i++)
500 GetAtSort(i)->OutputSignature( outf );
503 file.close();
507 void CSignatureCollection::OutputXfst( QString FileName )
509 QFile file( FileName );
511 if( file.open( IO_WriteOnly ) )
513 QTextStream outf( &file ); //Should be ascii file, not unicode
515 outf << "# " << endl;
516 outf << "# File: " << FileName << endl;
517 outf << "# Signature count: " << GetCount() << endl;
518 outf << "# " << endl;
519 // Sort( CORPUSCOUNT );
520 Sort( SIG_MENTORS );
521 for (int i = 0; i < (int)GetCount(); i++)
523 GetAtSort(i)->OutputSignatureXfst( outf, i+1 );
526 outf << endl;
527 outf << "union net" << endl << endl;
528 outf << "print words" << endl << endl;
530 file.close();
536 ////////////////////////////////////////////////////
537 ////////////////////////////////////////////////////
540 // Limited Output
543 ////////////////////////////////////////////////////
544 ////////////////////////////////////////////////////
546 /*void CSignatureCollection::LimitedOutput (QString Filename)
548 QFile file( Filename );
550 if( file.open( IO_WriteOnly ) )
552 QTextStream outf( &file );
553 int TotalWordCount = 0;
554 CSignature* pSig;
555 QString dummy;
556 int i = 0,
557 counter = 0;
558 int NumEntries = GetCount();
560 outf.setf(2); // Set fields left justified
561 outf << "# Index Signature StemCount AffixCount log(StemCount)*log(AffixCount)" << endl << endl;
563 Sort(SIGS);
564 for( i = 0; i < NumEntries; i++ )
566 pSig = GetAtSort(i);
568 outf << ++counter << " ";
569 outf << pSig -> Display( '.', m_pLexicon->GetOutFilter() ) << " ";
570 outf << pSig -> GetStems().Size() << " ";
571 outf << pSig->GetNumberOfAffixes()
572 << " " << log( pSig->GetStems().Size() ) * log ( pSig->GetNumberOfAffixes() )
573 << endl;
575 TotalWordCount += pSig->GetStemPtrList()->count() * pSig->GetNumberOfAffixes();
578 outf << endl << "Total number of words covered: " << TotalWordCount;
580 file.close();
582 return;
585 namespace {
586 struct cannot_parse_input : virtual std::exception { };
588 /// skip blank lines and comments
589 QString get_line(QTextStream& in)
591 QString buf;
592 do {
593 buf = in.readLine();
594 } while (buf.isEmpty() || buf[0] == '#');
595 return buf;
598 /// swallow end of line, throwing an exception if that involves
599 /// useful data.
600 void check_end_of_line(QTextStream& in)
602 QString remainder = in.readLine();
603 if (!remainder.isEmpty())
604 throw cannot_parse_input();
607 int string_to_int(QString s)
609 bool ok;
610 int result = s.toInt(&ok);
611 if (!ok)
612 throw cannot_parse_input();
613 return result;
617 void CSignatureCollection::ReadSignatureFile(QString Filename,
618 enum eAffixLocation SigType) { try
620 QFile file(Filename);
621 if (!file.open(QIODevice::ReadOnly))
622 return;
624 QTextStream inf(&file);
626 const int signature_count = string_to_int(
627 get_line(inf).trimmed());
629 delete[] m_PointerArray;
630 m_PointerArray = new CSignature*[signature_count];
632 for (int count = 1; count <= signature_count; ++count) {
633 QString sig_header = get_line(inf).trimmed();
635 // line 1:
636 // SP+ signature SP+ stem count SP+ corpus count SP+
637 QTextStream line_in(&sig_header, QIODevice::ReadOnly);
638 QString sig_graphemes, stem_count_text,
639 corpus_count_text;
640 line_in >> sig_graphemes >>
641 stem_count_text >> corpus_count_text;
642 check_end_of_line(line_in);
644 const QString sig_text = Filter(m_pLexicon->GetInFilter(),
645 sig_graphemes);
646 const int stem_count = string_to_int(stem_count_text);
647 const int corpus_count = string_to_int(corpus_count_text);
649 // line 2: signature origin
650 QString remark = get_line(inf).trimmed();
651 remark.replace(QChar('_'), QChar(' '));
653 std::auto_ptr<CSignature> sig(new CSignature(
654 SigType, m_pMiniLex));
655 sig->IngestSignature(sig_text);
656 sig->SetCorpusCount(corpus_count);
657 sig->SetRemark(remark);
658 sig->SetSignatureCollection(this);
660 for (int i = 0; i < stem_count; ++i) {
661 QString stem;
662 inf >> stem;
663 // We haven’t read the Stems.txt file
664 // yet, so just swallow each stem here.
665 // The stems will be read from Signatures.txt
666 // when it is read again in
667 // ReadSignatureFileBis.
670 CNode* terminal = Insert(sig_text);
671 m_PointerArray[GetCount() - 1] = sig.get();
672 terminal->SetPointer(sig.release());
674 } catch (cannot_parse_input) {
675 // XXX. report to user
676 std::cerr << "Signature.txt: cannot parse" << std::endl;
680 void CSignatureCollection::ReadSignatureFileBis(QString Filename) { try
682 CStemCollection* stems_ptr = m_pMiniLex->GetStems();
683 if (stems_ptr == 0)
684 return;
685 CStemCollection& stems = *stems_ptr;
687 QFile file(Filename);
688 if (!file.open(QIODevice::ReadOnly))
689 return;
690 QTextStream inf(&file);
692 const int signature_count = string_to_int(
693 get_line(inf).trimmed());
695 for (int signo = 0; signo < signature_count; ++signo) {
696 // see ReadSignatureFile().
697 QString sig_header = get_line(inf).trimmed();
698 QTextStream line_in(&sig_header, QIODevice::ReadOnly);
699 QString sig_graphemes, stem_count_text,
700 corpus_count_text;
701 line_in >> sig_graphemes >>
702 stem_count_text >> corpus_count_text;
703 check_end_of_line(line_in);
705 const QString sig_text = Filter(m_pLexicon->GetInFilter(),
706 sig_graphemes);
707 const int stem_count = string_to_int(stem_count_text);
709 CParse sig_parse;
710 sig_parse.IngestSignature(sig_text);
711 CSignature* sig = *this ^= sig_parse;
712 Q_ASSERT(sig != 0);
714 for (int stemno = 0; stemno < stem_count; ++stemno) {
715 QString stem_graphemes;
716 inf >> stem_graphemes;
718 const QString stem_text = Filter(
719 m_pLexicon->GetInFilter(), stem_graphemes);
721 CStem* stem = stems ^= stem_text;
722 if (stem == 0)
723 // XXX. stem missing from Stems.txt
724 continue;
726 sig->AppendStemPtr(stem);
729 if (is_initial(sig->GetAffixLocation())) {
730 for (int stemno = 0; stemno < sig->GetNumberOfStems(); stemno++)
732 CStem* stem = sig->GetStem(stemno);
733 for (int affixno = 1; affixno <= sig->Size(); ++affixno) {
734 CPrefix* affix =
735 *m_pMiniLex->GetPrefixes() ^=
736 sig->GetPiece(affixno);
737 if (affix == 0)
738 throw cannot_parse_input();
739 affix->AddStem(stem);
742 } else {
743 for (int stemno = 0; stemno < sig->GetNumberOfStems(); stemno++)
745 CStem* stem = sig->GetStem(stemno);
746 for (int affixno = 1; affixno <= sig->Size(); ++affixno) {
747 CSuffix* affix =
748 *m_pMiniLex->GetSuffixes() ^=
749 sig->GetPiece(affixno);
750 if (affix == 0)
751 throw cannot_parse_input();
752 affix->AddStem(stem);
757 Sort(SIGS);
758 } catch (cannot_parse_input) {
759 // XXX. report to user
760 std::cerr << "Signature.txt: cannot re-parse" << std::endl;
764 void CSignatureCollection::CheckRobustness()
766 CLexicon& lex = *m_pLexicon;
767 linguistica::ui::status_user_agent& status = lex.status_display();
769 status.major_operation = "Checking sig robustness";
770 status.progress.clear();
771 Sort(SIGS);
772 status.progress.set_denominator(GetCount());
773 for (int signo = 1; signo < (int)GetCount(); signo++) {
774 CSignature* pSig = GetAtSort(signo);
775 status.progress = signo;
776 for (int signo2 = 0; signo2 < signo; signo2++) {
777 CSignature* qSig = GetAtSort(signo2);
778 if (qSig->Contains(pSig)) {
779 pSig->SetRobustness(qSig->GetRobustness());
780 break;
784 status.progress.clear();
786 // XXX. not an operation
787 status.major_operation = "Robustness checking complete.";
790 int CSignatureCollection::GetTotalNumberOfWords()
792 int Total = 0;
793 for (int signo = 0; signo < (int)GetCount(); signo++)
795 Total += GetAt(signo)->GetNumberOfStems() * GetAt(signo)->Size();
797 return Total;
801 int CSignatureCollection::TheseTwoSuffixesShareHowManyStems(CSuffix* pSuffix1, CSuffix* pSuffix2)
803 CSignature * pSig;
804 int count = 0;
805 for (int signo = 0; signo < (int)GetCount(); signo++)
807 pSig= GetAt(signo);
808 if ( pSig->Contains (pSuffix1) && pSig->Contains (pSuffix2) )
810 count+= pSig->GetNumberOfStems();
813 return count;
816 void CSignatureCollection::CleanUp()
818 CSignature* pSig;
820 for (int signo = 0; signo < (int) GetCount(); signo++)
822 pSig = GetAt(signo);
823 if ( pSig->GetNumberOfStems() <= 0 || pSig->GetCorpusCount() <= 0 ) // -cs- 20040906 : added the second argument
825 // -cs- 20040602 : DeleteMarkedMembers wasn't actually finding any of the
826 // members to be to be deleted, so I changed it to remove them automatically,
827 // this fixed our word display bug (words weren't connected to their signature
828 RemoveMember(pSig);
833 void CSignatureCollection::AddPointer( CSignature* pSignature )
835 TCollection<CSignature>::AddPointer( pSignature );
837 if( m_pLexicon )
839 switch( m_SignatureType )
841 case STEM_INITIAL:
842 case WORD_INITIAL:
843 m_pLexicon->InsertPrefixSig( pSignature );
844 break;
845 case WORD_FINAL:
846 case STEM_FINAL:
847 default:
848 m_pLexicon->InsertSuffixSig( pSignature );
854 CSignature* CSignatureCollection::AddToCollection( CParse& Signature )
856 CSignature* pSignature = TCollection<CSignature>::AddToCollection( Signature );
858 if( m_pLexicon )
860 switch( m_SignatureType )
862 case STEM_INITIAL:
863 case WORD_INITIAL:
864 m_pLexicon->InsertPrefixSig( pSignature );
865 break;
866 case WORD_FINAL:
867 case STEM_FINAL:
868 default:
869 m_pLexicon->InsertSuffixSig( pSignature );
873 return pSignature;
877 CSignature* CSignatureCollection::AddToCollection( CStringSurrogate& Signature )
879 CSignature* pSignature = TCollection<CSignature>::AddToCollection( Signature );
881 if( m_pLexicon )
883 switch( m_SignatureType )
885 case STEM_INITIAL:
886 case WORD_INITIAL:
887 m_pLexicon->InsertPrefixSig( pSignature );
888 break;
889 case WORD_FINAL:
890 case STEM_FINAL:
891 default:
892 m_pLexicon->InsertSuffixSig( pSignature );
896 return pSignature;
900 void CSignatureCollection::Empty()
902 if( m_pLexicon )
904 CSignature* pSignature;
906 for( int signo = 0; signo < GetCount(); signo++ )
908 pSignature = GetAt(signo);
910 switch( m_SignatureType )
912 case STEM_INITIAL:
913 case WORD_INITIAL:
914 Q_ASSERT( m_pLexicon->RemovePrefixSig( pSignature ) );
915 break;
916 case WORD_FINAL:
917 case STEM_FINAL:
918 default:
919 Q_ASSERT( m_pLexicon->RemoveSuffixSig( pSignature ) );
924 TCollection<CSignature>::Empty();
928 void CSignatureCollection::RemoveAll()
930 if( m_pLexicon )
932 CSignature* pSignature;
934 for( int signo = 0; signo < GetCount(); signo++ )
936 pSignature = GetAt(signo);
938 switch( m_SignatureType )
940 case STEM_INITIAL:
941 case WORD_INITIAL:
942 Q_ASSERT( m_pLexicon->RemovePrefixSig( pSignature ) );
943 break;
944 case WORD_FINAL:
945 case STEM_FINAL:
946 default:
947 Q_ASSERT( m_pLexicon->RemoveSuffixSig( pSignature ) );
952 TCollection<CSignature>::RemoveAll();
956 bool CSignatureCollection::Remove( CSignature* pSignature )
959 if( m_pLexicon )
961 switch( m_SignatureType )
963 case STEM_INITIAL:
964 case WORD_INITIAL:
965 Q_ASSERT( m_pLexicon->RemovePrefixSig( pSignature ) );
966 break;
967 case WORD_FINAL:
968 case STEM_FINAL:
969 default:
970 Q_ASSERT( m_pLexicon->RemoveSuffixSig( pSignature ) );
971 //int dummy;
975 return TCollection<CSignature>::Remove( pSignature );
979 bool CSignatureCollection::RemoveMember( CSignature* pSignature )
982 if( m_pLexicon )
984 switch( m_SignatureType )
986 case STEM_INITIAL:
987 case WORD_INITIAL:
988 Q_ASSERT( m_pLexicon->RemovePrefixSig( pSignature ) );
989 break;
990 case WORD_FINAL:
991 case STEM_FINAL:
992 default:
993 Q_ASSERT( m_pLexicon->RemoveSuffixSig( pSignature ) );
997 return TCollection<CSignature>::RemoveMember( pSignature );
1001 bool CSignatureCollection::RemoveMember( CStringSurrogate& Signature )
1003 CSignature* pSignature = (CSignature*)Find1( Signature )->Get_T_Pointer();
1005 if( m_pLexicon )
1007 switch( m_SignatureType )
1009 case STEM_INITIAL:
1010 case WORD_INITIAL:
1011 m_pLexicon->RemovePrefixSig( pSignature ) ;
1012 break;
1013 case WORD_FINAL:
1014 case STEM_FINAL:
1015 default:
1016 Q_ASSERT( m_pLexicon->RemoveSuffixSig( pSignature ) );
1020 return TCollection<CSignature>::RemoveMember( Signature );
1024 bool CSignatureCollection::RemoveMember( CStringSurrogate& Signature, bool b )
1026 CSignature* pSignature = (CSignature*)Find1( Signature )->Get_T_Pointer();
1028 if( m_pLexicon )
1030 switch( m_SignatureType )
1032 case STEM_INITIAL:
1033 case WORD_INITIAL:
1034 m_pLexicon->RemovePrefixSig( pSignature ) ;
1035 break;
1036 case WORD_FINAL:
1037 case STEM_FINAL:
1038 default:
1039 m_pLexicon->RemoveSuffixSig( pSignature ) ;
1043 return TCollection<CSignature>::RemoveMember( Signature, b );
1047 void CSignatureCollection::DeleteMarkedMembers()
1049 if ( m_DeletionArray == NULL ) { return; }
1051 int count = GetCount();
1052 for (int signo = 0; signo < count; signo++)
1054 if ( m_DeletionArray[signo] == 1 )
1056 if( m_pLexicon )
1058 switch( m_SignatureType )
1060 case STEM_INITIAL:
1061 case WORD_INITIAL:
1062 m_pLexicon->RemovePrefixSig( m_PointerArray[signo] );
1063 break;
1064 case WORD_FINAL:
1065 case STEM_FINAL:
1066 default:
1067 m_pLexicon->RemoveSuffixSig( m_PointerArray[signo] );
1073 TCollection<CSignature>::DeleteMarkedMembers();
1077 void CSignatureCollection::GetIndividualCountsForEachStem ()
1079 CSignature* pSig;
1080 for (int signo = 0; signo < GetCount(); signo++)
1082 pSig = GetAt(signo);
1087 double CSignatureCollection::ComputeDLofInternalPointersOfEachMember(
1088 enum eMDL_STYLE /*unused*/)
1090 m_SumOfDLofPointersInternalToEachMember = 0;
1091 for (int signo = 0; signo < GetCount(); ++signo)
1093 CSignature* sig = GetAt(signo);
1094 m_SumOfDLofPointersInternalToEachMember +=
1095 sig->GetSumOfDLofInternalPointers();
1097 return m_SumOfDLofPointersInternalToEachMember;
1100 // MDL JG August 2006
1101 double CSignatureCollection::ComputeLengthOfPointersToEachOfMyMembers (eMDL_STYLE style )
1103 double Denominator = 0;
1104 double ptr;
1105 int m_DLofPointersToMyMembers = 0;
1108 if (style == CorpusCount )
1110 for (int signo = 0; signo < GetCount(); signo++)
1112 Denominator += GetAt(signo)->GetCorpusCount();
1114 for (int signo = 0; signo < GetCount(); signo++)
1116 ptr = base2log ( Denominator/ GetAt(signo)->GetCorpusCount() );
1117 GetAt(signo)->SetLengthOfPointerToMe (ptr ) ;
1118 m_DLofPointersToMyMembers += ptr;
1122 else if (style == GrammarCount )
1124 for (int signo = 0; signo < GetCount(); signo++)
1126 Denominator += GetAt(signo)->GetNumberOfStems() * GetAt(signo)->GetNumberOfAffixes();
1128 for (int signo = 0; signo < GetCount(); signo++)
1130 ptr = base2log ( Denominator/ GetAt(signo)->GetCorpusCount() );
1131 GetAt(signo)->SetLengthOfPointerToMe (ptr ) ;
1132 m_DLofPointersToMyMembers += ptr;
1135 return m_DLofPointersToMyMembers;
1137 ///----------------------------------------------------------->>>>>
1138 ///----------------------------------------------------------->>>>>
1139 // CompareSignaturePairsForTotalOverlap()
1140 ///----------------------------------------------------------->>>>>
1141 void CSignatureCollection::CompareSignaturePairsForTotalOverlap()
1143 CSignatureAlignment* pSigAlignment;
1144 CMiniLexicon& lex = *GetMiniLexicon();
1145 lex.LogFileLargeTitle("Comparing pairs of stems for total overlap");
1146 CSignature *pSig, *qSig=NULL;
1148 GetMiniLexicon()->LogFileStartTable();
1150 for (int signo =0; signo < GetCount(); signo++)
1152 pSig = GetAtSort(signo);
1153 if (pSig->Size() < 2 ) {continue;}
1154 //if (pSig->GetNumberOfStems() < 5) continue;
1155 lex.LogFileStartRow();
1156 lex.LogFileSimpleString(pSig->Display());
1157 for (int signo2 = signo+1; signo2 < GetCount(); signo2++)
1159 qSig = GetAtSort(signo2);
1160 if (qSig->Size() < 2 ) {continue;}
1161 if (pSig->Size() != qSig->Size() ) continue;
1162 pSigAlignment = new CSignatureAlignment (pSig, qSig);
1163 if ( pSigAlignment->FindWhetherOneIsSuffixOfTheOther() )
1166 lex.LogFileSimpleString(qSig->Display());
1167 RecutLongerSigToMatchTheShorter (pSigAlignment);
1170 delete pSigAlignment;
1172 lex.LogFileEndRow();
1174 lex.LogFileEndTable();
1177 void CSignatureCollection::RecutLongerSigToMatchTheShorter(CSignatureAlignment* pSigAlignment)
1179 CSignature* pSig = pSigAlignment->LongerSig();
1180 for (int stemno = 0; stemno < pSig->GetNumberOfStems(); stemno++)
1182 for (int suffixno = 0; suffixno < pSig->GetNumberOfAffixes(); suffixno++)
1184 CStem* pWord = pSig->GetWord(stemno, suffixno);
1185 pWord->MoveThisManyLettersLeftwardFromArg1(1,pSigAlignment->difference().length());
1194 ///----------------------------------------------------------->>>>>
1195 // Allomorphy
1196 ///----------------------------------------------------------->>>>>
1197 void CSignatureCollection::FindAllomorphy()
1199 CSignatureAlignment* pSigAlignment;
1200 CSignature *pSig, *qSig=NULL;
1201 int MinimumNumberOfStems = 5;
1203 CMiniLexicon& lex = *GetMiniLexicon();
1204 lex.LogFileLargeTitle("Allomorphy");
1205 lex.LogFileStartTable();
1207 for (int signo =0; signo < GetCount(); signo++)
1209 pSig = GetAtSort(signo);
1210 if (pSig->GetNumberOfStems() < MinimumNumberOfStems ) {continue;}
1211 int size = pSig->Size();
1212 if ( size < 2 ) {continue;}
1214 for (int signo2 = signo+1; signo2 < GetCount(); signo2++)
1216 qSig = GetAtSort(signo2);
1217 if (qSig->GetNumberOfStems() < MinimumNumberOfStems ) {continue;}
1218 if (qSig->Size() < 2 ) {continue;}
1219 if ( size != qSig->Size() ) continue;
1220 pSigAlignment = new CSignatureAlignment (pSig, qSig);
1221 pSigAlignment->FindBestAlignment();
1222 if (GetMiniLexicon()->LogFileOn() && pSigAlignment->GetAffixAlignments()->count() == size )
1224 pSigAlignment->Display( *GetMiniLexicon()->GetLogFile());
1226 delete pSigAlignment;
1229 lex.LogFileEndTable();