CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / Compound.cpp
blob96559ccd383f812525183a20cb32e205d3eb5814
1 // Implementation of CCompound, CCompoundListViewItem methods
2 // Copyright © 2009 The University of Chicago
3 #include "Compound.h"
5 #include <QList>
6 #include "Lexicon.h"
7 #include "Linker.h"
8 #include "Edge.h"
9 #include "Stem.h"
10 #include "CompoundCollection.h"
11 #include "LinkerCollection.h"
12 #include "StringFunc.h"
13 #include "Typedefs.h"
15 CCompoundListViewItem::CCompoundListViewItem( Q3ListView *parent,
16 QString compound,
17 CCompound* pCompound,
18 int parse,
19 double score,
20 QString mostFreqPiece,
21 double MFPCount,
22 QString pieceCounts,
23 double prefixness,
24 double suffixness,
25 int parseCount,
26 StringToString* filter )
27 : Q3ListViewItem( parent, compound )
29 m_compound = pCompound;
30 m_parse = parse;
31 m_score = score;
32 m_mostFreqPiece = mostFreqPiece;
33 m_MFPCount = MFPCount;
34 m_pieceCounts = pieceCounts;
35 m_prefixness = prefixness;
36 m_suffixness = suffixness;
37 m_parseCount = parseCount;
38 m_filter = filter;
42 CCompoundListViewItem::CCompoundListViewItem( Q3ListViewItem *parent,
43 QString compound,
44 CCompound* pCompound,
45 int parse,
46 double score,
47 QString mostFreqPiece,
48 double MFPCount,
49 QString pieceCounts,
50 double prefixness,
51 double suffixness,
52 int parseCount,
53 StringToString* filter )
54 : Q3ListViewItem( parent, compound )
56 m_compound = pCompound;
57 m_parse = parse;
58 m_score = score;
59 m_mostFreqPiece = mostFreqPiece;
60 m_MFPCount = MFPCount;
61 m_pieceCounts = pieceCounts;
62 m_prefixness = prefixness;
63 m_suffixness = suffixness;
64 m_parseCount = parseCount;
65 m_filter = filter;
69 QString CCompoundListViewItem::key(int col, bool asc) const
71 switch (col) {
72 case 1:
73 return QString("%1").arg(m_parseCount, 10);
74 case 2:
75 return QString("%1").arg(static_cast<int>(
76 1000 * m_MFPCount), 10);
77 case 3:
78 return QString("%1").arg(static_cast<int>(
79 1000 * m_prefixness), 10);
80 case 4:
81 return QString("%1").arg(static_cast<int>(
82 1000 * m_suffixness), 10);
83 default:
84 return Q3ListViewItem::key(col, asc);
88 QString CCompoundListViewItem::text(int col) const
90 switch (col) {
91 case 1:
92 return QString::number(m_parseCount);
93 case 2:
94 if (m_parse < 0)
95 return QString();
96 return Filter(m_filter, m_mostFreqPiece);
97 case 3:
98 if (m_parse < 0)
99 return QString();
100 return QString::number(m_prefixness, 'f', 4);
101 case 4:
102 if (m_parse < 0)
103 return QString();
104 return QString::number(m_suffixness, 'f', 4);
105 default:
106 return Q3ListViewItem::text(col);
110 CCompound::CCompound( CMiniLexicon* mini ) : CLParse( mini )
112 m_MyComponents = new Components();
113 m_MyLinkers = new Linkers();
114 m_Parses = NULL;
115 m_BestParse = -1;
116 m_pLexicon = NULL;
121 CCompound::CCompound ( const CStringSurrogate& SS, CMiniLexicon* mini ) : CLParse (SS, mini)
123 m_MyComponents = new Components();
124 m_MyLinkers = new Linkers();
125 m_Parses = NULL;
126 m_BestParse = -1;
127 m_pLexicon = NULL;
131 CCompound::~CCompound()
133 if( m_MyComponents ) delete m_MyComponents;
134 if( m_MyLinkers ) delete m_MyLinkers;
135 if( m_Parses ) delete m_Parses;
140 void CCompound::CompoundListDisplay( Q3ListView* List, StringToString* filter, QChar separator )
142 int i;
144 double usage, score,
145 MFPCount = 0.0,
146 mostMFPCount = 0.0,
147 bestScore = 0.0,
148 prefixness = 0.0,
149 suffixness = 0.0,
150 highest_prefixness = 0.0,
151 highest_suffixness = 0.0;
153 usage = 0.0;
154 score = 0.0;
156 QStringList pieceCounts;
157 QString mostFreqStem;
159 CStem* pStem;
160 CLinker* pLinker;
161 CParse parse;
163 StemSet* pStemSet;
165 int MINIMUM_STEM_LENGTH = m_pLexicon->GetIntParameter( "Main\\MinimumStemLength", 3 );
167 CCompoundListViewItem* parent, * item;
169 if( m_Parses )
171 CEdge* pEdge;
172 int index = 0;
174 if( m_Parses->count() > 1 )
176 parent = new CCompoundListViewItem( List, Display( filter ), this, -1, 0.0, QString::null, 0.0, QString::null, 0.0, 0.0, m_Parses->count(), filter );
177 parent->setOpen( TRUE );
179 int parseNumber = 0;
180 for( pEdge = m_Parses->first(); pEdge; pEdge = m_Parses->next() )
181 //for (int z = 0; z < m_Parses->size(); z++)
182 { // pEdge = m_Parses->at(z);
183 pEdge->GetParse( &parse );
185 mostMFPCount = 0.0;
186 bestScore = 0.0;
188 pieceCounts.clear();
190 score = 1.0;
191 for( i = 1; i <= parse.Size(); i++ )
193 pLinker = NULL;
194 pStemSet = NULL;
196 if( parse.GetPiece(i).Display().length() >= MINIMUM_STEM_LENGTH )
198 pStemSet = m_pLexicon->GetAllStems()->find( parse.GetPiece(i).Display() );
199 if( !pStemSet ) pStemSet = m_pLexicon->GetAllWords()->find( parse.GetPiece(i).Display() );
202 if( !pStemSet ) pLinker = *m_pLexicon->GetLinkers() ^= parse.GetPiece(i);
204 if( pStemSet || pLinker )
206 if( pLinker )
208 usage = 2.0; // This gives us the identity (when 1 is subtracted) for linking elements, we don't want to count them
209 pieceCounts.append( QString("%1").arg( pLinker->GetCompoundCount(), 0, 'f', 1 ) );
211 else
213 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
214 for (int z= 0; z < pStemSet->size(); z++)
215 { pStem = pStemSet->at(z);
216 usage = pStem->GetCompoundCount();
217 pieceCounts.append( QString("%1").arg( usage, 0, 'f', 1 ) );
219 if( usage == 0.0 ) continue;
221 if( usage > MFPCount )
223 mostFreqStem = pStem->Display();
224 MFPCount = usage;
227 break;
231 else
233 score = -1.0;
234 break;
237 score *= usage - 1.0;
239 if( pStemSet && i == 1 ) prefixness = pStemSet->at(0)->GetAffixness();
240 if( pStemSet && i == parse.Size() ) suffixness = pStemSet->at(0)->GetAffixness();
243 if( MFPCount > mostMFPCount ) parent->SetMFSCount( MFPCount );
244 if( score > bestScore ) parent->SetScore( score );
245 if( prefixness > highest_prefixness ) parent->SetPrefixness( prefixness );
246 if( suffixness > highest_suffixness ) parent->SetSuffixness( suffixness );
248 if( parseNumber == GetBestParse() )
250 item = new CCompoundListViewItem( parent, "*" + parse.Display( separator, filter ),
251 this, index++, score, mostFreqStem, MFPCount,
252 pieceCounts.join(", "),
253 prefixness, suffixness, 1, filter );
255 else
257 item = new CCompoundListViewItem( parent, parse.Display( separator, filter ),
258 this, index++, score, mostFreqStem, MFPCount,
259 pieceCounts.join(", "),
260 prefixness, suffixness, 1, filter );
262 parseNumber++;
265 else
267 m_Parses->first()->GetParse( &parse );
269 score = 1.0;
270 for( i = 1; i <= parse.Size(); i++ )
272 pLinker = NULL;
274 pStemSet = NULL;
276 if( parse.GetPiece(i).Display().length() >= MINIMUM_STEM_LENGTH )
278 pStemSet = m_pLexicon->GetAllStems()->find( parse.GetPiece(i).Display() );
279 if( !pStemSet ) pStemSet = m_pLexicon->GetAllWords()->find( parse.GetPiece(i).Display() );
282 if( !pStemSet ) pLinker = *m_pLexicon->GetLinkers() ^= parse.GetPiece(i);
284 if( pStemSet || pLinker )
286 if( pLinker )
288 usage = 2.0; // This gives us the identity (when 1 is subtracted) for linking elements, we don't want to count them
289 pieceCounts.append( QString("%1").arg( pLinker->GetCompoundCount(), 0, 'f', 1 ) );
291 else
293 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
294 for (int z = 0; z < pStemSet->size(); z++)
295 { pStem = pStemSet->at(z);
296 usage = pStem->GetCompoundCount();
297 pieceCounts.append( QString("%1").arg( usage, 0, 'f', 1 ) );
299 if( usage == 0.0 ) continue;
301 if( usage > MFPCount )
303 mostFreqStem = pStem->Display();
304 MFPCount = usage;
307 break;
311 else
313 score = -1.0;
314 break;
317 score *= usage - 1.0;
319 if( pStemSet && i == 1 ) prefixness = pStemSet->at(0)->GetAffixness();
320 if( pStemSet && i == parse.Size() ) suffixness = pStemSet->at(0)->GetAffixness();
323 m_Parses->first()->GetParse( &parse );
324 item = new CCompoundListViewItem( List, parse.Display( separator, filter ),
325 this, index++, score, mostFreqStem, MFPCount,
326 pieceCounts.join(", "),
327 prefixness, suffixness, m_Parses->count(), filter );
330 else
332 item = new CCompoundListViewItem( List, Display( separator, filter ), this );
337 void CCompound::DetachAllPieces()
339 CLinker* pLinker;
341 // Not necessary unless we have some tracking of compounds going on in
342 // the stems, which we don't ... yet. (TODO : this + corpus count?)
343 m_MyComponents->clear();
345 Linkers::Iterator it;
346 for( it = m_MyLinkers->begin(); it != m_MyLinkers->end(); ++it )
348 pLinker = it.data();
350 pLinker->IncrementCorpusCount( -1 * GetCorpusCount() );
352 // The linker is not deleted because it may be part of another
353 // parse of the same compound
355 m_MyLinkers->clear();
359 void CCompound::SetBestParse(int i)
361 if (i >= 0 && static_cast<unsigned int>(i) >= m_Parses->count())
362 return;
363 if (i < 0) {
364 m_BestParse = i;
365 return;
368 const int MINIMUM_STEM_LENGTH = m_pLexicon->GetIntParameter(
369 "Main\\MinimumStemLength", 3);
371 CEdge* pEdge = m_Parses->at(i);
372 CParse oneParse;
373 pEdge->GetParse(&oneParse);
374 this->Collapse(CStringSurrogate(oneParse.Display('.')), '.');
376 // Detach components from stems and linkers
377 DetachAllPieces();
379 // Attach components to stems and linkers
380 for (int j = 1; j <= Size(); ++j) {
381 CStringSurrogate piece_surrogate = GetPiece(j);
382 QString piece = piece_surrogate.Display();
384 QList<CStem*>* pStemSet = 0;
386 if (oneParse.GetPiece(j).Display().size() >=
387 MINIMUM_STEM_LENGTH) {
388 pStemSet = m_pLexicon->GetAllStems()->find(piece);
389 if (pStemSet == 0)
390 pStemSet = m_pLexicon->GetAllWords()->find(
391 piece);
394 if (pStemSet != 0) {
395 m_MyComponents->insert(j, pStemSet);
396 // XXX. corpus count?
397 } else {
398 // it is a linker element
399 if (CLinker* pLinker = *m_pLexicon->GetLinkers() ^=
400 piece_surrogate) {
401 m_MyLinkers->insert(j, pLinker);
402 pLinker->IncrementCorpusCount(
403 GetCorpusCount());
407 m_BestParse = i;
410 void CCompound::SetParses( Q3PtrList<CEdge>* parses, double* pComponentCount, double* pLinkerCount )
412 CStem* pStem;
413 CEdge* pEdge;
414 CParse oneParse;
415 QString compound;
416 CSS ssCompound;
417 CLinker* pLinker;
418 StemSet* pStemSet;
420 // unused variable 'componentCoun'
421 // double componentCount = 0.0;
423 ComponentMap* allComponents = m_pLexicon->GetCompounds()->GetComponentMap();
425 int MINIMUM_STEM_LENGTH = m_pLexicon->GetIntParameter( "Main\\MinimumStemLength", 3 );
427 // Detach components, stems, and linkers
428 if( m_Parses )
430 for( pEdge = m_Parses->first(); pEdge; pEdge = m_Parses->next() )
431 //for (int z = 0; z < m_Parses->size(); z++)
432 { // pEdge = m_Parses->at(z);
433 pEdge->GetParse( &oneParse );
434 compound = oneParse.Display('.');
435 ssCompound = compound;
436 oneParse.Collapse( ssCompound, '.' );
438 for( int j = 1; j <= oneParse.Size(); j++ )
440 pStemSet = NULL;
441 if( allComponents->find( oneParse.GetPiece(j).Display() ) != allComponents->end() )
443 pStemSet = allComponents->find( oneParse.GetPiece(j).Display() ).data();
446 if( pStemSet )
448 // TODO: remove stem to compound links (these don't exist yet)
449 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
450 for (int z= 0; z < pStemSet->size(); z++)
451 { pStem = pStemSet->at(z);
452 pStem->IncrementCompoundCount(
453 -double(GetCorpusCount()) /
454 m_Parses->count());
455 if( pStem->GetCompoundCount() <= 0.0 )
457 allComponents->remove( oneParse.GetPiece(j).Display() );
458 pStem->SetCompoundCount( 0.0 );
459 pStem = NULL;
463 else
465 // It is a linker element
466 pLinker = *m_pLexicon->GetLinkers() ^= oneParse.GetPiece(j);
468 if( pLinker )
470 pLinker->RemoveCompound( this );
471 pLinker->IncrementCompoundCount(
472 -double(1.0) /
473 m_Parses->count());
474 pLinker->IncrementCorpusCount(
475 // XXX. why integer?
476 static_cast<int>(
477 double(-1) *
478 GetCorpusCount() /
479 m_Parses->count()));
481 if( pLinker->GetCompoundCount() <= 0.0 )
483 m_pLexicon->GetLinkers()->RemoveMember( pLinker );
484 pLinker = NULL;
492 if( m_Parses ) delete m_Parses;
493 m_Parses = parses;
495 for( pEdge = m_Parses->first(); pEdge; pEdge = m_Parses->next() )
496 //for (int z = 0; z < m_Parses->size(); z++)
497 { //pEdge = m_Parses->at(z);
498 pEdge->GetParse( &oneParse );
500 compound = oneParse.Display('.');
501 ssCompound = compound;
502 oneParse.Collapse( ssCompound, '.' );
504 // Attach components, stems, and linkers
505 for( int j = 1; j <= oneParse.Size(); j++ )
507 pStemSet = NULL;
509 if( allComponents->find( oneParse.GetPiece(j).Display() ) != allComponents->end() )
511 pStemSet = allComponents->find( oneParse.GetPiece(j).Display() ).data();
513 else if( oneParse.GetPiece(j).Display().length() >= MINIMUM_STEM_LENGTH )
515 pStemSet = m_pLexicon->GetAllStems()->find( oneParse.GetPiece(j).Display() );
517 if( !pStemSet ) pStemSet = m_pLexicon->GetAllWords()->find( oneParse.GetPiece(j).Display() );
519 if( pStemSet ) allComponents->insert( oneParse.GetPiece(j).Display(), pStemSet );
522 if( pStemSet )
524 m_MyComponents->insert( j, pStemSet );
525 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
526 for (int z = 0; z < pStemSet->size(); z++)
528 pStem= pStemSet->at(z);
529 pStem->IncrementCompoundCount(
530 double(GetCorpusCount()) /
531 m_Parses->count());
534 *pComponentCount +=
535 double(1.0) / m_Parses->count();
537 else
539 // It is a linker element
540 pLinker = *m_pLexicon->GetLinkers() << oneParse.GetPiece(j);
542 if( pLinker )
544 m_MyLinkers->insert( j, pLinker );
545 pLinker->AddCompound( this );
546 pLinker->IncrementCompoundCount(
547 double(1.0) /
548 m_Parses->count());
549 pLinker->IncrementCorpusCount(
550 // XXX. why integer?
551 static_cast<int>(
552 double(GetCorpusCount()) /
553 m_Parses->count()));
554 *pLinkerCount += double(1.0) /
555 m_Parses->count();
563 void CCompound::AddParse( CEdge* pEdge )
565 CStem* pStem;
566 CEdge* qEdge;
567 CParse oneParse;
568 QString compound;
569 CSS ssCompound;
570 CLinker* pLinker;
571 StemSet* pStemSet;
573 if( !pEdge ) return;
575 int MINIMUM_STEM_LENGTH = m_pLexicon->GetIntParameter( "Main\\MinimumStemLength", 3 );
577 m_Parses->append( pEdge );
579 for( qEdge = m_Parses->first(); qEdge; qEdge = m_Parses->next() )
580 //for (int z =0; z < m_Parses->size(); z++)
581 { //qEdge = m_Parses->at(z);
582 qEdge->GetParse( &oneParse );
583 compound = oneParse.Display('.');
584 ssCompound = compound;
585 oneParse.Collapse( ssCompound, '.' );
587 // Attach components, stems, and linkers
588 // Recalculate counts
589 for( int j = 1; j <= oneParse.Size(); j++ )
591 pStemSet = NULL;
593 if( oneParse.GetPiece(j).Display().length() >= MINIMUM_STEM_LENGTH )
595 pStemSet = m_pLexicon->GetAllStems()->find( oneParse.GetPiece(j).Display() );
597 if( !pStemSet ) pStemSet = m_pLexicon->GetAllWords()->find( oneParse.GetPiece(j).Display() );
600 if( pStemSet )
602 if( qEdge == pEdge )
604 m_MyComponents->insert( j, pStemSet );
605 // TODO: add stem to compound links
607 else
609 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
610 for (int z = 0; z < pStemSet->size(); z++)
611 { pStem = pStemSet->at(z);
612 pStem->IncrementCompoundCount(
613 -double(GetCorpusCount()) /
614 (double(m_Parses->count()) -
615 1.0));
619 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
620 for (int z = 0; z < pStemSet->size(); z++)
621 { pStem = pStemSet->at(z);
622 pStem->IncrementCompoundCount(
623 double(GetCorpusCount()) /
624 m_Parses->count());
627 else
629 // It is a linker element
630 pLinker = *m_pLexicon->GetLinkers() << oneParse.GetPiece(j);
632 if( pLinker )
634 if( qEdge == pEdge )
636 m_MyLinkers->insert( j, pLinker );
637 pLinker->AddCompound( this );
639 else
641 pLinker->IncrementCompoundCount(
642 -double(1.0) /
643 (double(m_Parses->count()) -
644 1.0));
645 pLinker->IncrementCorpusCount(
646 // XXX. why integer?
647 static_cast<int>(
648 -double(GetCorpusCount()) /
649 (double(m_Parses->count()) -
650 1.0)));
653 pLinker->IncrementCompoundCount(
654 // XXX. why integer?
655 static_cast<int>(
656 double(1.0) /
657 m_Parses->count()));
658 pLinker->IncrementCorpusCount(
659 // XXX. why integer?
660 static_cast<int>(
661 double(GetCorpusCount()) /
662 m_Parses->count()));
670 bool CCompound::RemoveParse( CEdge* pEdge )
672 CStem* pStem;
673 CEdge* qEdge;
674 CParse oneParse;
675 QString compound;
676 CSS ssCompound;
677 CLinker* pLinker;
678 StemSet* pStemSet;
680 if( !pEdge ) return FALSE;
682 int MINIMUM_STEM_LENGTH = m_pLexicon->GetIntParameter( "Main\\MinimumStemLength", 3 );
684 int pos = m_Parses->find( pEdge );
685 if( pos < 0 ) return FALSE;
687 for( qEdge = m_Parses->first(); qEdge; qEdge = m_Parses->next() )
688 //for (int z = 0; z < m_Parses->size(); z++)
690 // qEdge = m_Parses->at(z);
691 qEdge->GetParse( &oneParse );
692 compound = oneParse.Display('.');
693 ssCompound = compound;
694 oneParse.Collapse( ssCompound, '.' );
696 // Attach components, stems, and linkers
697 // Recalculate counts
698 for( int j = 1; j <= oneParse.Size(); j++ )
700 pStemSet = NULL;
702 if( oneParse.GetPiece(j).Display().length() >= MINIMUM_STEM_LENGTH )
704 pStemSet = m_pLexicon->GetAllStems()->find( oneParse.GetPiece(j).Display() );
706 if( !pStemSet ) pStemSet = m_pLexicon->GetAllWords()->find( oneParse.GetPiece(j).Display() );
709 if( pStemSet )
711 if( qEdge == pEdge )
713 // TODO: remove stem to compound links
715 else
717 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
718 for (int y = 0; y < pStemSet->size(); y++)
719 { pStem = pStemSet->at(y);
720 pStem->IncrementCompoundCount(
721 double(GetCorpusCount()) /
722 (double(m_Parses->count()) -
723 1.0));
727 //for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
728 for (int w = 0; w < pStemSet->size(); w++)
729 { pStem= pStemSet->at(w);
730 pStem->IncrementCompoundCount(
731 -double(GetCorpusCount()) /
732 m_Parses->count());
735 else
737 // It is a linker element
738 pLinker = *m_pLexicon->GetLinkers() << oneParse.GetPiece(j);
740 if( pLinker )
742 if( qEdge == pEdge )
744 pLinker->RemoveCompound( this );
746 else
748 pLinker->IncrementCompoundCount(
749 double(1.0) /
750 (double(m_Parses->count()) -
751 1.0));
752 pLinker->IncrementCorpusCount(
753 static_cast<int>(
754 double(GetCorpusCount()) /
755 (double(m_Parses->count()) -
756 1.0)));
759 pLinker->IncrementCompoundCount(
760 -double(1.0) / m_Parses->count());
761 pLinker->IncrementCorpusCount(
762 // XXX. why integer?
763 static_cast<int>(
764 -double(GetCorpusCount()) /
765 m_Parses->count()));
767 if( pLinker->GetCompoundCount() <= 0.0 )
769 m_pLexicon->GetLinkers()->RemoveMember( pLinker );
776 m_Parses->remove( qEdge ) ; //@@@@ check that this is right -- JG
778 if( pos == m_BestParse ) SetBestParse(-1);
780 return TRUE;
784 StemSet* CCompound::GetComponent( int i ) const
786 if( m_MyComponents->find(i) == m_MyComponents->end() ) return NULL;
787 return m_MyComponents->find(i).data();
791 CLinker* CCompound::GetLinker( int i ) const
793 if( m_MyLinkers->find(i) == m_MyLinkers->end() ) return NULL;
794 return m_MyLinkers->find(i).data();
798 double CCompound::GetPrefixness()
800 StemSet* compound = GetComponent(0);
801 if( compound ) return compound->first()->GetAffixness();
802 else return 0.0;
806 double CCompound::GetSuffixness()
808 StemSet* compound = GetComponent( m_MyComponents->count() - 1 );
809 if( compound ) return compound->first()->GetAffixness();
810 else return 0.0;
814 QString CCompound::DisplayParse( int i, StringToString* filter )
816 CEdge* pEdge = m_Parses->at(i);
817 if( !pEdge ) return "";
819 return pEdge->DisplayParse( filter );