1 // Compound discovery and analysis within the Lexicon class
2 // Copyright © 2009 The University of Chicago
7 #include "linguisticamainwindow.h"
9 #include "EarleyParser.h"
10 #include "MiniLexicon.h"
11 #include "GrammarRule.h"
12 #include "Signature.h"
19 #include "TerminalRuleCollection.h"
20 #include "CompoundCollection.h"
21 #include "LinkerCollection.h"
22 #include "WordCollection.h"
23 #include "StemCollection.h"
/*
 * Displays all compounds in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>. The <i>separator</i> will be
 * placed between components of a compound in the first column.
 */
void CLexicon::CompoundListDisplay( Q3ListView* pView, StringToString* filter, QChar separator )
	// NOTE(review): the function's opening brace, its closing braces, and the
	// declarations of the locals used below (n, MostParses, pCompound) are not
	// visible in this view of the file; the visible code is annotated as-is.
	pView->setRootIsDecorated( FALSE );

	// First pass: find the largest parse count any compound has.
	// (MostParses is presumably used for display sizing -- its declaration and
	// any later use are not visible here; TODO confirm.)
	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		if( !m_pCompounds->GetAt(i)->GetParses() ) continue;
		n = m_pCompounds->GetAt(i)->GetParses()->count();
		if( n > MostParses ) MostParses = n;

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Rebuild the column headers for the compound list.
	pView->addColumn( "Compound" );
	pView->addColumn( "# Parses" );
	pView->addColumn( "Most Frequent Stem" );
	pView->addColumn( "Prefixness" );
	pView->addColumn( "Suffixness" );

	m_pCompounds->Sort(KEY);

	// Report progress through the status display while filling the view.
	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating compound list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetCount());

	// Second pass: one list entry per compound, in sorted order; each
	// compound renders its own row.
	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		pCompound = m_pCompounds->GetAtSort(i);
		pCompound->CompoundListDisplay(pView, filter, separator);

	status.progress.clear();
	status.major_operation.clear();
/*
 * Displays all linkers in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>.
 */
void CLexicon::LinkerListDisplay(Q3ListView* pView, QMap<QString, QString>* filter)
	// NOTE(review): the function's opening and closing braces are not visible
	// in this view of the file; the visible code is annotated as-is.
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Column headers for the linker list.
	pView->addColumn( "Linker" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );
	pView->addColumn( "Compounds" );

	m_pLinkers->Sort(KEY);

	// Report progress through the status display while filling the view.
	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating linker list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pLinkers->GetCount());

	// One list entry per linker, in sorted order; each linker renders its
	// own row.
	for (int i = 0; i < (int)m_pLinkers->GetCount(); i++) {
		CLinker* pLinker = m_pLinkers->GetAtSort(i);
		pLinker->ListDisplay( pView, filter );

	status.progress.clear();
	status.major_operation.clear();
/*
 * Displays all compound components in the collection view <i>pView</i>.
 * All joined characters are re-filtered on output to separate characters
 * with the lexicon's filter.
 */
void CLexicon::CompoundComponentListDisplay(Q3ListView* pView)
	// NOTE(review): the function's opening and closing braces are not visible
	// in this view of the file; the visible code is annotated as-is.
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Compound component" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );
	// NOTE(review): three columns are declared but the items created below
	// supply only two values -- the "Compound Count" column appears to stay
	// empty; confirm against the full source.

	linguistica::ui::status_user_agent& status = status_display();
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponents()->GetCount());

	// One row per component: its key and its corpus count.
	for (int i = 0; i < (int)m_pCompounds->GetComponents()->GetCount(); i++) {
		CStem* pStem = m_pCompounds->GetComponents()->GetAtSort(i);
		// The Q3ListViewItem registers itself with pView on construction;
		// the void cast documents that the pointer is deliberately unused.
		static_cast<void>(new Q3ListViewItem(
				pView, pStem->GetKey().Display(),
				IntToStringWithCommas(pStem->GetCorpusCount())));

	status.progress.clear();
	status.major_operation.clear();
/*
 * FromStemsFindFlatCompounds
 *
 * Builds a "flat" compounding grammar -- Start --> Word, Word --> Compound,
 * Compound --> Root...Root with 2..maxNumberOfRoots roots, optionally with a
 * Linker between roots -- and Earley-parses every stem and every unanalyzed
 * word against it, turning each successful parse into a CCompound in
 * m_pCompounds.
 *
 * compounds        known compounds (scanned to track the longest one)
 * components       known components; added to the Root terminal rules
 * linker           explicit linker string; if null, wildcard linkers up to
 *                  MAXIMUM_LINKER_LENGTH are generated instead
 * maxNumberOfRoots upper bound on the number of roots in one compound
 *
 * NOTE(review): braces and several local declarations (stemCount, pMini,
 * mini, j, pRule, pEdge, oneParse, linkerCount) plus some "rule = ..."
 * reset lines are missing from this view of the file; the visible code is
 * annotated as-is.
 */
void CLexicon::FromStemsFindFlatCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker, int maxNumberOfRoots )
	// Registry-tunable ceiling on Earley parse depth.
	int MaximumParseDepth = GetIntParameter ("EarleyParser\\MaximumParseDepth", 6 );

	// We need to have analyzed some stems.
	for (int i = 0; i < static_cast <int> ( m_pMiniLexica->size() ) ; i++ )
		pMini = (*m_pMiniLexica)[i];
		if( pMini ) stemCount += pMini->GetStems()->GetCount();
	// Nothing to do until at least one mini-lexicon has stems.
	if( stemCount == 0 ) return;

	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	QTextStream* logFile = NULL;
	int longestCompound = 0;
	int shortestComponent = 0;

	// Reset the running component/linker totals before re-discovery.
	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	// Pool stems and still-unanalyzed words from every mini-lexicon,
	// walking from the last mini to the first.
	for (int i = GetMiniSize()-1; i >= 0; i--) {
		mini = GetMiniLexicon(i);
		if( !mini ) continue;

		for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			Stems << mini->GetStems()->GetAt(j);

		for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			// We don't want to parse analyzed words
			if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;
			// We don't want words that are too short
			if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;
			Stems << mini->GetWords()->GetAt(j);

	// Track the longest known compound ...
	for (int z = 0; z < compounds->size(); z++) {
		CStem* pStem = compounds->at(z);
		if (pStem->GetKeyLength() > longestCompound)
			longestCompound = pStem->GetKeyLength();

	// ... and the shortest known component.
	for (int y = 0; y < components->size(); y++) {
		CStem* pStem = components->at(y);
		if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
			shortestComponent = pStem->GetKeyLength();

	// We need a grammar to parse from
	RuleCollection grammar;

	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );
	pRule = new CGrammarRule();
	rule = "Start Word"; // Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );
	pRule = new CGrammarRule();
	rule = "Word Compound"; // Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );
	for (int i = 2; i <= maxNumberOfRoots; i++) { // Compound --> Root Root+
		pRule = new CGrammarRule();
		// NOTE(review): the line that re-seeds `rule` before the "+= Root"
		// loop (presumably rule = "Compound";) is not visible here.
		for( j = 0; j < i; j++ ) rule += " Root";
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

	CTerminalRuleCollection linkers;

	if( linker == QString::null )
		// No explicit linker: admit any string up to the configured length
		// as a wildcard Linker terminal.
		if( MAXIMUM_LINKER_LENGTH > 0 )
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );
			for (int i = 1; i <= MAXIMUM_LINKER_LENGTH; i++) {
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
	// Explicit linker: register it as a single terminal rule.
	// NOTE(review): the else-branch punctuation surrounding this block is
	// not visible in this view.
	tRule = new CTerminalRule( rule );
	tRule->SetKey( CStringSurrogate( linker ) );
	Q_ASSERT( tRule == ( linkers << tRule ) );

	// If linkers are in play, also admit Root-Linker alternations.
	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
		for (int i = 2; i <= maxNumberOfRoots; i++) { // Compound --> Root (Linker Root)+
			pRule = new CGrammarRule();
			for( j = 0; j < i - 1; j++ ) rule += " Root Linker";
			pRule->Collapse( rule );
			grammar[ "Compound" ].append( pRule );

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;
	for (int i = 0; i < Stems.GetCount(); i++) {
		CStem* pStem = Stems.GetAtSort(i);
		tRule = new CTerminalRule( rule );
		tRule->Append( pStem );
		Q_ASSERT( tRule == ( stems << tRule ) );

	// Known components also count as Root terminals.
	for (int z = 0; z < components->size(); z++) {
		CStem* pStem = components->at(z);
		tRule = new CTerminalRule( rule );
		tRule->Append( pStem );
		Q_ASSERT( tRule == ( stems << tRule ) );

	// The parser's lexicon: Root terminals, plus Linker terminals when an
	// explicit linker was supplied.
	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());

	CEarleyParser* parser = NULL;
	for (int i = 0; i < Stems.GetCount(); i++) {
		CStem* pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;

		logFile = LogFileOn() ? GetLogFileStream() : NULL;

		// Parser ownership is handed to the stem via SetMyEarleyParser.
		parser = new CEarleyParser(pStem, &grammar, &lexicon, logFile, MaximumParseDepth);
		pStem->SetMyEarleyParser(parser);

		if (parser->isValidGrammar())
			allParses = parser->Parse();

		if (allParses && !allParses->isEmpty()) {
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			// NOTE(review): this declaration continues on a line not visible
			// here (presumably `linkerCount = 0.0;`).
			double componentCount = 0.0,
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );

	status.progress.clear();
	status.major_operation.clear();
/*
 * FromStemsFindCompounds
 *
 * Like FromStemsFindFlatCompounds, but with a recursive compounding grammar:
 * Compound --> {Compound|Root} [Linker] {Compound|Root}, so compounds of any
 * depth are built from binary splits. Every stem and unanalyzed word is
 * Earley-parsed against the grammar; each successful parse becomes a
 * CCompound in m_pCompounds, and every piece of the winning parse is
 * registered as a compound component.
 *
 * compounds  known compounds (scanned to track the longest one)
 * components known components; added to the Root terminal rules
 * linker     explicit linker string; if null, wildcard linkers up to
 *            MAXIMUM_LINKER_LENGTH are generated instead
 *
 * NOTE(review): braces and several local declarations (i, j, stemCount,
 * pMini, mini, pStem, pRule, pEdge, oneParse, linkerCount) are missing from
 * this view of the file; the visible code is annotated as-is.
 */
void CLexicon::FromStemsFindCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker )
	// Registry-tunable ceiling on Earley parse depth (default 5 here vs 6 in
	// the flat variant).
	int MaximumParseDepth = GetIntParameter ("EarleyParser\\MaximumParseDepth", 5 );

	// We need to have analyzed some stems.
	for( i = 0; i < static_cast <int> ( m_pMiniLexica->size() ) ; i++ )
		pMini = (*m_pMiniLexica)[i];
		if( pMini ) stemCount += pMini->GetStems()->GetCount();
	// Nothing to do until at least one mini-lexicon has stems.
	if( stemCount == 0 ) return;

	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	QTextStream* logFile = NULL;
	int longestCompound = 0;
	int shortestComponent = 0;

	// Reset the running component/linker totals before re-discovery.
	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	// Pool stems and still-unanalyzed words from every mini-lexicon.
	for( i = GetMiniSize()-1; i >= 0; i-- )
		mini = GetMiniLexicon(i);
		if( !mini ) continue;

		for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			Stems << mini->GetStems()->GetAt(j);

		for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			// We don't want to parse analyzed words
			if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;
			// We don't want words that are too short
			if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;
			Stems << mini->GetWords()->GetAt(j);

	// Track the longest known compound ...
	//for( pStem = compounds->first(); pStem; pStem = compounds->next() )
	for (int z = 0; z < compounds->size(); z++)
	{	pStem = compounds->at(z);
		if( pStem->GetKeyLength() > longestCompound )
			longestCompound = pStem->GetKeyLength();

	// ... and the shortest known component.
	//for( pStem = components->first(); pStem; pStem = components->next() )
	for (int y = 0; y < components->size(); y++)
	{	pStem = components->at(y);
		if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
			shortestComponent = pStem->GetKeyLength();

	// We need a grammar to parse from
	RuleCollection grammar;

	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );
	pRule = new CGrammarRule();
	rule = "Start Word"; // Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );
	pRule = new CGrammarRule();
	rule = "Word Compound"; // Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	// The four linker-free recursive compounding rules.
	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );
	pRule = new CGrammarRule();
	rule = "Compound Compound Compound"; // Compound --> Compound Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Compound Root"; // Compound --> Compound Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Compound"; // Compound --> Root Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Root"; // Compound --> Root Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	CTerminalRuleCollection linkers;

	if( linker == QString::null )
		// No explicit linker: admit any string up to the configured length
		// as a wildcard Linker terminal.
		if( MAXIMUM_LINKER_LENGTH > 0 )
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );
			for( i = 1; i <= MAXIMUM_LINKER_LENGTH; i++ )
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
	// Explicit linker: register it as a single terminal rule.
	// NOTE(review): the else-branch punctuation surrounding this block is
	// not visible in this view.
	tRule = new CTerminalRule( rule );
	tRule->SetKey( CStringSurrogate( linker ) );
	Q_ASSERT( tRule == ( linkers << tRule ) );

	// If linkers are in play, add the four linker-carrying variants too.
	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Compound"; // Compound --> Compound Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Root"; // Compound --> Compound Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Compound"; // Compound --> Root Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Root"; // Compound --> Root Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;
	for( i = 0; i < Stems.GetCount(); i++ )
		pStem = Stems.GetAtSort(i);
		tRule = new CTerminalRule( rule );
		tRule->Append( pStem );
		Q_ASSERT( tRule == ( stems << tRule ) );

	// Known components also count as Root terminals.
	//for( pStem = components->first(); pStem; pStem = components->next() )
	for (int z = 0; z < components->size(); z++)
	{	pStem = components->at(z);
		tRule = new CTerminalRule( rule );
		tRule->Append( pStem );
		Q_ASSERT( tRule == ( stems << tRule ) );

	// The parser's lexicon: Root terminals, plus Linker terminals when an
	// explicit linker was supplied.
	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());

	CEarleyParser* parser = NULL;
	for (i = 0; i < Stems.GetCount(); i++) {
		pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;

		// NOTE(review): the opening `if( ... ) {` of this branch (presumably
		// testing LogFileOn()) is not visible in this view.
		logFile = GetLogFileStream();
		} else { logFile = NULL; }

		// Parser ownership is handed to the stem via SetMyEarleyParser.
		parser = new CEarleyParser( pStem, &grammar, &lexicon, logFile, MaximumParseDepth );
		pStem->SetMyEarleyParser( parser );

		if( parser->isValidGrammar() )
			allParses = parser->Parse();

		if( allParses && !allParses->isEmpty() )
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			//-----------------------------------------------//
			// Register every piece of the winning parse as a component.
			for (int m = 1; m <= oneParse.Size(); m++)
				*m_pCompounds->GetComponents() << oneParse.GetPiece(m).Display();
			//-----------------------------------------------//

			// NOTE(review): this declaration continues on a line not visible
			// here (presumably `linkerCount = 0.0;`).
			double componentCount = 0.0,
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );

	status.progress.clear();
	status.major_operation.clear();
/*
 * CalculateCoefficientsOfAffixness
 *
 * For each distinct compound component, weighs the probability mass of the
 * component behaving as a true affix (affix_prob, accumulated over the
 * suffix and prefix signatures containing it) against the mass of it
 * occurring inside compound parses (component_prob), and stores
 * affix_prob / (affix_prob + component_prob) on every stem of the component
 * via SetAffixness (0.0 = never an affix, 1.0 = always an affix).
 *
 * NOTE(review): braces and several local declarations (count, pSuffix,
 * pPrefix, pEdge, parse, pStemSet, pLinker, piece_prob, numberofwords, i, j)
 * are missing from this view of the file; the visible code is annotated
 * as-is.
 */
void CLexicon::CalculateCoefficientsOfAffixness()
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CStem* pStem, * pWord;
	CCompound* pCompound;
	CSignature* pPrefixSignature;
	QList<CStem*>* pStems;

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Calculating affixness...";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponentMap()->count());

	// One iteration per distinct compound component.
	ComponentMap::Iterator it;
	for (it = m_pCompounds->GetComponentMap()->begin(); it != m_pCompounds->GetComponentMap()->end(); ++it) {
		status.progress = ++count;

		// NOTE(review): this declaration continues onto the next line.
		double affix_prob = 0.0,
		component_prob = 0.0;
		// NOTE(review): affix_count is accumulated with += below, but no
		// initialization is visible in this view -- possible read of an
		// uninitialized double; confirm against the full source.
		double sig_count, stem_count, affix_count;

		// ---- Suffix side: affix probability mass of this component. ----
		pSuffixSet = m_AllSuffixes[ it.key() ];
		for(int suffixno = 0; suffixno < static_cast <int> ( pSuffixSet->count() ); suffixno++ )
			pSuffix = pSuffixSet->at(suffixno);
			pStems = pSuffix->GetStems();
			if( !pStems ) continue;

			for (int stemno = 0; stemno < pStems->size(); stemno++)
			{	pStem = pStems->at(stemno);
				stem_count = pStem->GetCorpusCount();
				sig_count = pStem->GetSuffixSignature()->GetCorpusCount();
				CSignature* pSuffixSignature = pStem->GetSuffixSignature();
				numberofwords = pSuffixSignature->GetNumberOfWords();
				pWord = pSuffixSignature->GetWord(stemno, suffixno);
				if( pWord->GetSuffix() != pSuffix->GetKey() ) continue;
				affix_count += pWord->GetCorpusCount();
				// P(sig) * P(stem|sig) * P(affix|sig)
				affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
					( (double) stem_count / (double) sig_count ) *
					( (double) affix_count / (double) sig_count );

		// ---- Prefix side: same accumulation over prefix signatures. ----
		pPrefixSet = m_AllPrefixes[ it.key() ];
		// BUG(review): the increment clause advances j, not prefixno, so
		// prefixno never changes and this loop cannot terminate normally.
		// It should almost certainly read prefixno++ (compare the suffix
		// loop above).
		for( int prefixno = 0; prefixno < static_cast <int> ( pPrefixSet->count() ); j++ )
			pPrefix = pPrefixSet->at(prefixno);
			pStems = pPrefix->GetStems();
			if( !pStems ) continue;

			//for( pStem = pStems->first(); pStem; pStem = pStems->next() )
			for (int stemno = 0; stemno < pStems->size(); stemno++)
			{	pStem = pStems->at(stemno);
				stem_count = pStem->GetCorpusCount();
				sig_count = pStem->GetPrefixSignature()->GetCorpusCount();
				// QList<CStem*>* pWords = pStem->GetPrefixSignature()->GetWordPtrList();
				pPrefixSignature = pStem->GetPrefixSignature();
				numberofwords = pPrefixSignature->GetNumberOfWords();
				// if( !pWords ) continue;
				pWord = pPrefixSignature->GetWord(stemno, prefixno);
				if( pWord->GetPrefix() != pPrefix->GetKey() ) continue;
				affix_count += pWord->GetCorpusCount();
				affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
					( (double) stem_count / (double) sig_count ) *
					( (double) affix_count / (double) sig_count );

		// ---- Component side: mass of this component inside compounds. ----
		double word_is_compound_prob = (double) m_pCompounds->GetCorpusCount() / (double) GetCorpusCount();
		for( i = 0; i < m_pCompounds->GetCount(); i++ )
			pCompound = m_pCompounds->GetAt(i);
			// NOTE(review): this declaration continues on a line not visible
			// here (presumably `piece_prob;`).
			double this_parse_prob,
			for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
			//for (int z= 0; z < pCompound->GetParses()->size(); z++)
			{ // pEdge = pCompound->GetParses()->at(z);
				pEdge->GetParse( &parse );
				// Skip parses that do not contain this component at all.
				if( parse.Find( it.key() ) == 0 ) continue;

				this_parse_prob = 1.0;
				// NOTE(review): this loop runs j from 1 with `<`, while the
				// analogous loop in FromAffixnessUpdateSigsAndCompounds uses
				// `<=` -- this may skip the last piece; confirm GetPiece's
				// 1-based indexing convention.
				for( j = 1; j < parse.Size(); j++ )
					if( m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ) != m_pCompounds->GetComponentMap()->end() )
						pStemSet = m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ).data();
					else pLinker = *GetLinkers() ^= parse.GetPiece(j);

					// NOTE(review): the guard choosing between the component
					// branch and the linker branch (presumably
					// `if( pStemSet )`) is not visible in this view.
					piece_prob = pStemSet->at(0)->GetCompoundCount() / m_pCompounds->GetComponentCount();
					else if( pLinker ) piece_prob = pLinker->GetCompoundCount() / m_pCompounds->GetLinkerCount();
					this_parse_prob *= piece_prob;

				component_prob += word_is_compound_prob * this_parse_prob;

		// Record the affixness coefficient on every stem of this component.
		pStemSet = it.data();
		//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
		for (int y = 0; y < pStemSet->size(); y++)
		{	pStem = pStemSet->at(y);
			if( affix_prob == 0.0 ) pStem->SetAffixness( 0.0 );
			else if( component_prob == 0.0 ) pStem->SetAffixness( 1.0 );
			else pStem->SetAffixness( affix_prob / ( affix_prob + component_prob ) );

	status.progress.clear();
	status.major_operation.clear();
/*
 * FromAffixnessUpdateSigsAndCompounds
 *
 * Asks the user for upper and lower affixness thresholds, then:
 *  - drops compound parses containing a component whose affixness is at or
 *    above the upper threshold (such components behave like affixes, not
 *    compound members), deleting compounds left with no parses;
 *  - for components with 0 < affixness <= lower threshold, clears the parse
 *    structure of every word built with the matching suffix/prefix so the
 *    affected mini-lexica can be re-analyzed;
 *  - finally re-runs stem/signature production on every affected mini and
 *    recounts the compound collection.
 *
 * NOTE(review): braces and several local declarations (ok, pEdge, parse,
 * pStemSet, affixness, pSuffix, pPrefix, pMini) are missing from this view
 * of the file; the visible code is annotated as-is.
 */
void CLexicon::FromAffixnessUpdateSigsAndCompounds()
	// Upper threshold (default 0.70): at or above it a component is treated
	// as a real affix.
	double UPPER_THRESHOLD = 0.70;
	UPPER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
		"Enter the upper affixness threshold:",
		UPPER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) UPPER_THRESHOLD = 0.70;

	// Lower threshold (default 0.30): at or below it a component is treated
	// as a genuine compound member rather than an affix.
	double LOWER_THRESHOLD = 0.30;
	LOWER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
		"Enter the lower affixness threshold:",
		LOWER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) LOWER_THRESHOLD = 0.30;

	// Keep the thresholds ordered (upper >= lower).
	if( UPPER_THRESHOLD < LOWER_THRESHOLD ) UPPER_THRESHOLD = LOWER_THRESHOLD;

	ComponentMap* pComponents = m_pCompounds->GetComponentMap();
	CCompound* pCompound;
	QString component, word;
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CStem* pStem, *pWord;

	// Deletions are deferred so the collections are not mutated while being
	// iterated.
	QList<CCompound*> cmpdDeletions;
	QList<CMiniLexicon*> affectedMinis;

	for( int i = 0; i < m_pCompounds->GetCount(); i++ )
		pCompound = m_pCompounds->GetAt(i);
		QList<CEdge*> edgeDeletions;

		for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
		//for (int z= 0; z < pCompound->GetParses()->size(); z++)
		{// pEdge = pCompound->GetParses()->at(z);
			pEdge->GetParse( &parse );
			for( int j = 1; j <= parse.Size(); j++ )
				component = parse.GetPiece(j).Display();
				if( pComponents->find( component ) == pComponents->end() ) continue;
				pStemSet = pComponents->find( component ).data();
				// All stems of one component carry the same affixness value,
				// so reading element 0 suffices.
				affixness = pStemSet->at(0)->GetAffixness();

				// Remove compounds which have a component whose affixness
				// exceeds the upper affixness threshold
				if( affixness >= UPPER_THRESHOLD )
					//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
					for (int y = 0; y < pStemSet->size(); y++)
					{	pStem = pStemSet->at(y);
						pStem->SetCompoundCount( 0.0 );
						edgeDeletions.append( pEdge );

				// Remove suffixes whose corresponding component's affixness
				// is below the lower affixness threshold
				if( affixness <= LOWER_THRESHOLD && affixness > 0 )
					// Must exist as suffix or prefix also
					pSuffixSet = m_AllSuffixes[ component ];
					// NOTE(review): isSuffix is set here but the branch that
					// consumes it (and the null check on pSuffixSet) is not
					// visible in this view.
					bool isSuffix = TRUE;
					pPrefixSet = m_AllPrefixes[ component ];
					if( !pPrefixSet ) continue;

					// Clear every word formed as stem + this suffix so it can
					// be re-analyzed without the spurious suffix.
					//for( pSuffix = pSuffixSet->first(); pSuffix; pSuffix = pSuffixSet->next() )
					for (int z = 0; z < pSuffixSet->size(); z++)
					{	pSuffix = pSuffixSet->at(z);
						pMini = pSuffix->GetMyMini();
						pStemSet = pSuffix->GetStems();
						//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
						for (int y = 0; y < pStemSet->size(); y++)
						{	pStem = pStemSet->at(y);
							word = pStem->Display() + pSuffix->Display();
							pWord = (*pMini->GetWords()) ^= CSS( word );
							pWord->ClearParseStructure();
							if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );

					// Same for every word formed as this prefix + stem.
					//for( pPrefix = pPrefixSet->first(); pPrefix; pPrefix = pPrefixSet->next() )
					for (int z = 0; z < pPrefixSet->size(); z++)
					{	pPrefix = pPrefixSet->at(z);
						pMini = pPrefix->GetMyMini();
						pStemSet = pPrefix->GetStems();
						//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
						// NOTE(review): this inner loop reuses `z`, shadowing
						// the outer prefix index -- legal but error-prone;
						// the suffix loop above uses `y` at this level.
						for (int z = 0; z < pStemSet->size(); z++)
						{	pStem = pStemSet->at(z);
							word = pPrefix->Display() + pStem->Display();
							pWord = (*pMini->GetWords()) ^= CSS( word );
							pWord->ClearParseStructure();
							if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );

		// Remove all edges marked for deletion
		//for( pEdge = edgeDeletions.first(); pEdge; pEdge = edgeDeletions.next() )
		for (int z =0; z < edgeDeletions.size(); z++)
		{	pEdge = edgeDeletions.at(z);
			pCompound->RemoveParse( pEdge );

		// A compound with no surviving parses is scheduled for deletion;
		// otherwise make sure it still points at a valid best parse.
		if( pCompound->GetParses()->count() == 0 )
			cmpdDeletions.append( pCompound );
		else if( pCompound->GetBestParse() < 0 ) pCompound->SetBestParse( 0 );

	// Remove all compounds marked for deletion
	//for( pCompound = cmpdDeletions.first(); pCompound; pCompound = cmpdDeletions.next() )
	for (int z = 0; z < cmpdDeletions.size(); z++)
	{	pCompound = cmpdDeletions.at(z);
		m_pCompounds->RemoveMember( pCompound );

	// Update all affected minis
	QString strAffixness = "Affixness";
	CStringSurrogate cssAffixness( strAffixness );
	//for( pMini = affectedMinis.first(); pMini; pMini = affectedMinis.next() )
	for (int z = 0; z < affectedMinis.size(); z++)
	{	pMini = affectedMinis.at(z);
		pMini->TakeSplitWords_ProduceStemsAndSigs( cssAffixness );

	// Check validity of compounds
	m_pCompounds->CheckAndRecount();