// Compound discovery and analysis within the Lexicon class
// Copyright © 2009 The University of Chicago
#include "Lexicon.h"

#include <QTextStream>
#include <QList>
#include "linguisticamainwindow.h"
#include "ui/Status.h"
#include "EarleyParser.h"
#include "MiniLexicon.h"
#include "GrammarRule.h"
#include "Signature.h"
#include "Compound.h"
#include "Linker.h"
#include "Suffix.h"
#include "Prefix.h"
#include "Stem.h"
#include "Edge.h"
#include "TerminalRuleCollection.h"
#include "CompoundCollection.h"
#include "LinkerCollection.h"
#include "WordCollection.h"
#include "StemCollection.h"
#include "Typedefs.h"

/**
 * Displays all compounds in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>. The <i>separator</i> will be
 * placed between components of a compound in the first column.
 */
void CLexicon::CompoundListDisplay( Q3ListView* pView, StringToString* filter, QChar separator )
{
	CCompound* pCompound;
	int MostParses = 0;
	int n;

	pView->setRootIsDecorated( FALSE );

	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		if( !m_pCompounds->GetAt(i)->GetParses() ) continue;

		n = m_pCompounds->GetAt(i)->GetParses()->count();
		if( n > MostParses ) MostParses = n;
	}

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Compound" );
	pView->addColumn( "# Parses" );
	pView->addColumn( "Most Frequent Stem" );
	pView->addColumn( "Prefixness" );
	pView->addColumn( "Suffixness" );

	m_pCompounds->Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating compound list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetCount());
	for (int i = 0; i < (int)m_pCompounds->GetCount(); i++) {
		status.progress = i;
		pCompound = m_pCompounds->GetAtSort(i);
		pCompound->CompoundListDisplay(pView, filter, separator);
	}
	status.progress.clear();
	status.major_operation.clear();
}

/**
 * Displays all linkers in the collection view <i>pView</i>. All joined characters are
 * re-filtered on output to separate characters with <i>filter</i>.
 */
void CLexicon::LinkerListDisplay(Q3ListView* pView, QMap<QString, QString>* filter)
{
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Linker" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );
	pView->addColumn( "Compounds" );

	m_pLinkers->Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Creating linker list for display";
	status.progress.clear();
	status.progress.set_denominator(m_pLinkers->GetCount());
	for (int i = 0; i < (int)m_pLinkers->GetCount(); i++) {
		status.progress = i;
		CLinker* pLinker = m_pLinkers->GetAtSort(i);
		pLinker->ListDisplay( pView, filter );
	}
	status.progress.clear();
	status.major_operation.clear();
}

/**
 * Displays all compound components in the collection view <i>pView</i>.
 * All joined characters are re-filtered on output to separate characters
 * with the lexicon’s filter.
 */
void CLexicon::CompoundComponentListDisplay(Q3ListView* pView)
{
	pView->setRootIsDecorated(false);

	// Remove all previous columns
	while( pView->columns() ) pView->removeColumn( 0 );

	// Add Column headers
	pView->addColumn( "Compound component" );
	pView->addColumn( "Corpus Count" );
	pView->addColumn( "Compound Count" );

	linguistica::ui::status_user_agent& status = status_display();
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponents()->GetCount());
	for (int i = 0; i < (int)m_pCompounds->GetComponents()->GetCount(); i++) {
		status.progress = i;
		CStem* pStem = m_pCompounds->GetComponents()->GetAtSort(i);
		static_cast<void>(new Q3ListViewItem(
			pView, pStem->GetKey().Display(),
			IntToStringWithCommas(pStem->GetCorpusCount())));
	}
	status.progress.clear();

	// XXX. necessary?
	status.major_operation.clear();
}

void CLexicon::FromStemsFindFlatCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker, int maxNumberOfRoots )
{
	CMiniLexicon* pMini;
	int stemCount = 0;
	int j;

	int MaximumParseDepth = GetIntParameter( "EarleyParser\\MaximumParseDepth", 6 );

	// We need to have analyzed some stems.
	if( !compounds )
	{
		for (int i = 0; i < static_cast<int>( m_pMiniLexica->size() ); i++ )
		{
			pMini = (*m_pMiniLexica)[i];
			if( pMini ) stemCount += pMini->GetStems()->GetCount();
		}
		if( stemCount == 0 ) return;
	}

	CGrammarRule* pRule;
	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	CParse oneParse;
	CEdge* pEdge;
	CSS ssCompound;
	CMiniLexicon* mini;

	QTextStream* logFile = NULL;

	int longestCompound = 0;
	int shortestComponent = 0;

	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	if (!compounds) {
		for (int i = GetMiniSize()-1; i >= 0; i--) {
			mini = GetMiniLexicon(i);
			if( !mini ) continue;

			for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			{
				Stems << mini->GetStems()->GetAt(j);
			}

			for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			{
				// We don't want to parse analyzed words
				if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;

				// We don't want words that are too short
				if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;

				Stems << mini->GetWords()->GetAt(j);
			}
		}
	} else {
		for (int z = 0; z < compounds->size(); z++) {
			CStem* pStem = compounds->at(z);
			Stems << pStem;
			if (pStem->GetKeyLength() > longestCompound)
				longestCompound = pStem->GetKeyLength();
		}

		for (int y = 0; y < components->size(); y++) {
			CStem* pStem = components->at(y);
			Stems << pStem;
			if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
				shortestComponent = pStem->GetKeyLength();
		}
	}

	// We need a grammar to parse from
	RuleCollection grammar;

	// Start
	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Start Word";			// Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	// Word
	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Word Compound";			// Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	// Compound
	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );

	for (int i = 2; i <= maxNumberOfRoots; i++) {	// Compound --> Root Root+
		pRule = new CGrammarRule();

		rule = "Compound";
		for( j = 0; j < i; j++ ) rule += " Root";

		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );
	}
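
	// For illustration: with maxNumberOfRoots == 3, the loop above adds the flat
	// productions
	//     Compound -> Root Root
	//     Compound -> Root Root Root
	// where, as the inline comments suggest, the first token of each rule string
	// names the left-hand side and the remaining tokens form the right-hand side.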

	// Linker
	CTerminalRuleCollection linkers;

	if( linker == QString::null )
	{
		if( MAXIMUM_LINKER_LENGTH > 0 )
		{
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );

			for (int i = 1; i <= MAXIMUM_LINKER_LENGTH; i++) {
				rule = "Linker";
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
			}
		}
	}
	else
	{
		rule = "Linker";
		tRule = new CTerminalRule( rule );
		tRule->SetKey( CStringSurrogate( linker ) );
		Q_ASSERT( tRule == ( linkers << tRule ) );
	}

	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
	{
		for (int i = 2; i <= maxNumberOfRoots; i++) {	// Compound --> Root (Linker Root)+
			pRule = new CGrammarRule();

			rule = "Compound";
			for( j = 0; j < i - 1; j++ ) rule += " Root Linker";
			rule += " Root";

			pRule->Collapse( rule );
			grammar[ "Compound" ].append( pRule );
		}
	}
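
	// For illustration: with a linker available and maxNumberOfRoots == 3, this
	// block adds
	//     Compound -> Root Linker Root
	//     Compound -> Root Linker Root Linker Root
	// i.e. the same flat root sequences as above, but with a linker between each
	// pair of adjacent roots.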

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;

	if (!components) {
		Stems.Sort(KEY);
		for (int i = 0; i < Stems.GetCount(); i++) {
			CStem* pStem = Stems.GetAtSort(i);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	} else {
		for (int z = 0; z < components->size(); z++) {
			CStem* pStem = components->at(z);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}

	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	// Parse all stems
	Stems.Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());
	CEarleyParser* parser = NULL;
	for (int i = 0; i < Stems.GetCount(); i++) {
		status.progress = i;
		CStem* pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		key.SetBackwards();
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;
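
		// The two checks above use the forward and reverse stem tries as a cheap
		// pre-filter: a word only reaches the (comparatively expensive) Earley
		// parse if at least two known stems line up with its left edge and at
		// least two line up with its right edge (the word itself is in Stems, so
		// it presumably accounts for one match on each side).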

		// Log if desired
		logFile = LogFileOn() ? GetLogFileStream() : NULL;

		// Parse the word
		parser = new CEarleyParser(pStem, &grammar, &lexicon, logFile, MaximumParseDepth);
		pStem->SetMyEarleyParser(parser);

		if (parser->isValidGrammar())
			allParses = parser->Parse();

		if (allParses && !allParses->isEmpty()) {
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			double componentCount = 0.0,
			       linkerCount = 0.0;
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::FromStemsFindCompounds( QList<CStem*>* compounds, QList<CStem*>* components, QString linker )
{
	CMiniLexicon* pMini;
	int stemCount = 0;
	int i, j;

	int MaximumParseDepth = GetIntParameter( "EarleyParser\\MaximumParseDepth", 5 );

	// We need to have analyzed some stems.
	if( !compounds )
	{
		for( i = 0; i < static_cast<int>( m_pMiniLexica->size() ); i++ )
		{
			pMini = (*m_pMiniLexica)[i];
			if( pMini ) stemCount += pMini->GetStems()->GetCount();
		}
		if( stemCount == 0 ) return;
	}

	CStem* pStem;
	CGrammarRule* pRule;
	CTerminalRule* tRule;
	CWildCardRule* wRule;
	QString rule, compound;
	Q3PtrList<CEdge>* allParses = NULL;
	CCompound* pCompound;
	CParse oneParse;
	CEdge* pEdge;
	CSS ssCompound;
	CMiniLexicon* mini;
	QTextStream* logFile = NULL;

	int longestCompound = 0;
	int shortestComponent = 0;

	m_pCompounds->SetComponentCount( 0.0 );
	m_pCompounds->SetLinkerCount( 0.0 );

	int MINIMUM_STEM_LENGTH = GetIntParameter( "Main\\MinimumStemLength", 3 );
	int MAXIMUM_LINKER_LENGTH = GetIntParameter( "Compounds\\MaximumLinkerLength", 1 );

	// We will attempt to parse all stems and unanalyzed
	// words (assumed to be stems), so we should collect
	// them all into one place.
	CStemCollection Stems;
	Stems.CreateReverseTrie();

	if( !compounds )
	{
		for( i = GetMiniSize()-1; i >= 0; i-- )
		{
			mini = GetMiniLexicon(i);
			if( !mini ) continue;

			for( j = 0; j < mini->GetStems()->GetCount(); j++ )
			{
				Stems << mini->GetStems()->GetAt(j);
			}

			for( j = 0; j < mini->GetWords()->GetCount(); j++ )
			{
				// We don't want to parse analyzed words
				if( mini->GetWords()->GetAt(j)->Size() > 1 ) continue;

				// We don't want words that are too short
				if( mini->GetWords()->GetAt(j)->GetKeyLength() < MINIMUM_STEM_LENGTH ) continue;

				Stems << mini->GetWords()->GetAt(j);
			}
		}
	}
	else
	{
		//for( pStem = compounds->first(); pStem; pStem = compounds->next() )
		for (int z = 0; z < compounds->size(); z++)
		{
			pStem = compounds->at(z);
			Stems << pStem;
			if( pStem->GetKeyLength() > longestCompound )
				longestCompound = pStem->GetKeyLength();
		}

		//for( pStem = components->first(); pStem; pStem = components->next() )
		for (int y = 0; y < components->size(); y++)
		{
			pStem = components->at(y);
			Stems << pStem;
			if( shortestComponent == 0 || pStem->GetKeyLength() < shortestComponent )
				shortestComponent = pStem->GetKeyLength();
		}
	}

	// We need a grammar to parse from
	RuleCollection grammar;

	// Start
	grammar.insert( "Start", RuleList() );
	//grammar[ "Start" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Start Word";			// Start --> Word
	pRule->Collapse( rule );
	grammar[ "Start" ].append( pRule );

	// Word
	grammar.insert( "Word", RuleList() );
	//grammar[ "Word" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Word Compound";			// Word --> Compound
	pRule->Collapse( rule );
	grammar[ "Word" ].append( pRule );

	// Compound
	grammar.insert( "Compound", RuleList() );
	//grammar[ "Compound" ].setAutoDelete( TRUE );

	pRule = new CGrammarRule();
	rule = "Compound Compound Compound";	// Compound --> Compound Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Compound Root";	// Compound --> Compound Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Compound";	// Compound --> Root Compound
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );

	pRule = new CGrammarRule();
	rule = "Compound Root Root";		// Compound --> Root Root
	pRule->Collapse( rule );
	grammar[ "Compound" ].append( pRule );
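
	// Taken together, the four rules above define a recursive, binary-branching
	// compound grammar:
	//     Compound -> Compound Compound | Compound Root | Root Compound | Root Root
	// Unlike FromStemsFindFlatCompounds(), which enumerates flat Root^n productions
	// up to maxNumberOfRoots, this version licenses compounds with any number of
	// roots through recursion, presumably bounded in practice by the
	// MaximumParseDepth handed to the parser.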

	// Linker
	CTerminalRuleCollection linkers;

	if( linker == QString::null )
	{
		if( MAXIMUM_LINKER_LENGTH > 0 )
		{
			grammar.insert( "Linker", RuleList() );
			//grammar[ "Linker" ].setAutoDelete( TRUE );

			for( i = 1; i <= MAXIMUM_LINKER_LENGTH; i++ )
			{
				rule = "Linker";
				wRule = new CWildCardRule( rule, i );
				grammar[ "Linker" ].append( wRule );
			}
		}
	}
	else
	{
		rule = "Linker";
		tRule = new CTerminalRule( rule );
		tRule->SetKey( CStringSurrogate( linker ) );
		Q_ASSERT( tRule == ( linkers << tRule ) );
	}

	if( grammar.find( "Linker" ) != grammar.end() ||
		linker != QString::null )
	{
		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Compound";	// Compound --> Compound Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Compound Linker Root";		// Compound --> Compound Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Compound";		// Compound --> Root Linker Compound
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );

		pRule = new CGrammarRule();
		rule = "Compound Root Linker Root";		// Compound --> Root Linker Root
		pRule->Collapse( rule );
		grammar[ "Compound" ].append( pRule );
	}
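
	// When a linker is available, the block above adds the corresponding binary
	// rules with a linker between the two daughters:
	//     Compound -> Compound Linker Compound | Compound Linker Root
	//               | Root Linker Compound     | Root Linker Root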

	// Add all the stems as terminal root rules
	CTerminalRuleCollection stems;

	if( !components )
	{
		Stems.Sort(KEY);
		for( i = 0; i < Stems.GetCount(); i++ )
		{
			pStem = Stems.GetAtSort(i);

			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}
	else
	{
		//for( pStem = components->first(); pStem; pStem = components->next() )
		for (int z = 0; z < components->size(); z++)
		{
			pStem = components->at(z);
			rule = "Root";
			tRule = new CTerminalRule( rule );
			tRule->Append( pStem );
			Q_ASSERT( tRule == ( stems << tRule ) );
		}
	}

	QMap<QString, CTerminalRuleCollection*> lexicon;
	lexicon.insert( "Root", &stems );
	if( linker != QString::null ) lexicon.insert( "Linker", &linkers );

	// Parse all stems
	Stems.Sort(KEY);

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Parsing possible compounds";
	status.progress.clear();
	status.progress.set_denominator(Stems.GetCount());
	CEarleyParser* parser = NULL;
	for (i = 0; i < Stems.GetCount(); i++) {
		status.progress = i;
		pStem = Stems.GetAtSort(i);

		// We don't want to analyze anything that can't contain two stems
		// Words that are not at least the length of two stems
		if( pStem->GetKeyLength() < 2 * MINIMUM_STEM_LENGTH ) continue;

		// This includes anything that has less than two valid stems that start
		// at the first character of this word
		CSS key = pStem->GetKey();
		if( Stems.CountValidSubstrings( key ) < 2 ) continue;

		// And also includes anything that has less than two valid stems that
		// end at the last character of this word
		key.SetBackwards();
		if( Stems.GetReverseTrie()->CountValidSubstrings( key ) < 2 ) continue;

		// Log if desired
		if( LogFileOn() )
		{
			logFile = GetLogFileStream();
		} else {
			logFile = NULL;
		}

		// Parse the word
		parser = new CEarleyParser( pStem, &grammar, &lexicon, logFile, MaximumParseDepth );
		pStem->SetMyEarleyParser( parser );

		if( parser->isValidGrammar() )
			allParses = parser->Parse();

		if( allParses && !allParses->isEmpty() )
		{
			// We found some parses, now we can create a compound
			// we'll use the first parse until we can get component
			// probability information
			pEdge = allParses->first();
			pEdge->GetParse( &oneParse );

			pCompound = *m_pCompounds << oneParse.Display();
			pCompound->IncrementCorpusCount( pStem->GetCorpusCount() - 1 );

			//-----------------------------------------------//
			// Register every piece of this parse as a compound component
			for (int m = 1; m <= oneParse.Size(); m++)
			{
				*m_pCompounds->GetComponents() << oneParse.GetPiece(m).Display();
			}
			//-----------------------------------------------//

			double componentCount = 0.0,
			       linkerCount = 0.0;
			pCompound->SetLexicon( this );
			pCompound->SetParses( allParses, &componentCount, &linkerCount );
			m_pCompounds->SetComponentCount( m_pCompounds->GetComponentCount() + componentCount );
			m_pCompounds->SetLinkerCount( m_pCompounds->GetLinkerCount() + linkerCount );
			pCompound->SetBestParse(0);

			UpdateCompound( pCompound->Display() );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::CalculateCoefficientsOfAffixness()
{
	int i, j, count = 0;
	int numberofwords;
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CSuffix* pSuffix;
	CPrefix* pPrefix;
	CStem* pStem, * pWord;
	StemSet* pStemSet = NULL;
	CLinker* pLinker = NULL;
	CCompound* pCompound;
	CEdge* pEdge;
	CParse parse;
	CSignature* pPrefixSignature;

	QList<CStem*>* pStems;

	linguistica::ui::status_user_agent& status = status_display();
	status.major_operation = "Calculating affixness...";
	status.progress.clear();
	status.progress.set_denominator(m_pCompounds->GetComponentMap()->count());
	ComponentMap::Iterator it;
	for (it = m_pCompounds->GetComponentMap()->begin(); it != m_pCompounds->GetComponentMap()->end(); ++it) {
		status.progress = ++count;
		double affix_prob = 0.0,
		       component_prob = 0.0;

		double sig_count, stem_count, affix_count;

		pSuffixSet = m_AllSuffixes[ it.key() ];

		if( pSuffixSet )
		{
			for( int suffixno = 0; suffixno < static_cast<int>( pSuffixSet->count() ); suffixno++ )
			{
				pSuffix = pSuffixSet->at(suffixno);

				pStems = pSuffix->GetStems();

				if( !pStems ) continue;

				for (int stemno = 0; stemno < pStems->size(); stemno++)
				{
					pStem = pStems->at(stemno);
					stem_count = pStem->GetCorpusCount();
					sig_count = pStem->GetSuffixSignature()->GetCorpusCount();
					CSignature* pSuffixSignature = pStem->GetSuffixSignature();
					affix_count = 0;
					numberofwords = pSuffixSignature->GetNumberOfWords();
					pWord = pSuffixSignature->GetWord(stemno, suffixno);
					if( pWord->GetSuffix() != pSuffix->GetKey() ) continue;
					affix_count += pWord->GetCorpusCount();
					affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
						      ( (double) stem_count / (double) sig_count ) *
						      ( (double) affix_count / (double) sig_count );
				}
			}
		}
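
		// For each (stem, suffix) pair attested in a signature, the loop above adds
		//     (sig_count / N) * (stem_count / sig_count) * (affix_count / sig_count)
		// to affix_prob, where N is the corpus count of the whole lexicon; the
		// prefix loop below accumulates the same estimate from the prefix side.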

		pPrefixSet = m_AllPrefixes[ it.key() ];

		if( pPrefixSet )
		{
			for( int prefixno = 0; prefixno < static_cast<int>( pPrefixSet->count() ); prefixno++ )
			{
				pPrefix = pPrefixSet->at(prefixno);

				pStems = pPrefix->GetStems();

				if( !pStems ) continue;

				//for( pStem = pStems->first(); pStem; pStem = pStems->next() )
				for (int stemno = 0; stemno < pStems->size(); stemno++)
				{
					pStem = pStems->at(stemno);
					stem_count = pStem->GetCorpusCount();
					sig_count = pStem->GetPrefixSignature()->GetCorpusCount();

					// XXX. Explain.
					// QList<CStem*>* pWords = pStem->GetPrefixSignature()->GetWordPtrList();
					pPrefixSignature = pStem->GetPrefixSignature();
					numberofwords = pPrefixSignature->GetNumberOfWords();
					// if( !pWords ) continue;

					affix_count = 0;
					pWord = pPrefixSignature->GetWord(stemno, prefixno);
					if( pWord->GetPrefix() != pPrefix->GetKey() ) continue;
					affix_count += pWord->GetCorpusCount();

					affix_prob += ( (double) sig_count / (double) GetCorpusCount() ) *
						      ( (double) stem_count / (double) sig_count ) *
						      ( (double) affix_count / (double) sig_count );
				}
			}
		}
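
		// Prior probability that a corpus token is a compound at all; it weights
		// each candidate parse in the loop below.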
		double word_is_compound_prob = (double) m_pCompounds->GetCorpusCount() / (double) GetCorpusCount();

		for( i = 0; i < m_pCompounds->GetCount(); i++ )
		{
			pCompound = m_pCompounds->GetAt(i);

			double this_parse_prob,
			       piece_prob;
			for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
			//for (int z = 0; z < pCompound->GetParses()->size(); z++)
			{
				// pEdge = pCompound->GetParses()->at(z);
				pEdge->GetParse( &parse );

				if( parse.Find( it.key() ) == 0 ) continue;

				this_parse_prob = 1.0;

				for( j = 1; j < parse.Size(); j++ )
				{
					// A piece is either a known compound component or a linker;
					// reset both pointers before looking the piece up.
					pStemSet = NULL;
					pLinker = NULL;
					if( m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ) != m_pCompounds->GetComponentMap()->end() )
					{
						pStemSet = m_pCompounds->GetComponentMap()->find( parse.GetPiece(j).Display() ).data();
					}
					else pLinker = *GetLinkers() ^= parse.GetPiece(j);

					if( pStemSet )
					{
						piece_prob = pStemSet->at(0)->GetCompoundCount() / m_pCompounds->GetComponentCount();
					}
					else if( pLinker ) piece_prob = pLinker->GetCompoundCount() / m_pCompounds->GetLinkerCount();
					else continue;

					this_parse_prob *= piece_prob;
				}

				component_prob += word_is_compound_prob * this_parse_prob;
			}
		}
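
		// component_prob now sums, over every compound parse that contains this
		// component, the probability of that parse (the product of its piece
		// probabilities) weighted by the compound prior computed above.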

		pStemSet = it.data();
		//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
		for (int y = 0; y < pStemSet->size(); y++)
		{
			pStem = pStemSet->at(y);
			if( affix_prob == 0.0 ) pStem->SetAffixness( 0.0 );
			else if( component_prob == 0.0 ) pStem->SetAffixness( 1.0 );
			else pStem->SetAffixness( affix_prob / ( affix_prob + component_prob ) );
		}
	}
	status.progress.clear();
	status.major_operation.clear();
}

void CLexicon::FromAffixnessUpdateSigsAndCompounds()
{
	bool ok;
	double UPPER_THRESHOLD = 0.70;
	UPPER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
			"Enter the upper affixness threshold:",
			UPPER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) UPPER_THRESHOLD = 0.70;

	double LOWER_THRESHOLD = 0.30;
	LOWER_THRESHOLD = QInputDialog::getDouble( "Linguistica",
			"Enter the lower affixness threshold:",
			LOWER_THRESHOLD, 0.0, 1.0, 2, &ok, m_pDoc );
	if ( !ok ) LOWER_THRESHOLD = 0.30;

	if( UPPER_THRESHOLD < LOWER_THRESHOLD ) UPPER_THRESHOLD = LOWER_THRESHOLD;
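
	// If the user cancels either dialog, the corresponding threshold falls back to
	// its default (0.70 upper, 0.30 lower), and the upper threshold is clamped so
	// it never ends up below the lower one.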

	double affixness;

	ComponentMap* pComponents = m_pCompounds->GetComponentMap();
	CCompound* pCompound;
	CEdge* pEdge;
	CParse parse;
	QString component, word;
	StemSet* pStemSet;
	SuffixSet* pSuffixSet;
	PrefixSet* pPrefixSet;
	CStem* pStem, *pWord;
	CSuffix* pSuffix;
	CPrefix* pPrefix;
	CMiniLexicon* pMini;

	QList<CCompound*> cmpdDeletions;
	QList<CMiniLexicon*> affectedMinis;

	for( int i = 0; i < m_pCompounds->GetCount(); i++ )
	{
		pCompound = m_pCompounds->GetAt(i);

		QList<CEdge*> edgeDeletions;

		for( pEdge = pCompound->GetParses()->first(); pEdge; pEdge = pCompound->GetParses()->next() )
		//for (int z = 0; z < pCompound->GetParses()->size(); z++)
		{
			// pEdge = pCompound->GetParses()->at(z);
			pEdge->GetParse( &parse );

			for( int j = 1; j <= parse.Size(); j++ )
			{
				component = parse.GetPiece(j).Display();
				if( pComponents->find( component ) == pComponents->end() ) continue;

				pStemSet = pComponents->find( component ).data();

				affixness = pStemSet->at(0)->GetAffixness();

				// Remove compounds which have a component whose affixness
				// exceeds the upper affixness threshold
				if( affixness >= UPPER_THRESHOLD )
				{
					//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
					for (int y = 0; y < pStemSet->size(); y++)
					{
						pStem = pStemSet->at(y);
						pStem->SetCompoundCount( 0.0 );
						edgeDeletions.append( pEdge );
					}
				}

				// Remove suffixes whose corresponding component's affixness
				// is below the lower affixness threshold
				if( affixness <= LOWER_THRESHOLD && affixness > 0 )
				{
					// Must exist as suffix or prefix also
					pSuffixSet = m_AllSuffixes[ component ];
					bool isSuffix = TRUE;
					if( !pSuffixSet )
					{
						pPrefixSet = m_AllPrefixes[ component ];
						isSuffix = FALSE;
						if( !pPrefixSet ) continue;
					}

					if( isSuffix )
					{
						//for( pSuffix = pSuffixSet->first(); pSuffix; pSuffix = pSuffixSet->next() )
						for (int z = 0; z < pSuffixSet->size(); z++)
						{
							pSuffix = pSuffixSet->at(z);
							pMini = pSuffix->GetMyMini();
							pStemSet = pSuffix->GetStems();

							//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
							for (int y = 0; y < pStemSet->size(); y++)
							{
								pStem = pStemSet->at(y);
								word = pStem->Display() + pSuffix->Display();

								pWord = (*pMini->GetWords()) ^= CSS( word );

								pWord->ClearParseStructure();

								if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );
							}
						}
					}
					else
					{
						//for( pPrefix = pPrefixSet->first(); pPrefix; pPrefix = pPrefixSet->next() )
						for (int z = 0; z < pPrefixSet->size(); z++)
						{
							pPrefix = pPrefixSet->at(z);
							pMini = pPrefix->GetMyMini();
							pStemSet = pPrefix->GetStems();

							//for( pStem = pStemSet->first(); pStem; pStem = pStemSet->next() )
							for (int y = 0; y < pStemSet->size(); y++)
							{
								pStem = pStemSet->at(y);
								word = pPrefix->Display() + pStem->Display();
								pWord = (*pMini->GetWords()) ^= CSS( word );
								pWord->ClearParseStructure();
								if( affectedMinis.indexOf( pMini ) < 0 ) affectedMinis.append( pMini );
							}
						}
					}
				}
			}
		}

		// Remove all edges marked for deletion
		//for( pEdge = edgeDeletions.first(); pEdge; pEdge = edgeDeletions.next() )
		for (int z = 0; z < edgeDeletions.size(); z++)
		{
			pEdge = edgeDeletions.at(z);
			pCompound->RemoveParse( pEdge );
		}

		if( pCompound->GetParses()->count() == 0 )
		{
			cmpdDeletions.append( pCompound );
		}
		else if( pCompound->GetBestParse() < 0 ) pCompound->SetBestParse( 0 );
	}

	// Remove all compounds marked for deletion
	//for( pCompound = cmpdDeletions.first(); pCompound; pCompound = cmpdDeletions.next() )
	for (int z = 0; z < cmpdDeletions.size(); z++)
	{
		pCompound = cmpdDeletions.at(z);
		m_pCompounds->RemoveMember( pCompound );
	}

	// Update all affected minis
	QString strAffixness = "Affixness";
	CStringSurrogate cssAffixness( strAffixness );
	//for( pMini = affectedMinis.first(); pMini; pMini = affectedMinis.next() )
	for (int z = 0; z < affectedMinis.size(); z++)
	{
		pMini = affectedMinis.at(z);
		pMini->TakeSplitWords_ProduceStemsAndSigs( cssAffixness );
	}

	// Check validity of compounds
	m_pCompounds->CheckAndRecount();
}