1 // Implementation of finite-state automaton class
2 // Copyright © 2009 The University of Chicago
8 #include <QTableWidgetItem>
10 #include "MiniLexicon.h"
11 #include "Signature.h"
13 #include "SignatureCollection.h"
14 #include "StemCollection.h"
15 #include "WordCollection.h"
16 #include "SuffixCollection.h"
22 #include <graphviz/gvc.h>
25 #define TEST_FUNC_MAX 32
29 double FSAMorpheme::GetDL(int characterCount
) const
31 if (this->str
== "NULL")
33 return double(this->str
.size()) * base2log(characterCount
);
36 bool FSAMorpheme::operator==(const FSAMorpheme
& other
) const
38 return this->toStr() == other
.toStr();
41 namespace linguistica
{
42 namespace join_helper
{
43 class cat_with_delim
: public std::binary_function
<
44 QString
, const FSAMorpheme
*, QString
> {
47 cat_with_delim(const QString
& d
) : delim(d
) { }
48 QString
operator()(const QString
& lhs
,
49 const FSAMorpheme
* rhs
) const
53 result
.append(rhs
->toStr());
60 QString
join(const FSAMorphemeList
& v
, const QString
& delim
)
62 using linguistica::join_helper::cat_with_delim
;
63 using std::accumulate
;
65 FSAMorphemeList::const_iterator iter
= v
.begin();
70 const QString first
= (*iter
)->toStr();
71 return accumulate(++iter
, v
.end(), first
, cat_with_delim(delim
));
74 FSAstate::FSAstate(FSA
* pFSA
)
75 : m_EdgesOut(), m_EdgesIn(),
76 m_MaxDepth(-1), //m_stateLabel(),
77 m_PathList(), m_DiscoverCount(0),
78 m_stateId(pFSA
->m_nextStateId
++)
79 { pFSA
->AddState(this); }
81 /*int FSAstate::getWordCount() const
85 for(int j=0; j<this->GetEdgesIn()->size();j++) {
86 FSAedge* pEdge = this->GetEdgesIn()->at(j);
87 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
89 sumIn += pEdge->GetMorphemes()->at(k)->m_corpusCount;
93 for(int j=0; j<this->GetEdgesOut()->size();j++) {
94 FSAedge* pEdge = this->GetEdgesOut()->at(j);
95 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
96 sumOut += pEdge->GetMorphemes()->at(k)->m_corpusCount;
99 Q_ASSERT(sumIn==0 || sumOut==0 || sumOut==sumIn);
100 Q_ASSERT(sumOut+sumIn > 0);
109 void FSAstate::AddEdgeOut(FSAedge
* pEdge
)
111 this->m_EdgesOut
.push_back(pEdge
);
114 void FSAstate::AddEdgeIn(FSAedge
* pEdge
)
116 this->m_EdgesIn
.push_back(pEdge
);
118 //----------------------------------------------------------------//
119 FSAedge::FSAedge(FSA
* pFSA
, FSAstate
* pStartState
, FSAstate
* pEndState
)
120 : m_Morphemes(), m_FromState(pStartState
), m_ToState(pEndState
), m_FSA(pFSA
)
125 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState, CParse* pLabels)
126 : m_Morphemes(), // initialized below
127 m_FromState(pStartState),
131 pLabels->CreateQStringList(m_Morphemes);
134 //void FSAedge::AddMorpheme(Morpheme* pMorph)
135 void FSAedge::AddMorpheme(const QString
& str
, int addedCount
)
137 Q_ASSERT(addedCount
);
138 //if Morpheme not included in FSA, update
139 if(this->m_FSA
->m_morphemes
.contains(str
))
141 FSAMorpheme
* pMorph
= this->m_FSA
->m_morphemes
[str
];
142 pMorph
->m_corpusCount
+=addedCount
;
144 Q_ASSERT(1); //should not be called now, we are not adding stems or suffixes
145 FSAMorpheme
* pMorph
= new FSAMorpheme(str
,addedCount
);
146 this->m_FSA
->m_morphemes
.insert(str
, pMorph
);
149 this->m_Morphemes
.push_back( new FSAMorpheme(str
,addedCount
) );
152 void FSAedge::RemoveMorpheme(FSAMorphemeList::iterator iter
)
154 FSAMorpheme
* pMorph
= *iter
;
155 QString str
= pMorph
->toStr();
156 int count
= pMorph
->m_corpusCount
;
157 //this->GetMorphemes()->remove(pMorph);
158 this->GetMorphemes()->erase(iter
);
160 // Do cleanup at FSA level
161 FSAMorpheme
* FsaMorph
= this->m_FSA
->m_morphemes
[str
];
163 Q_ASSERT(FsaMorph
->m_corpusCount
>= count
);
164 //std::cout << "Removing: " << pMorph->toStr().toStdString() << " FSA count: " << FsaMorph->m_corpusCount << " This edge count:" << count << std::endl;
165 if(count
== FsaMorph
->m_corpusCount
) //remove from FSA
166 this->m_FSA
->m_morphemes
.remove(str
);
168 this->m_FSA
->m_morphemes
[str
]->m_corpusCount
-=count
;
173 FSAedge
* FSA::DoParallelSplit(FSAedge
* pEdge
, FSAMorphemeList
& morphsToMove
)
175 Q_ASSERT(!morphsToMove
.empty());
176 //if(morphsToMove.empty()) return NULL;
178 if(morphsToMove
.size() == pEdge
->GetMorphemes()->size())
181 FSAMorphemeList copy1
= *pEdge
->GetMorphemes();
182 FSAMorphemeList copy2
= morphsToMove
;
185 Q_ASSERT(copy1
==copy2
);
190 FSAMorphemeList
* pEdgeMorphemes
= pEdge
->GetMorphemes();
191 foreach (FSAMorpheme
* pMorph
, morphsToMove
) {
192 FSAMorphemeList::iterator m_it
=
193 std::find(pEdgeMorphemes
->begin(),pEdgeMorphemes
->end(),pMorph
);
194 Q_ASSERT(m_it
!=pEdgeMorphemes
->end());
197 //just move dynamically allocated FSAMorph object from one edge to another.
198 // No other bookkeeping needed, but watch for this if defn any of FSA classes
199 // change. Note that morpheme counts are tracked at FSA level, but split does
200 // not change these counts.
201 FSAedge
* pNewEdge
= new FSAedge(this, pEdge
->GetFromState(), pEdge
->GetToState());
202 foreach (FSAMorpheme
*pMorph
,morphsToMove
) {
203 pEdge
->GetMorphemes()->remove(pMorph
);
204 pNewEdge
->GetMorphemes()->push_back(pMorph
);
209 void FSA::DoMultParallelSplit( FSAedge
* pEdge
,
210 std::list
<FSAMorphemeList
>::iterator first
,
211 std::list
<FSAMorphemeList
>::iterator last
)
213 std::list
<FSAMorphemeList
>::iterator list_it
= first
;
218 foreach(FSAMorpheme
* pMorph
,*list_it
++)
219 all
.push_back(pMorph
);
221 Q_ASSERT(all
.size()==pEdge
->GetMorphemes()->size());
223 FSAMorphemeList copy
= *pEdge
->GetMorphemes();
226 list_it
=first
; //reset iterator
228 //remove 1 non-empty list, some morphs will stay on pEdge
229 while((*list_it
++).size()==0)
233 //this->DoParallelSplit(pEdge,*list_it++);
234 FSAMorphemeList
& morphList
= *list_it
++;
236 if(morphList
.size()==0) continue; //ignore empty lists
238 FSAedge
* pNewEdge
= new FSAedge(this, pEdge
->GetFromState(), pEdge
->GetToState());
239 foreach (FSAMorpheme
*pMorph
,morphList
) {
240 pEdge
->GetMorphemes()->remove(pMorph
);
241 pNewEdge
->GetMorphemes()->push_back(pMorph
);
248 FSAedge
* FSA::DoSeriesSplit(FSAedge
* pEdge
,unsigned int len
, bool suffix
)
250 Q_ASSERT(!pEdge
->GetMorphemes()->empty());
252 QString firstMorph
= pEdge
->GetMorphemes()->front()->toStr();
253 QString commonMorphStr
= ( suffix
? firstMorph
.right(len
) : firstMorph
.left(len
) );
255 FSAstate
* pNewState
= new FSAstate(this);
256 FSAedge
* pEdge1
= new FSAedge(this, pEdge
->GetFromState(),pNewState
);
257 FSAedge
* pEdge2
= new FSAedge(this, pNewState
,pEdge
->GetToState());
259 FSAedge
*commonEdge
, *multiEdge
;
267 unsigned int totalCount
= 0;
269 FSAMorphemeList::iterator m_iter
= pEdge
->GetMorphemes()->begin();
270 while(m_iter
!= pEdge
->GetMorphemes()->end())
272 FSAMorpheme
* pMorph
= *m_iter
;
273 QString morphStr
= pMorph
->toStr();
274 QString curCommon
= ( suffix
? morphStr
.right(len
) : morphStr
.left(len
) );
275 Q_ASSERT( curCommon
== commonMorphStr
);
276 totalCount
+= pMorph
->m_corpusCount
;
278 unsigned int len_other
= morphStr
.length()-len
;
279 QString cur_multi
= ( suffix
? morphStr
.left(len_other
) : morphStr
.right(len_other
) );
281 multiEdge
->AddMorpheme(cur_multi
,pMorph
->m_corpusCount
);
282 pEdge
->RemoveMorpheme(m_iter
++);
285 Q_ASSERT(pEdge
->GetMorphemes()->empty());
286 commonEdge
->AddMorpheme(commonMorphStr
,totalCount
);
288 this->RemoveEdge(pEdge
);
294 : m_States(), m_Edges(), m_StartStates(),
295 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
296 m_lexicon(), m_charCount(0),
298 m_nextStartStateId(1), m_nextStateId(1) { }
300 FSA::FSA(const FSA
& fsa
)
301 : m_States(fsa
.m_States
), m_Edges(fsa
.m_Edges
), m_StartStates(fsa
.m_StartStates
),
302 m_FSAPathList(fsa
.m_FSAPathList
), m_searched(false), m_MaxPathLength(0),
303 m_lexicon(), m_charCount(0),
304 m_morphemes(fsa
.m_morphemes
),
305 m_nextStartStateId(1), m_nextStateId(1) { }
307 FSA::FSA(CMiniLexicon
* pMiniLexicon
)
308 : m_States(new FSAStateList
),
309 m_Edges(new FSAedgeList
),
310 m_StartStates(new FSAStateList
),
311 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
312 m_lexicon(pMiniLexicon
),
313 m_charCount(pMiniLexicon
->GetNumberOfCharacterTypes()),
315 m_nextStartStateId(1), m_nextStateId(1)
317 //m_StartState->setMaxDepth(0);
318 // this->AddState(m_StartState);
320 //int MinimumNumberOfStemsForDisplay = 2;
321 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
322 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
324 for (int i
= 0; i
< pMiniLexicon
->GetSignatures()->GetCount(); i
++)
326 CSignature
* pSig
= pMiniLexicon
->GetSignatures()->GetAt(i
);
328 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
329 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
337 while(!m_Edges
->empty())
339 FSAedge
* pEdge
= m_Edges
->front();
340 m_Edges
->pop_front();
341 while(!(pEdge
->GetMorphemes()->empty()))
343 delete pEdge
->GetMorphemes()->front();
344 pEdge
->GetMorphemes()->pop_front();
349 while(!m_States
->empty())
351 FSAstate
* pState
= m_States
->front();
352 m_States
->pop_front();
353 while(!pState
->m_PathList
.empty())
355 delete pState
->m_PathList
.front();
356 pState
->m_PathList
.pop_front();
361 QMap
<QString
, FSAMorpheme
*>::iterator i
= m_morphemes
.begin();
362 while (i
!= m_morphemes
.end()) {
367 delete this->m_States
;
368 delete this->m_Edges
;
369 delete this->m_StartStates
;
372 void FSA::AddSignature(CSignature
* pSig
)
374 //std::cout << "---- Signature " <<pSig->GetNumberOfStems()<<" "<< pSig->GetNumberOfAffixes()<< std::endl;
377 FSAstate
* pStart
= new FSAstate(this);
378 pStart
->setMaxDepth(0);
379 this->m_StartStates
->push_back(pStart
);
381 FSAstate
* pMiddle
= new FSAstate(this);
382 FSAstate
* pEnd
= new FSAstate(this);
385 CStringSurrogate ssStem
;
386 FSAedge
* pEdge1
= new FSAedge(this, pStart
, pMiddle
);
387 for (int i
= 0; i
< pSig
->GetNumberOfStems(); i
++)
389 CStem
* pStem
= pSig
->GetStemPtrList()->at(i
);
390 QString str
= pStem
->Display();
392 int corpusCount
= pStem
->GetCorpusCount();
393 Q_ASSERT(corpusCount
>0);
396 //std::cout << "\tStem:"<<str.toStdString() <<"."<< corpusCount <<std::endl;
397 pEdge1
->AddMorpheme( str
,corpusCount
); //TODO
401 FSAedge
* pEdge2
= new FSAedge(this, pMiddle
, pEnd
);
402 for (int i
= 0; i
< pSig
->GetNumberOfAffixes(); i
++)
404 CSuffix
* pSuffix
= pSig
->GetSuffixPtrList()->at(i
);
405 QString suffixStr
= pSuffix
->Display();
408 //for(int j=0;j<pEdge1->GetMorphemes()->size();j++)
409 FSAMorphemeList::iterator j_iter
= pEdge1
->GetMorphemes()->begin();
410 while(j_iter
!= pEdge1
->GetMorphemes()->end())
412 FSAMorpheme
*pMorph
= *j_iter
++;
413 int stemCount
= pMorph
->m_corpusCount
;
414 Q_ASSERT(stemCount
>0);
415 QString analyzedWord
= pMorph
->toStr();
416 if (suffixStr
!="NULL")
417 analyzedWord
.append(suffixStr
);
419 CWordCollection
* wc
= this->m_lexicon
->GetWords();
421 CStem
* pStem
= ( *wc
^=analyzedWord
);
424 corpusCount
+=(pStem
->GetCorpusCount());
427 if(suffixStr
.size()>0)
429 //std::cout << "\t\tSuffix "<< suffixStr.toStdString() <<"."<<corpusCount<<std::endl;
430 pEdge2
->AddMorpheme(suffixStr
, corpusCount
);
434 //----------------------------------------------------------------//
437 void FSA::AddState(FSAstate
* pState
)
439 m_States
->push_back(pState
);
442 void FSA::AddEdge(FSAedge
* pEdge
)
444 m_Edges
->push_back(pEdge
);
445 pEdge
->GetFromState()->AddEdgeOut(pEdge
);
446 pEdge
->GetToState()->AddEdgeIn(pEdge
);
449 void FSA::RemoveEdge(FSAedge
* pEdge
)
451 // remove all morphemes from edge, this does bookkeeping at FSA level
452 // currently unused, TestFunc only removes edge when Morph count is zero
453 if (pEdge
->GetMorphemes()->size() > 0)
455 FSAMorphemeList::iterator iter
= pEdge
->GetMorphemes()->begin();
456 while(iter
!= pEdge
->GetMorphemes()->end())
457 pEdge
->RemoveMorpheme(iter
++);
460 this->m_Edges
->remove(pEdge
);
461 pEdge
->GetFromState()->m_EdgesOut
.remove(pEdge
);
462 pEdge
->GetToState()->m_EdgesIn
.remove(pEdge
);
464 //delete all paths that contain this edge
465 for(std::list
<FSAedgeList
*>::iterator i
=this->m_FSAPathList
.begin();
466 i
!= this->m_FSAPathList
.end(); ++i
)
468 FSAedgeList
* pPath
= *i
;
469 //if edge in path: linear search, paths will be short
470 if( std::find(pPath
->begin(),pPath
->end(),pEdge
) != pPath
->end() );
472 this->m_FSAPathList
.remove(pPath
);
479 void FSA::FSATestFunc()
481 CSuffixCollection
* lexSuffixes
= this->m_lexicon
->GetSuffixes();
482 lexSuffixes
->Sort(COUNT
);
483 CStemCollection
* lexStems
= this->m_lexicon
->GetStems();
485 for (int suf_idx
=0; suf_idx
<lexSuffixes
->GetCount(); ++suf_idx
)
487 if(suf_idx
>=TEST_FUNC_MAX
) {
488 //std::cout << "BREAK" << std::endl;
491 CSuffix
* pSuffix
= lexSuffixes
->GetAtSort(suf_idx
);
492 //std::cout << "Suffix:" << pSuffix->Display().toStdString() << std::endl;
493 QString suffixStr
= pSuffix
->Display();
495 if(suffixStr
== "NULL") continue;
497 //make copy now, list will be altered in while-loop
498 FSAedgeList edges
= FSAedgeList(*this->m_Edges
);
500 FSAedgeList::iterator edge_it
= edges
.begin();
501 while(edge_it
!= edges
.end())
503 FSAedge
* pEdge
= *edge_it
;
504 ++edge_it
; //increment now, pEdge may be deleted
506 FSAMorphemeList morphemesToReanalyze
;
507 FSAMorphemeList
* morphemes
= pEdge
->GetMorphemes();
508 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
509 foreach (FSAMorpheme
* pMorph
, *morphemes
)
511 QString edgeStr
= pMorph
->toStr();
512 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
514 if (edgeStr
.endsWith(suffixStr
) && edgeStr
!=suffixStr
)
516 QString stemStr
= edgeStr
.left(edgeStr
.length() - suffixStr
.length());
518 CStem
* pStem
= (*lexStems
^=stemStr
);
519 //if stemStr in Lexicon
522 //if stem takes this suffix
523 CSignature
* pSig
= pStem
->GetSuffixSignature();
524 for (int k
= 0;k
<pSig
->GetSuffixPtrList()->count();k
++)
526 CSuffix
* pSuffix
= pSig
->GetSuffixPtrList()->at(k
);
527 if(pSuffix
->Display() == suffixStr
)
529 morphemesToReanalyze
.push_back(pMorph
);
534 } //else skip this morpheme
536 if(!morphemesToReanalyze
.empty()) {
537 if( morphemes
->size()==morphemesToReanalyze
.size() ) {
538 //just do parallel split
539 this->DoSeriesSplit(pEdge
,suffixStr
.length(),true);
541 FSAedge
* pNewEdge
= this->DoParallelSplit(pEdge
,morphemesToReanalyze
);
542 this->DoSeriesSplit(pNewEdge
,suffixStr
.length(),true);
554 this->m_searched
= true;
556 // get 'depth' of FSA
557 this-> m_MaxPathLength
=0;
559 //queue.append(this->m_StartState);
560 for ( FSAStateList::iterator i
=this->m_StartStates
->begin();
561 i
!= this->m_StartStates
->end(); ++i
)
563 FSAstate
* pStartState
=*i
; //this->m_StartStates->at(i);
566 FSAedgeList
* pPath
= new FSAedgeList();
567 pStartState
->addPath(pPath
);
569 queue
.push_back(pStartState
);
571 while(!queue
.empty())
573 //FSAstate* pState = queue.takeFirst();
574 FSAstate
* pState
= queue
.front();
577 FSAedgeList
* edgeList
= pState
->GetEdgesOut();
579 //if at leaf, add paths to FSA and do not search outbound edges
580 if(edgeList
->size() == 0)
582 //this->m_FSAPathList += *(pState->GetPathList());
583 for(std::list
<FSAedgeList
*>::iterator j
=pState
->GetPathList()->begin();
584 j
!= pState
->GetPathList()->end(); ++j
)
585 this->m_FSAPathList
.push_back(*j
);
589 //iter over outbound edges
590 for (FSAedgeList::iterator j
=edgeList
->begin(); j
!=edgeList
->end(); ++j
)
593 FSAstate
* pChildState
= pEdge
->GetToState();
594 pChildState
->m_DiscoverCount
++;
596 //add paths to pChildState
597 for(std::list
<FSAedgeList
*>::iterator l
=pState
->GetPathList()->begin();
598 l
!= pState
->GetPathList()->end(); ++l
)
600 FSAedgeList
* pPath
= *l
;
602 FSAedgeList
* pNewPath
= new FSAedgeList(*pPath
);
603 pNewPath
->push_back(pEdge
);
604 pChildState
->addPath(pNewPath
);
607 int maxChildDepth
= pChildState
->getMaxDepth();
609 //set length of max path to child
610 if(maxChildDepth
==-1 || pState
->getMaxDepth()+1 > maxChildDepth
)
612 pChildState
->setMaxDepth(pState
->getMaxDepth()+1);
613 maxChildDepth
= pChildState
->getMaxDepth();
616 if (maxChildDepth
> m_MaxPathLength
)
618 m_MaxPathLength
=maxChildDepth
;
621 Q_ASSERT(pChildState
->m_DiscoverCount
<= pChildState
->GetEdgesIn()->size());
623 if(pChildState
->m_DiscoverCount
== pChildState
->GetEdgesIn()->size())
624 queue
.push_back(pChildState
);
629 std::cout
<< "FSA DL:"<<this->ComputeDL() << std::endl
;
632 int FSA::GetRobustness(FSAedgeList
* pPath
)
634 QStringList pathWordsWorking
;
635 int pathCharCount
= 0;
636 FSAMorphemeList
* pEdge0Morphemes
= pPath
->front()->GetMorphemes();
637 FSAMorphemeList::iterator j_iter
= pEdge0Morphemes
->begin();
638 //for(int j=0;j<pEdge0Morphemes->size();j++)
639 while(j_iter
!=pEdge0Morphemes
->end())
641 FSAMorpheme
* pMorph
= *j_iter
++;
642 pathCharCount
+=pMorph
->toStr().length();
643 pathWordsWorking
.push_back(pMorph
->toStr());
646 FSAedgeList::iterator i
= pPath
->begin();
647 ++i
; //skip first edge, already got these morphemes (above)
648 while( i
!= pPath
->end() )
650 FSAedge
* pEdge
= *i
++;
651 QStringList newWords
;
652 FSAMorphemeList
* mList
= pEdge
->GetMorphemes();
653 FSAMorphemeList::iterator j_iter
= mList
->begin();
654 //for(int j=0;j<mList->size();j++)
655 while (j_iter
!=mList
->end())
657 QString morph
= (*j_iter
++)->toStr();
658 //std::cout<< "At morpheme:"<< morph.toStdString() <<std::endl;
659 pathCharCount
+= morph
.length();
660 for(int k
=0;k
<pathWordsWorking
.size();k
++)
662 QString newWord
= pathWordsWorking
.at(k
) + morph
;
663 //std::cout<< " created: "<< newWord.toStdString() <<std::endl;
664 newWords
.append(newWord
);
667 pathWordsWorking
.clear();
668 for(int j
=0;j
<newWords
.size();j
++)
669 pathWordsWorking
.append(newWords
.at(j
));
672 int charsReplacedCount
= 0;
673 for(int i
=0;i
<pathWordsWorking
.size();i
++)
675 charsReplacedCount
+=pathWordsWorking
.at(i
).length();
678 //std::cout<< " charsReplaced: "<< charsReplacedCount << " pathCharCount: "<< pathCharCount << std::endl;
679 return charsReplacedCount
-pathCharCount
;
682 void FSAstate::OutputXfst (QTextStream
& outf
)
684 FSAstate
* pState
=this;
685 if ( pState
->GetEdgesIn()->size() == 0)//start state
687 outf
<< "\ndefine "<< this->getStateName() << " 0; "<<endl
;
691 outf
<< "define " << this->getStateName() << " ";
692 if (pState
->GetEdgesIn()->size()>1) outf
<< " [ ";
694 for (FSAedgeList::iterator j_iter
=pState
->GetEdgesIn()->begin();
695 j_iter
!= pState
->GetEdgesIn()->end(); ++j_iter
)
697 if (j_iter
!= pState
->GetEdgesIn()->begin())
699 FSAedge
* pEdge
= *j_iter
;
700 outf
<< pEdge
->GetFromState()->getStateName();
702 if (pEdge
->GetMorphemes()->size()==1)
704 FSAMorpheme
* pm
= pEdge
->GetMorphemes()->front();
705 const QString ss
= pm
->toStr();
706 if (pm
->toStr()=="NULL")
709 outf
<< " {" << ss
.toAscii() << "} ";
713 //outf << QString("S%1 ").arg(pEdge->GetFromState()->m_stateId);
715 FSAMorphemeList::iterator k_iter
= pEdge
->GetMorphemes()->begin();
716 //for(int k=0;k<pEdge->GetMorphemes()->size();k++)
717 while(k_iter
!= pEdge
->GetMorphemes()->end())
719 if (k_iter
!=pEdge
->GetMorphemes()->begin()) outf
<< "|";
720 FSAMorpheme
* pm
= *k_iter
++;
721 const QString ss
= pm
->toStr();
722 if (pm
->toStr()=="NULL")
725 outf
<< " {" << ss
.toAscii() << "} ";
730 if (pState
->GetEdgesIn()->size()>1) outf
<< " ]";
735 void FSAstate::SearchEdgeXfst(QTextStream
& outf
, std::set
<FSAstate
*>& discovered
)
737 for(FSAedgeList::iterator i
=this->GetEdgesIn()->begin();
738 i
!= this->GetEdgesIn()->end(); i
++)
740 FSAstate
* pStateIn
= (*i
)->GetFromState();
741 if(discovered
.count(pStateIn
)==0) //if state not discovered yet
742 pStateIn
->SearchEdgeXfst(outf
,discovered
);
744 this->OutputXfst(outf
);
747 void FSA::OutputFSAXfst ( const QString
& filename
)
749 QFile
file( filename
);
751 FSAStateList topoSortedStates
;
752 std::set
<FSAstate
*> discovered
;
754 //find states with no outbound edges
755 FSAStateList endStates
;
756 for (FSAStateList::iterator i
=this->GetStates()->begin();
757 i
!= this->GetStates()->end();++i
)
759 if ((*i
)->GetEdgesOut()->size()==0)
760 endStates
.push_back(*i
);
764 if( file
.open( IO_WriteOnly
) )
766 QTextStream
outf( &file
); //Should be ascii file, not unicode
768 outf
<< "# " << endl
;
769 outf
<< "# File: " << filename
<< endl
;
770 outf
<< "# " << endl
;
772 //for (int i=0;i<this->GetStates()->size();i++)
773 for (FSAStateList::iterator i
= endStates
.begin();
774 i
!=endStates
.end(); ++i
)
776 (*i
)->SearchEdgeXfst(outf
,discovered
);
780 //outf << "union net" << endl << endl;
781 //outf << "print words" << endl << endl;
787 void FSA::FSAListDisplay(Q3ListView
* pView
,
788 QMap
<class QString
, class QString
>* /* out filter not supported yet */,
789 bool /* “analyzed words only” flag for future use */)
791 pView
->setRootIsDecorated( FALSE
);
792 pView
->setSorting(1);
793 // Remove all previous columns
794 while( pView
->columns() ) pView
->removeColumn( 0 );
797 // get 'depth' of FSA
801 pView
->setSorting(0); //sort on 0th column
803 // Add Column headers
804 //std::cout<<"Graph Max: "<<m_MaxPathLength <<std::endl;
806 for (int i
= 0;i
<this->m_MaxPathLength
;i
++)
808 pView
->addColumn( QString( "Edge %1" ).arg(i
));
810 pView
->addColumn( QString( "Robustness" ));
811 //m_pLexicon->GetDocument()->setStatusBar1( "Displaying FSA" );
813 //associate each start state with it's parent list view item (row).
814 // If start state only has one outbound path then there will
815 // be only one lv item, otherwise lv items/paths will be associated
816 // with a placeholder parent path
817 QMap
<FSAstate
*,FSAListViewItem
*> lvMap
;
818 //create placeholder lvItem for start states with >=1 path
819 for(std::list
<FSAedgeList
*>::iterator i
=this->m_FSAPathList
.begin();
820 i
!= this->m_FSAPathList
.end(); ++i
)
822 FSAedgeList
* pPath
= *i
;
823 FSAstate
* pStartState
= pPath
->front()->GetFromState();
824 if(!lvMap
.contains(pStartState
) && pStartState
->GetEdgesOut()->size()>1)
826 FSAListViewItem
* lvtop
= new FSAListViewItem(pView
,pPath
);
827 lvtop
->setVisible(TRUE
);
828 lvtop
->setOpen(TRUE
);
829 lvtop
->setText(0,"All paths: ");
830 lvMap
.insert(pStartState
,lvtop
);
834 //store totaled robustness values for subgraph starting at each start state
835 QMap
<FSAstate
*,int> robustnessMap
;
837 //Add Item for each start state
839 for (int i=0;i<this->m_StartStates->size();i++)
841 FSAstate* pStartState = m_StartStates->at(i);
843 FSAListViewItem * pStartStateItem= new FSAListViewItem(pView, this->m_FSAPathList.at(0)); //LEAK
844 pStartStateItem->setVisible(TRUE);
845 pStartStateItem->setOpen(TRUE);
847 lvMap.insert(pStartState,pStartStateItem);
850 //add every path as row to list view
851 for(std::list
<FSAedgeList
*>::iterator i
=this->m_FSAPathList
.begin();
852 i
!= this->m_FSAPathList
.end(); ++i
)
854 FSAedgeList
* pPath
= *i
;
856 int robustness
= FSA::GetRobustness(pPath
);
857 QString robustnessStr
= QString("%1").arg(robustness
,16);
859 FSAListViewItem
*lvItem
;
860 //get parent list view item
861 FSAstate
* pStartState
=pPath
->front()->GetFromState();
862 FSAListViewItem
* pParent
=lvMap
.value(pStartState
);
865 lvItem
= new FSAListViewItem(pView
, pPath
);
866 lvMap
.insert(pStartState
,lvItem
);
870 lvItem
= new FSAListViewItem(pParent
,pPath
);
872 int oldVal
= robustnessMap
.value(pStartState
);
873 robustnessMap
.insert(pStartState
,robustness
+oldVal
); //score of lvItem,pPath
876 lvItem
->setVisible(TRUE
);
877 lvItem
->setOpen(TRUE
);
881 for(FSAedgeList::iterator i
=pPath
->begin(); i
!=pPath
->end(); ++i
)
884 FSAMorphemeList
* labelList
= pEdge
->GetMorphemes();
885 QString label
= join(*labelList
, QString(' '));
886 if(label
.length() > 64)
887 label
= QString("%1 ... %2 (%3)")
888 .arg(labelList
->front()->toStr())
889 .arg(labelList
->back()->toStr())
890 .arg(labelList
->size());
893 lvItem
->setText(col_idx
,label
+" ");
895 //if this edge spans multiple columns,
896 //then columns / use whitespace
897 col_idx
= pEdge
->GetToState()->getMaxDepth();
900 //FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
901 //pEdge = ((edgeList->isEmpty())? NULL : edgeList->first());
903 lvItem
->setText(m_MaxPathLength
,robustnessStr
);
906 QMap
<FSAstate
*,FSAListViewItem
*>::iterator lvm_it
= lvMap
.begin();
907 while (lvm_it
!= lvMap
.end())
909 FSAstate
* pState
= lvm_it
.key();
910 if(pState
->GetEdgesOut()->size() > 1)
912 int robVal
= robustnessMap
.value(pState
);
913 QString robustnessStr
= QString("%1").arg(robVal
,16);
914 lvm_it
.value()->setText(m_MaxPathLength
,robustnessStr
);
921 //--------------------------------------------------------//
923 FSAListViewItem::~FSAListViewItem()
925 if(m_pImage
) delete m_pImage
;
928 QPixmap
* FSAListViewItem::GetQPixmap()
937 void FSAListViewItem::build_pixmap()
940 const char outputName
[] = "tmp.png";
941 QMap
<FSAstate
*,Agnode_t
*> nodeMap
;
943 // generate new image
944 GVC_t
* gvc
= gvContext();
945 graph_t
* g
= agopen(const_cast<char*>("tmpgraph"), AGDIGRAPHSTRICT
);
947 // left-to-right orientation
948 agraphattr(g
, const_cast<char*>("rankdir"), const_cast<char*>(""));
949 agset(g
, const_cast<char*>("rankdir"), const_cast<char*>("LR"));
951 /*QString& start_state_name = this->GetLVStartState()->m_stateLabel;
952 if (start_state_name.isEmpty())
953 //start_state_name = "S";
954 start_state_name = this->GetLVStartState()->getStateName();*/
956 // create new Agnode_t for start state
957 nodeMap
.insert(this->GetLVStartState(),
958 agnode(g
, this->GetLVStartState()->getStateName().toUtf8().data()));
960 QList
<FSAstate
*> queue
;
961 queue
.append(this->GetLVStartState());
965 while (!queue
.isEmpty()) {
966 FSAstate
* pCurState
= queue
.takeFirst();
968 FSAedgeList
& edges
= *pCurState
->GetEdgesOut();
969 for (FSAedgeList::iterator i
= edges
.begin(); i
!= edges
.end(); ++i
) {
971 FSAstate
* pNextState
= pEdge
->GetToState();
973 //label node, if does not already have a label
974 /*if (pNextState->m_stateLabel.isEmpty())
976 //QString("N%1").arg(j++);
978 pNextState->m_stateLabel = QString("S%1").arg(pNextState->m_stateId);
981 if (!nodeMap
.contains(pNextState
)) {
982 //QString label_text = pNextState->m_stateLabel;
983 QString label_text
= pNextState
->getStateName();
984 nodeMap
.insert(pNextState
,
985 agnode(g
, label_text
.toUtf8().data()));
986 queue
.append(pNextState
);
990 FSAMorphemeList
* labelList
= pEdge
->GetMorphemes();
991 QString label
= join(*labelList
, QString(' '));
992 if(label
.length() > 64)
993 label
= QString("%1 ... %2 (%3)")
994 .arg(labelList
->front()->toStr())
995 .arg(labelList
->back()->toStr())
996 .arg(labelList
->size());
999 Agedge_t
* e1
= agedge(g
, nodeMap
.value(pCurState
),
1000 nodeMap
.value(pNextState
));
1001 agedgeattr(g
, const_cast<char*>("label"),
1002 const_cast<char*>(""));
1003 agset(e1
, const_cast<char*>("label"),
1004 const_cast<char*>(label
.toUtf8().data()));
1008 // start state has >=2 outbound paths
1009 GetLVStartState()->GetEdgesOut()->size() >= 2 &&
1011 std::find(this->m_pPath
->begin(),this->m_pPath
->end(),pEdge
)
1012 != this->m_pPath
->end()
1015 agedgeattr(g
, const_cast<char*>("style"),
1016 const_cast<char*>(""));
1017 agset(e1
, const_cast<char*>("style"),
1018 const_cast<char*>("bold"));
1023 gvLayout(gvc
, g
, const_cast<char*>("dot"));
1024 gvRenderFilename(gvc
, g
, const_cast<char*>("png"),
1025 const_cast<char*>(outputName
));
1027 // cleanup - free Layout
1028 gvFreeLayout(gvc
, g
);
1033 // close output file and free context
1038 m_pImage
= new QPixmap(outputName
);
1042 void FSAListViewItem::DisplayEdgePath(Q3TextEdit
* m_commandLine
)
1045 FSAedge
* pEdge
=this->m_pPath
->front();//GetEdge();
1046 int tablength
= 0; //FIX
1048 m_commandLine
->insert( QString( "Edge %1:\n\n" ).arg(i
++) );
1049 FSAMorphemeList
* morphemes
= pEdge
->GetMorphemes();
1053 FSAMorphemeList::iterator j_iter
= morphemes
->begin();
1054 //for (int j = 0; j < morphemes->size(); )
1055 while(j_iter
!=morphemes
->end())
1057 //read up to 5 morphemes and print to line
1058 for( int k
=0; k
< nColumns
&& j_iter
!= morphemes
->end(); k
++ )
1060 //QString text = morphemes->at(j++)->toStr();//.stripWhiteSpace();
1061 QString text
= (*j_iter
++)->toStr();//.stripWhiteSpace();
1062 m_commandLine
->insert( text
+ "\t" );
1064 // Make tab length depend on the longest
1065 // word and font size
1066 int wordlength
= text
.length() * m_commandLine
->pointSize();
1067 if( tablength
< wordlength
) tablength
= wordlength
;
1069 m_commandLine
->insert( "\n" );
1071 m_commandLine
->insert( "\n" );
1073 //m_commandLine->insert(join(*morphemes, QString(' ')));
1074 FSAedgeList
* edgeList
= pEdge
->GetToState()->GetEdgesOut();
1075 if(!edgeList
->empty())
1076 pEdge
=edgeList
->front();
1079 m_commandLine
->insert("\n\n");
1081 m_commandLine
->setTabStopWidth( tablength
);
1085 void FSA::ResetDisplay()
1087 this->m_searched
=false;
1088 this->m_FSAPathList
.clear();
1090 for (FSAStateList::iterator i
=this->GetStates()->begin();
1091 i
!= this->GetStates()->end(); ++i
)
1093 FSAstate
* pState
= *i
;
1094 pState
->GetPathList()->clear();
1096 pState
->m_DiscoverCount
=0;
1097 pState
->m_stateLabel
=QString("");
1103 double FSA::ComputeDL()
1105 double morphemesDL
=0;
1107 QMap
<QString
,FSAMorpheme
*>::iterator m_it
;
1108 int mSum
=0; // sum morphCounter
1109 for (m_it
= this->m_morphemes
.begin(); m_it
!=m_morphemes
.end(); m_it
++)
1110 mSum
+= m_it
.value()->m_corpusCount
;
1112 QMap
<QString
,double> morphPtrDLMap
;
1113 for (m_it
= this->m_morphemes
.begin(); m_it
!=m_morphemes
.end(); m_it
++)
1115 FSAMorpheme
* pMorph
= m_it
.value();
1117 morphemesDL
+= pMorph
->GetDL(this->m_charCount
);
1119 //lenght of pointer to morpheme
1120 double cnt
= pMorph
->m_corpusCount
; // morphCounter.value(mi.key());
1121 double pDL
= base2log(mSum
/cnt
);
1122 //std::cout << mi.key().toStdString() << " \tDL: " << pDL << std::endl;
1123 morphPtrDLMap
.insert(pMorph
->toStr(), pDL
);
1126 std::cout
<< "morphemes DL: " << morphemesDL
<< std::endl
;
1128 double allStatesDL
=0;
1130 QMap
<FSAedge
*,double> edgePtrDLMap
;
1131 for (FSAStateList::iterator i
= this->m_States
->begin();
1132 i
!= this->m_States
->end(); ++i
)
1134 FSAstate
* pState
= *i
;
1136 for (FSAedgeList::iterator j_iter
=pState
->m_EdgesOut
.begin();
1137 j_iter
!= pState
->m_EdgesOut
.end(); ++j_iter
)
1139 FSAedge
* pEdge
= *j_iter
;
1140 int edgeWordCount
= 0;
1142 for(FSAMorphemeList::iterator k_iter
= pEdge
->GetMorphemes()->begin();
1143 k_iter
!= pEdge
->GetMorphemes()->end();
1146 FSAMorpheme
* pMorph
= *k_iter
;
1147 const QString
& morphStr
= pMorph
->toStr();
1148 edgeWordCount
+= pMorph
->m_corpusCount
;
1149 stateDL
+= morphPtrDLMap
[morphStr
];
1152 double edgePtrDL
= base2log(mSum
/edgeWordCount
);
1154 edgePtrDLMap
.insert(pEdge
,edgePtrDL
);
1156 allStatesDL
+=stateDL
;
1158 std::cout
<< "states DL: " << allStatesDL
<< std::endl
;
1160 return morphemesDL
+ allStatesDL
;