HowManyAreAnalyzed(): use status_user_agent to report progress
[linguistica.git] / FSA.cpp
blob7649c57f8bb6ee844a24aa368f88bd753f98e05a
1 // Implementation of finite-state automaton class
2 // Copyright © 2009 The University of Chicago
3 #include "FSA.h"
5 #include <algorithm>
6 #include <functional>
7 #include <numeric>
8 #include <QTableWidgetItem>
9 #include <QPixmap>
10 #include "MiniLexicon.h"
11 #include "Signature.h"
12 #include "Parse.h"
13 #include "SignatureCollection.h"
14 #include "StemCollection.h"
15 #include "WordCollection.h"
16 #include "SuffixCollection.h"
17 #include "Suffix.h"
18 #include "Stem.h"
19 #include "log2.h"
21 #ifdef USE_GRAPHVIZ
22 #include <graphviz/gvc.h>
23 #endif
25 #define TEST_FUNC_MAX 32
27 #define FSA_DEBUG 1
29 double FSAMorpheme::GetDL(int characterCount) const
31 if (this->str == "NULL")
32 return 0.0;
33 return double(this->str.size()) * base2log(characterCount);
36 bool FSAMorpheme::operator==(const FSAMorpheme & other) const
38 return this->toStr() == other.toStr();
41 namespace linguistica {
42 namespace join_helper {
43 class cat_with_delim : public std::binary_function<
44 QString, const FSAMorpheme*, QString> {
45 const QString& delim;
46 public:
47 cat_with_delim(const QString& d) : delim(d) { }
48 QString operator()(const QString& lhs,
49 const FSAMorpheme* rhs) const
51 QString result = lhs;
52 result.append(delim);
53 result.append(rhs->toStr());
54 return result;
60 QString join(const FSAMorphemeList& v, const QString& delim)
62 using linguistica::join_helper::cat_with_delim;
63 using std::accumulate;
65 FSAMorphemeList::const_iterator iter = v.begin();
66 if (iter == v.end())
67 // empty list
68 return QString();
70 const QString first = (*iter)->toStr();
71 return accumulate(++iter, v.end(), first, cat_with_delim(delim));
74 FSAstate::FSAstate(FSA* pFSA)
75 : m_EdgesOut(), m_EdgesIn(),
76 m_MaxDepth(-1), //m_stateLabel(),
77 m_PathList(), m_DiscoverCount(0),
78 m_stateId(pFSA->m_nextStateId++)
79 { pFSA->AddState(this); }
81 /*int FSAstate::getWordCount() const
83 int sumIn=0,sumOut=0;
85 for(int j=0; j<this->GetEdgesIn()->size();j++) {
86 FSAedge* pEdge = this->GetEdgesIn()->at(j);
87 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
89 sumIn += pEdge->GetMorphemes()->at(k)->m_corpusCount;
93 for(int j=0; j<this->GetEdgesOut()->size();j++) {
94 FSAedge* pEdge = this->GetEdgesOut()->at(j);
95 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
96 sumOut += pEdge->GetMorphemes()->at(k)->m_corpusCount;
99 Q_ASSERT(sumIn==0 || sumOut==0 || sumOut==sumIn);
100 Q_ASSERT(sumOut+sumIn > 0);
102 if(sumIn)
103 return sumIn;
104 else
105 return sumOut;
109 void FSAstate::AddEdgeOut(FSAedge* pEdge)
111 this->m_EdgesOut.push_back(pEdge);
114 void FSAstate::AddEdgeIn(FSAedge* pEdge)
116 this->m_EdgesIn.push_back(pEdge);
118 //----------------------------------------------------------------//
119 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState)
120 : m_Morphemes(), m_FromState(pStartState), m_ToState(pEndState), m_FSA(pFSA)
122 pFSA->AddEdge(this);
125 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState, CParse* pLabels)
126 : m_Morphemes(), // initialized below
127 m_FromState(pStartState),
128 m_ToState(pEndState)
130 pFSA->AddEdge(this);
131 pLabels->CreateQStringList(m_Morphemes);
134 //void FSAedge::AddMorpheme(Morpheme* pMorph)
135 void FSAedge::AddMorpheme(const QString& str, int addedCount)
137 Q_ASSERT(addedCount);
138 //if Morpheme not included in FSA, update
139 if(this->m_FSA->m_morphemes.contains(str))
141 FSAMorpheme * pMorph = this->m_FSA->m_morphemes[str];
142 pMorph->m_corpusCount+=addedCount;
143 } else {
144 Q_ASSERT(1); //should not be called now, we are not adding stems or suffixes
145 FSAMorpheme * pMorph = new FSAMorpheme(str,addedCount);
146 this->m_FSA->m_morphemes.insert(str, pMorph);
149 this->m_Morphemes.push_back( new FSAMorpheme(str,addedCount) );
152 void FSAedge::RemoveMorpheme(FSAMorphemeList::iterator iter)
154 FSAMorpheme* pMorph = *iter;
155 QString str = pMorph->toStr();
156 int count = pMorph->m_corpusCount;
157 //this->GetMorphemes()->remove(pMorph);
158 this->GetMorphemes()->erase(iter);
160 // Do cleanup at FSA level
161 FSAMorpheme * FsaMorph = this->m_FSA->m_morphemes[str];
162 Q_ASSERT(FsaMorph);
163 Q_ASSERT(FsaMorph->m_corpusCount >= count);
164 //std::cout << "Removing: " << pMorph->toStr().toStdString() << " FSA count: " << FsaMorph->m_corpusCount << " This edge count:" << count << std::endl;
165 if(count == FsaMorph->m_corpusCount ) //remove from FSA
166 this->m_FSA->m_morphemes.remove(str);
167 else
168 this->m_FSA->m_morphemes[str]->m_corpusCount-=count;
170 delete pMorph;
173 FSAedge* FSA::DoParallelSplit(FSAedge* pEdge, FSAMorphemeList& morphsToMove)
175 Q_ASSERT(!morphsToMove.empty());
176 //if(morphsToMove.empty()) return NULL;
178 if(morphsToMove.size() == pEdge->GetMorphemes()->size())
180 #ifdef FSA_DEBUG
181 FSAMorphemeList copy1 = *pEdge->GetMorphemes();
182 FSAMorphemeList copy2 = morphsToMove;
183 copy1.sort();
184 copy2.sort();
185 Q_ASSERT(copy1==copy2);
186 #endif
187 return NULL;
189 #ifdef FSA_DEBUG
190 FSAMorphemeList* pEdgeMorphemes = pEdge->GetMorphemes();
191 foreach (FSAMorpheme* pMorph, morphsToMove) {
192 FSAMorphemeList::iterator m_it =
193 std::find(pEdgeMorphemes->begin(),pEdgeMorphemes->end(),pMorph);
194 Q_ASSERT(m_it!=pEdgeMorphemes->end());
196 #endif
197 //just move dynamically allocated FSAMorph object from one edge to another.
198 // No other bookkeeping needed, but watch for this if defn any of FSA classes
199 // change. Note that morpheme counts are tracked at FSA level, but split does
200 // not change these counts.
201 FSAedge* pNewEdge = new FSAedge(this, pEdge->GetFromState(), pEdge->GetToState());
202 foreach (FSAMorpheme *pMorph,morphsToMove) {
203 pEdge->GetMorphemes()->remove(pMorph);
204 pNewEdge->GetMorphemes()->push_back(pMorph);
206 return pNewEdge;
209 void FSA::DoMultParallelSplit( FSAedge* pEdge,
210 std::list<FSAMorphemeList>::iterator first,
211 std::list<FSAMorphemeList>::iterator last )
213 std::list<FSAMorphemeList>::iterator list_it = first;
214 #ifdef FSA_DEBUG
215 FSAMorphemeList all;
216 while(list_it!=last)
218 foreach(FSAMorpheme* pMorph,*list_it++)
219 all.push_back(pMorph);
221 Q_ASSERT(all.size()==pEdge->GetMorphemes()->size());
222 all.sort();
223 FSAMorphemeList copy = *pEdge->GetMorphemes();
224 copy.sort();
225 Q_ASSERT(all==copy);
226 list_it=first; //reset iterator
227 #endif
228 //remove 1 non-empty list, some morphs will stay on pEdge
229 while((*list_it++).size()==0)
231 while(list_it!=last)
233 //this->DoParallelSplit(pEdge,*list_it++);
234 FSAMorphemeList& morphList = *list_it++;
236 if(morphList.size()==0) continue; //ignore empty lists
238 FSAedge* pNewEdge = new FSAedge(this, pEdge->GetFromState(), pEdge->GetToState());
239 foreach (FSAMorpheme *pMorph,morphList) {
240 pEdge->GetMorphemes()->remove(pMorph);
241 pNewEdge->GetMorphemes()->push_back(pMorph);
247 //breaks an edge in
248 FSAedge* FSA::DoSeriesSplit(FSAedge* pEdge,unsigned int len, bool suffix)
250 Q_ASSERT(!pEdge->GetMorphemes()->empty());
252 QString firstMorph = pEdge->GetMorphemes()->front()->toStr();
253 QString commonMorphStr = ( suffix ? firstMorph.right(len) : firstMorph.left(len) );
255 FSAstate* pNewState = new FSAstate(this);
256 FSAedge* pEdge1 = new FSAedge(this, pEdge->GetFromState(),pNewState);
257 FSAedge* pEdge2 = new FSAedge(this, pNewState,pEdge->GetToState());
259 FSAedge *commonEdge, *multiEdge;
260 if(suffix) {
261 multiEdge=pEdge1;
262 commonEdge=pEdge2;
263 } else {
264 commonEdge=pEdge1;
265 multiEdge=pEdge2;
267 unsigned int totalCount = 0;
269 FSAMorphemeList::iterator m_iter = pEdge->GetMorphemes()->begin();
270 while(m_iter != pEdge->GetMorphemes()->end())
272 FSAMorpheme * pMorph = *m_iter;
273 QString morphStr = pMorph->toStr();
274 QString curCommon = ( suffix ? morphStr.right(len) : morphStr.left(len) );
275 Q_ASSERT( curCommon == commonMorphStr );
276 totalCount += pMorph->m_corpusCount;
278 unsigned int len_other = morphStr.length()-len;
279 QString cur_multi = ( suffix ? morphStr.left(len_other) : morphStr.right(len_other) );
280 //add stem morphemes
281 multiEdge->AddMorpheme(cur_multi,pMorph->m_corpusCount);
282 pEdge->RemoveMorpheme(m_iter++);
284 //add affix morpheme
285 Q_ASSERT(pEdge->GetMorphemes()->empty());
286 commonEdge->AddMorpheme(commonMorphStr,totalCount);
288 this->RemoveEdge(pEdge);
289 return pEdge1;
293 FSA::FSA()
294 : m_States(), m_Edges(), m_StartStates(),
295 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
296 m_lexicon(), m_charCount(0),
297 m_morphemes(),
298 m_nextStartStateId(1), m_nextStateId(1) { }
300 FSA::FSA(const FSA& fsa)
301 : m_States(fsa.m_States), m_Edges(fsa.m_Edges), m_StartStates(fsa.m_StartStates),
302 m_FSAPathList(fsa.m_FSAPathList), m_searched(false), m_MaxPathLength(0),
303 m_lexicon(), m_charCount(0),
304 m_morphemes(fsa.m_morphemes),
305 m_nextStartStateId(1), m_nextStateId(1) { }
307 FSA::FSA(CMiniLexicon* pMiniLexicon)
308 : m_States(new FSAStateList),
309 m_Edges(new FSAedgeList),
310 m_StartStates(new FSAStateList),
311 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
312 m_lexicon(pMiniLexicon),
313 m_charCount(pMiniLexicon->GetNumberOfCharacterTypes()),
314 m_morphemes(),
315 m_nextStartStateId(1), m_nextStateId(1)
317 //m_StartState->setMaxDepth(0);
318 // this->AddState(m_StartState);
320 //int MinimumNumberOfStemsForDisplay = 2;
321 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
322 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
324 for (int i = 0; i < pMiniLexicon->GetSignatures()->GetCount(); i++)
326 CSignature* pSig = pMiniLexicon->GetSignatures()->GetAt(i);
328 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
329 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
331 AddSignature(pSig);
335 FSA::~FSA()
337 while(!m_Edges->empty())
339 FSAedge* pEdge = m_Edges->front();
340 m_Edges->pop_front();
341 while(!(pEdge->GetMorphemes()->empty()))
343 delete pEdge->GetMorphemes()->front();
344 pEdge->GetMorphemes()->pop_front();
346 delete pEdge;
349 while(!m_States->empty())
351 FSAstate* pState = m_States->front();
352 m_States->pop_front();
353 while(!pState->m_PathList.empty())
355 delete pState->m_PathList.front();
356 pState->m_PathList.pop_front();
358 delete pState;
361 QMap<QString, FSAMorpheme*>::iterator i = m_morphemes.begin();
362 while (i != m_morphemes.end()) {
363 delete i.value();
364 ++i;
367 delete this->m_States;
368 delete this->m_Edges;
369 delete this->m_StartStates;
372 void FSA::AddSignature(CSignature* pSig)
374 //std::cout << "---- Signature " <<pSig->GetNumberOfStems()<<" "<< pSig->GetNumberOfAffixes()<< std::endl;
376 //add start state
377 FSAstate* pStart = new FSAstate(this);
378 pStart->setMaxDepth(0);
379 this->m_StartStates->push_back(pStart);
381 FSAstate* pMiddle = new FSAstate(this);
382 FSAstate* pEnd = new FSAstate(this);
385 CStringSurrogate ssStem;
386 FSAedge* pEdge1 = new FSAedge(this, pStart, pMiddle);
387 for (int i = 0; i < pSig->GetNumberOfStems(); i++)
389 CStem* pStem = pSig->GetStemPtrList()->at(i);
390 QString str = pStem->Display();
392 int corpusCount = pStem->GetCorpusCount();
393 Q_ASSERT(corpusCount>0);
394 if(str.size()>0)
396 //std::cout << "\tStem:"<<str.toStdString() <<"."<< corpusCount <<std::endl;
397 pEdge1->AddMorpheme( str,corpusCount); //TODO
401 FSAedge* pEdge2 = new FSAedge(this, pMiddle, pEnd);
402 for (int i = 0; i < pSig->GetNumberOfAffixes(); i++)
404 CSuffix* pSuffix= pSig->GetSuffixPtrList()->at(i);
405 QString suffixStr = pSuffix->Display();
406 int corpusCount = 0;
408 //for(int j=0;j<pEdge1->GetMorphemes()->size();j++)
409 FSAMorphemeList::iterator j_iter = pEdge1->GetMorphemes()->begin();
410 while(j_iter != pEdge1->GetMorphemes()->end())
412 FSAMorpheme *pMorph = *j_iter++;
413 int stemCount = pMorph->m_corpusCount;
414 Q_ASSERT(stemCount>0);
415 QString analyzedWord = pMorph->toStr();
416 if (suffixStr!="NULL")
417 analyzedWord.append(suffixStr);
419 CWordCollection* wc = this->m_lexicon->GetWords();
421 CStem* pStem = ( *wc^=analyzedWord);
422 Q_ASSERT(pStem);
424 corpusCount+=(pStem->GetCorpusCount());
427 if(suffixStr.size()>0)
429 //std::cout << "\t\tSuffix "<< suffixStr.toStdString() <<"."<<corpusCount<<std::endl;
430 pEdge2->AddMorpheme(suffixStr, corpusCount);
434 //----------------------------------------------------------------//
437 void FSA::AddState(FSAstate* pState)
439 m_States->push_back(pState);
442 void FSA::AddEdge(FSAedge* pEdge)
444 m_Edges->push_back(pEdge);
445 pEdge->GetFromState()->AddEdgeOut(pEdge);
446 pEdge->GetToState()->AddEdgeIn(pEdge);
449 void FSA::RemoveEdge(FSAedge* pEdge)
451 // remove all morphemes from edge, this does bookkeeping at FSA level
452 // currently unused, TestFunc only removes edge when Morph count is zero
453 if (pEdge->GetMorphemes()->size() > 0)
455 FSAMorphemeList::iterator iter = pEdge->GetMorphemes()->begin();
456 while(iter != pEdge->GetMorphemes()->end())
457 pEdge->RemoveMorpheme(iter++);
460 this->m_Edges->remove(pEdge);
461 pEdge->GetFromState()->m_EdgesOut.remove(pEdge);
462 pEdge->GetToState()->m_EdgesIn.remove(pEdge);
464 //delete all paths that contain this edge
465 for(std::list<FSAedgeList*>::iterator i=this->m_FSAPathList.begin();
466 i != this->m_FSAPathList.end(); ++i)
468 FSAedgeList* pPath = *i;
469 //if edge in path: linear search, paths will be short
470 if( std::find(pPath->begin(),pPath->end(),pEdge) != pPath->end() );
472 this->m_FSAPathList.remove(pPath);
473 delete pPath;
476 delete pEdge;
479 void FSA::FSATestFunc()
481 CSuffixCollection* lexSuffixes = this->m_lexicon->GetSuffixes();
482 lexSuffixes->Sort(COUNT);
483 CStemCollection* lexStems = this->m_lexicon->GetStems();
485 for (int suf_idx=0; suf_idx<lexSuffixes->GetCount(); ++suf_idx)
487 if(suf_idx>=TEST_FUNC_MAX) {
488 //std::cout << "BREAK" << std::endl;
489 break;
491 CSuffix* pSuffix = lexSuffixes->GetAtSort(suf_idx);
492 //std::cout << "Suffix:" << pSuffix->Display().toStdString() << std::endl;
493 QString suffixStr = pSuffix->Display();
495 if(suffixStr == "NULL") continue;
497 //make copy now, list will be altered in while-loop
498 FSAedgeList edges = FSAedgeList(*this->m_Edges);
500 FSAedgeList::iterator edge_it = edges.begin();
501 while(edge_it != edges.end())
503 FSAedge* pEdge = *edge_it;
504 ++edge_it; //increment now, pEdge may be deleted
506 FSAMorphemeList morphemesToReanalyze;
507 FSAMorphemeList * morphemes = pEdge->GetMorphemes();
508 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
509 foreach (FSAMorpheme* pMorph, *morphemes)
511 QString edgeStr = pMorph->toStr();
512 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
514 if (edgeStr.endsWith(suffixStr) && edgeStr!=suffixStr)
516 QString stemStr = edgeStr.left(edgeStr.length() - suffixStr.length());
518 CStem * pStem = (*lexStems^=stemStr);
519 //if stemStr in Lexicon
520 if(pStem)
522 //if stem takes this suffix
523 CSignature * pSig = pStem->GetSuffixSignature();
524 for (int k = 0;k<pSig->GetSuffixPtrList()->count();k++)
526 CSuffix* pSuffix= pSig->GetSuffixPtrList()->at(k);
527 if(pSuffix->Display() == suffixStr )
529 morphemesToReanalyze.push_back(pMorph);
530 break;
534 } //else skip this morpheme
535 }//edge-morphemes
536 if(!morphemesToReanalyze.empty()) {
537 if( morphemes->size()==morphemesToReanalyze.size() ) {
538 //just do parallel split
539 this->DoSeriesSplit(pEdge,suffixStr.length(),true);
540 } else {
541 FSAedge* pNewEdge = this->DoParallelSplit(pEdge,morphemesToReanalyze);
542 this->DoSeriesSplit(pNewEdge,suffixStr.length(),true);
545 }//edges
546 }//suffix morphemes
549 void FSA::DoSearch()
551 if(this->m_searched)
552 return;
554 this->m_searched = true;
556 // get 'depth' of FSA
557 this-> m_MaxPathLength=0;
558 FSAStateList queue;
559 //queue.append(this->m_StartState);
560 for ( FSAStateList::iterator i=this->m_StartStates->begin();
561 i != this->m_StartStates->end(); ++i)
563 FSAstate* pStartState=*i; //this->m_StartStates->at(i);
565 //add empty path
566 FSAedgeList* pPath = new FSAedgeList();
567 pStartState->addPath(pPath);
569 queue.push_back(pStartState);
571 while(!queue.empty())
573 //FSAstate* pState = queue.takeFirst();
574 FSAstate* pState = queue.front();
575 queue.pop_front();
577 FSAedgeList* edgeList = pState->GetEdgesOut();
579 //if at leaf, add paths to FSA and do not search outbound edges
580 if(edgeList->size() == 0)
582 //this->m_FSAPathList += *(pState->GetPathList());
583 for(std::list<FSAedgeList*>::iterator j=pState->GetPathList()->begin();
584 j != pState->GetPathList()->end(); ++j)
585 this->m_FSAPathList.push_back(*j);
587 else
589 //iter over outbound edges
590 for (FSAedgeList::iterator j=edgeList->begin(); j!=edgeList->end(); ++j)
592 FSAedge* pEdge = *j;
593 FSAstate* pChildState = pEdge->GetToState();
594 pChildState->m_DiscoverCount++;
596 //add paths to pChildState
597 for(std::list<FSAedgeList*>::iterator l=pState->GetPathList()->begin();
598 l != pState->GetPathList()->end(); ++l)
600 FSAedgeList* pPath = *l;
602 FSAedgeList* pNewPath = new FSAedgeList(*pPath);
603 pNewPath->push_back(pEdge);
604 pChildState->addPath(pNewPath);
607 int maxChildDepth = pChildState->getMaxDepth();
609 //set length of max path to child
610 if(maxChildDepth==-1 || pState->getMaxDepth()+1 > maxChildDepth)
612 pChildState->setMaxDepth(pState->getMaxDepth()+1);
613 maxChildDepth = pChildState->getMaxDepth();
616 if (maxChildDepth > m_MaxPathLength)
618 m_MaxPathLength=maxChildDepth;
621 Q_ASSERT(pChildState->m_DiscoverCount <= pChildState->GetEdgesIn()->size());
623 if(pChildState->m_DiscoverCount == pChildState->GetEdgesIn()->size())
624 queue.push_back(pChildState);
629 std::cout<< "FSA DL:"<<this->ComputeDL() << std::endl;
632 int FSA::GetRobustness(FSAedgeList * pPath)
634 QStringList pathWordsWorking;
635 int pathCharCount = 0;
636 FSAMorphemeList* pEdge0Morphemes = pPath->front()->GetMorphemes();
637 FSAMorphemeList::iterator j_iter = pEdge0Morphemes->begin();
638 //for(int j=0;j<pEdge0Morphemes->size();j++)
639 while(j_iter!=pEdge0Morphemes->end())
641 FSAMorpheme* pMorph = *j_iter++;
642 pathCharCount+=pMorph->toStr().length();
643 pathWordsWorking.push_back(pMorph->toStr());
646 FSAedgeList::iterator i = pPath->begin();
647 ++i; //skip first edge, already got these morphemes (above)
648 while( i != pPath->end() )
650 FSAedge* pEdge = *i++;
651 QStringList newWords;
652 FSAMorphemeList* mList = pEdge->GetMorphemes();
653 FSAMorphemeList::iterator j_iter = mList->begin();
654 //for(int j=0;j<mList->size();j++)
655 while (j_iter!=mList->end())
657 QString morph = (*j_iter++)->toStr();
658 //std::cout<< "At morpheme:"<< morph.toStdString() <<std::endl;
659 pathCharCount += morph.length();
660 for(int k=0;k<pathWordsWorking.size();k++)
662 QString newWord = pathWordsWorking.at(k) + morph;
663 //std::cout<< " created: "<< newWord.toStdString() <<std::endl;
664 newWords.append(newWord);
667 pathWordsWorking.clear();
668 for(int j=0;j<newWords.size();j++)
669 pathWordsWorking.append(newWords.at(j));
670 }//end edge
672 int charsReplacedCount = 0;
673 for(int i=0;i<pathWordsWorking.size();i++)
675 charsReplacedCount+=pathWordsWorking.at(i).length();
678 //std::cout<< " charsReplaced: "<< charsReplacedCount << " pathCharCount: "<< pathCharCount << std::endl;
679 return charsReplacedCount-pathCharCount;
682 void FSAstate::OutputXfst (QTextStream& outf)
684 FSAstate* pState=this;
685 if ( pState->GetEdgesIn()->size() == 0)//start state
687 outf << "\ndefine "<< this->getStateName() << " 0; "<<endl;
689 else
691 outf << "define " << this->getStateName() << " ";
692 if (pState->GetEdgesIn()->size()>1) outf << " [ ";
694 for (FSAedgeList::iterator j_iter=pState->GetEdgesIn()->begin();
695 j_iter != pState->GetEdgesIn()->end(); ++j_iter)
697 if (j_iter != pState->GetEdgesIn()->begin())
698 outf << "|";
699 FSAedge* pEdge = *j_iter;
700 outf << pEdge->GetFromState()->getStateName();
702 if (pEdge->GetMorphemes()->size()==1)
704 FSAMorpheme* pm = pEdge->GetMorphemes()->front();
705 const QString ss = pm->toStr();
706 if (pm->toStr()=="NULL")
707 outf << " 0 ";
708 else
709 outf << " {" << ss.toAscii() << "} ";
711 else
713 //outf << QString("S%1 ").arg(pEdge->GetFromState()->m_stateId);
714 outf << " [ ";
715 FSAMorphemeList::iterator k_iter = pEdge->GetMorphemes()->begin();
716 //for(int k=0;k<pEdge->GetMorphemes()->size();k++)
717 while(k_iter != pEdge->GetMorphemes()->end())
719 if (k_iter!=pEdge->GetMorphemes()->begin()) outf << "|";
720 FSAMorpheme* pm = *k_iter++;
721 const QString ss = pm->toStr();
722 if (pm->toStr()=="NULL")
723 outf << " 0 ";
724 else
725 outf << " {" << ss.toAscii() << "} ";
727 outf << " ] ";
730 if (pState->GetEdgesIn()->size()>1) outf << " ]";
731 outf << ";" << endl;
735 void FSAstate::SearchEdgeXfst(QTextStream& outf, std::set<FSAstate*>& discovered)
737 for(FSAedgeList::iterator i=this->GetEdgesIn()->begin();
738 i != this->GetEdgesIn()->end(); i++)
740 FSAstate* pStateIn = (*i)->GetFromState();
741 if(discovered.count(pStateIn)==0) //if state not discovered yet
742 pStateIn->SearchEdgeXfst(outf,discovered);
744 this->OutputXfst(outf);
747 void FSA::OutputFSAXfst ( const QString& filename )
749 QFile file( filename );
751 FSAStateList topoSortedStates;
752 std::set<FSAstate*> discovered;
754 //find states with no outbound edges
755 FSAStateList endStates;
756 for (FSAStateList::iterator i=this->GetStates()->begin();
757 i != this->GetStates()->end();++i)
759 if ((*i)->GetEdgesOut()->size()==0)
760 endStates.push_back(*i);
764 if( file.open( IO_WriteOnly ) )
766 QTextStream outf( &file ); //Should be ascii file, not unicode
768 outf << "# " << endl;
769 outf << "# File: " << filename << endl;
770 outf << "# " << endl;
772 //for (int i=0;i<this->GetStates()->size();i++)
773 for (FSAStateList::iterator i = endStates.begin();
774 i!=endStates.end(); ++i)
776 (*i)->SearchEdgeXfst(outf,discovered);
779 outf << endl;
780 //outf << "union net" << endl << endl;
781 //outf << "print words" << endl << endl;
783 file.close();
787 void FSA::FSAListDisplay(Q3ListView* pView,
788 QMap<class QString, class QString>* /* out filter not supported yet */,
789 bool /* “analyzed words only” flag for future use */)
791 pView->setRootIsDecorated( FALSE );
792 pView->setSorting(1);
793 // Remove all previous columns
794 while( pView->columns() ) pView->removeColumn( 0 );
795 pView->clear();
797 // get 'depth' of FSA
799 this->DoSearch();
801 pView->setSorting(0); //sort on 0th column
803 // Add Column headers
804 //std::cout<<"Graph Max: "<<m_MaxPathLength <<std::endl;
805 QStringList headers;
806 for (int i = 0;i<this->m_MaxPathLength;i++)
808 pView->addColumn( QString( "Edge %1" ).arg(i));
810 pView->addColumn( QString( "Robustness" ));
811 //m_pLexicon->GetDocument()->setStatusBar1( "Displaying FSA" );
813 //associate each start state with it's parent list view item (row).
814 // If start state only has one outbound path then there will
815 // be only one lv item, otherwise lv items/paths will be associated
816 // with a placeholder parent path
817 QMap<FSAstate*,FSAListViewItem*> lvMap;
818 //create placeholder lvItem for start states with >=1 path
819 for(std::list<FSAedgeList*>::iterator i=this->m_FSAPathList.begin();
820 i != this->m_FSAPathList.end(); ++i)
822 FSAedgeList * pPath= *i;
823 FSAstate * pStartState = pPath->front()->GetFromState();
824 if(!lvMap.contains(pStartState) && pStartState->GetEdgesOut()->size()>1)
826 FSAListViewItem * lvtop = new FSAListViewItem(pView,pPath);
827 lvtop->setVisible(TRUE);
828 lvtop->setOpen(TRUE);
829 lvtop->setText(0,"All paths: ");
830 lvMap.insert(pStartState,lvtop);
834 //store totaled robustness values for subgraph starting at each start state
835 QMap<FSAstate*,int> robustnessMap;
837 //Add Item for each start state
839 for (int i=0;i<this->m_StartStates->size();i++)
841 FSAstate* pStartState = m_StartStates->at(i);
843 FSAListViewItem * pStartStateItem= new FSAListViewItem(pView, this->m_FSAPathList.at(0)); //LEAK
844 pStartStateItem->setVisible(TRUE);
845 pStartStateItem->setOpen(TRUE);
847 lvMap.insert(pStartState,pStartStateItem);
850 //add every path as row to list view
851 for(std::list<FSAedgeList*>::iterator i=this->m_FSAPathList.begin();
852 i != this->m_FSAPathList.end(); ++i)
854 FSAedgeList* pPath = *i;
856 int robustness = FSA::GetRobustness(pPath);
857 QString robustnessStr = QString("%1").arg(robustness,16);
859 FSAListViewItem *lvItem;
860 //get parent list view item
861 FSAstate* pStartState=pPath->front()->GetFromState();
862 FSAListViewItem * pParent=lvMap.value(pStartState);
863 if(pParent==NULL)
865 lvItem = new FSAListViewItem(pView, pPath);
866 lvMap.insert(pStartState,lvItem);
868 else
870 lvItem = new FSAListViewItem(pParent,pPath);
872 int oldVal = robustnessMap.value(pStartState);
873 robustnessMap.insert(pStartState,robustness+oldVal); //score of lvItem,pPath
876 lvItem->setVisible(TRUE);
877 lvItem->setOpen(TRUE);
879 int col_idx=0;
880 // first column
881 for(FSAedgeList::iterator i=pPath->begin(); i!=pPath->end(); ++i)
883 FSAedge* pEdge = *i;
884 FSAMorphemeList* labelList = pEdge->GetMorphemes();
885 QString label = join(*labelList, QString(' '));
886 if(label.length() > 64)
887 label= QString("%1 ... %2 (%3)")
888 .arg(labelList->front()->toStr())
889 .arg(labelList->back()->toStr())
890 .arg(labelList->size());
892 //populate table
893 lvItem->setText(col_idx,label+" ");
895 //if this edge spans multiple columns,
896 //then columns / use whitespace
897 col_idx = pEdge->GetToState()->getMaxDepth();
899 //iterate
900 //FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
901 //pEdge = ((edgeList->isEmpty())? NULL : edgeList->first());
903 lvItem->setText(m_MaxPathLength,robustnessStr);
906 QMap<FSAstate*,FSAListViewItem*>::iterator lvm_it = lvMap.begin();
907 while (lvm_it != lvMap.end())
909 FSAstate* pState = lvm_it.key();
910 if(pState->GetEdgesOut()->size() > 1)
912 int robVal = robustnessMap.value(pState);
913 QString robustnessStr = QString("%1").arg(robVal,16);
914 lvm_it.value()->setText(m_MaxPathLength,robustnessStr);
917 lvm_it++;
921 //--------------------------------------------------------//
923 FSAListViewItem::~FSAListViewItem()
925 if(m_pImage) delete m_pImage;
928 QPixmap* FSAListViewItem::GetQPixmap()
930 if (m_pImage == 0)
931 build_pixmap();
933 return m_pImage;
936 #ifdef USE_GRAPHVIZ
937 void FSAListViewItem::build_pixmap()
939 // Init graph
940 const char outputName[] = "tmp.png";
941 QMap<FSAstate*,Agnode_t*> nodeMap;
943 // generate new image
944 GVC_t* gvc = gvContext();
945 graph_t* g = agopen(const_cast<char*>("tmpgraph"), AGDIGRAPHSTRICT);
947 // left-to-right orientation
948 agraphattr(g, const_cast<char*>("rankdir"), const_cast<char*>(""));
949 agset(g, const_cast<char*>("rankdir"), const_cast<char*>("LR"));
951 /*QString& start_state_name = this->GetLVStartState()->m_stateLabel;
952 if (start_state_name.isEmpty())
953 //start_state_name = "S";
954 start_state_name = this->GetLVStartState()->getStateName();*/
956 // create new Agnode_t for start state
957 nodeMap.insert(this->GetLVStartState(),
958 agnode(g, this->GetLVStartState()->getStateName().toUtf8().data()));
960 QList<FSAstate*> queue;
961 queue.append(this->GetLVStartState());
963 //now do search
964 //int j=1;
965 while (!queue.isEmpty()) {
966 FSAstate* pCurState = queue.takeFirst();
968 FSAedgeList& edges = *pCurState->GetEdgesOut();
969 for (FSAedgeList::iterator i = edges.begin(); i != edges.end(); ++i) {
970 FSAedge* pEdge = *i;
971 FSAstate* pNextState = pEdge->GetToState();
973 //label node, if does not already have a label
974 /*if (pNextState->m_stateLabel.isEmpty())
976 //QString("N%1").arg(j++);
977 ++j;
978 pNextState->m_stateLabel = QString("S%1").arg(pNextState->m_stateId);
981 if (!nodeMap.contains(pNextState)) {
982 //QString label_text = pNextState->m_stateLabel;
983 QString label_text = pNextState->getStateName();
984 nodeMap.insert(pNextState,
985 agnode(g, label_text.toUtf8().data()));
986 queue.append(pNextState);
989 // make edge label
990 FSAMorphemeList* labelList = pEdge->GetMorphemes();
991 QString label = join(*labelList, QString(' '));
992 if(label.length() > 64)
993 label= QString("%1 ... %2 (%3)")
994 .arg(labelList->front()->toStr())
995 .arg(labelList->back()->toStr())
996 .arg(labelList->size());
998 // make edge
999 Agedge_t* e1 = agedge(g, nodeMap.value(pCurState),
1000 nodeMap.value(pNextState));
1001 agedgeattr(g, const_cast<char*>("label"),
1002 const_cast<char*>(""));
1003 agset(e1, const_cast<char*>("label"),
1004 const_cast<char*>(label.toUtf8().data()));
1006 // make bold if:
1007 if (
1008 // start state has >=2 outbound paths
1009 GetLVStartState()->GetEdgesOut()->size() >= 2 &&
1010 // edge in path
1011 std::find(this->m_pPath->begin(),this->m_pPath->end(),pEdge)
1012 != this->m_pPath->end()
1015 agedgeattr(g, const_cast<char*>("style"),
1016 const_cast<char*>(""));
1017 agset(e1, const_cast<char*>("style"),
1018 const_cast<char*>("bold"));
1023 gvLayout(gvc, g, const_cast<char*>("dot"));
1024 gvRenderFilename(gvc, g, const_cast<char*>("png"),
1025 const_cast<char*>(outputName));
1027 // cleanup - free Layout
1028 gvFreeLayout(gvc, g);
1030 // free graph
1031 agclose(g);
1033 // close output file and free context
1034 gvFreeContext(gvc);
1036 // save image
1037 delete m_pImage;
1038 m_pImage = new QPixmap(outputName);
1040 #endif
1042 void FSAListViewItem::DisplayEdgePath(Q3TextEdit* m_commandLine)
1044 int i = 0;
1045 FSAedge* pEdge=this->m_pPath->front();//GetEdge();
1046 int tablength = 0; //FIX
1047 while( pEdge ){
1048 m_commandLine->insert( QString( "Edge %1:\n\n" ).arg(i++) );
1049 FSAMorphemeList* morphemes = pEdge->GetMorphemes();
1051 //display as table
1052 int nColumns = 5;
1053 FSAMorphemeList::iterator j_iter = morphemes->begin();
1054 //for (int j = 0; j < morphemes->size(); )
1055 while(j_iter!=morphemes->end())
1057 //read up to 5 morphemes and print to line
1058 for( int k=0; k < nColumns && j_iter != morphemes->end(); k++ )
1060 //QString text = morphemes->at(j++)->toStr();//.stripWhiteSpace();
1061 QString text = (*j_iter++)->toStr();//.stripWhiteSpace();
1062 m_commandLine->insert( text + "\t" );
1064 // Make tab length depend on the longest
1065 // word and font size
1066 int wordlength = text.length() * m_commandLine->pointSize();
1067 if( tablength < wordlength ) tablength = wordlength;
1069 m_commandLine->insert( "\n" );
1071 m_commandLine->insert( "\n" );
1073 //m_commandLine->insert(join(*morphemes, QString(' ')));
1074 FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
1075 if(!edgeList->empty())
1076 pEdge=edgeList->front();
1077 else
1078 pEdge=NULL;
1079 m_commandLine->insert("\n\n");
1081 m_commandLine->setTabStopWidth( tablength );
1085 void FSA::ResetDisplay()
1087 this->m_searched=false;
1088 this->m_FSAPathList.clear();
1090 for (FSAStateList::iterator i=this->GetStates()->begin();
1091 i != this->GetStates()->end(); ++i)
1093 FSAstate* pState = *i;
1094 pState->GetPathList()->clear();
1095 //CHEAP!!
1096 pState->m_DiscoverCount=0;
1097 pState->m_stateLabel=QString("");
1103 double FSA::ComputeDL()
1105 double morphemesDL=0;
1107 QMap<QString,FSAMorpheme*>::iterator m_it;
1108 int mSum=0; // sum morphCounter
1109 for (m_it = this->m_morphemes.begin(); m_it!=m_morphemes.end(); m_it++)
1110 mSum += m_it.value()->m_corpusCount;
1112 QMap<QString,double> morphPtrDLMap;
1113 for (m_it = this->m_morphemes.begin(); m_it!=m_morphemes.end(); m_it++)
1115 FSAMorpheme* pMorph = m_it.value();
1117 morphemesDL += pMorph->GetDL(this->m_charCount);
1119 //lenght of pointer to morpheme
1120 double cnt = pMorph->m_corpusCount; // morphCounter.value(mi.key());
1121 double pDL = base2log(mSum/cnt);
1122 //std::cout << mi.key().toStdString() << " \tDL: " << pDL << std::endl;
1123 morphPtrDLMap.insert(pMorph->toStr(), pDL);
1126 std::cout << "morphemes DL: " << morphemesDL << std::endl;
1128 double allStatesDL=0;
1130 QMap<FSAedge*,double> edgePtrDLMap;
1131 for (FSAStateList::iterator i = this->m_States->begin();
1132 i != this->m_States->end(); ++i)
1134 FSAstate* pState = *i;
1135 double stateDL = 0;
1136 for (FSAedgeList::iterator j_iter=pState->m_EdgesOut.begin();
1137 j_iter != pState->m_EdgesOut.end(); ++j_iter)
1139 FSAedge* pEdge = *j_iter;
1140 int edgeWordCount = 0;
1142 for(FSAMorphemeList::iterator k_iter = pEdge->GetMorphemes()->begin();
1143 k_iter != pEdge->GetMorphemes()->end();
1144 k_iter++)
1146 FSAMorpheme* pMorph = *k_iter;
1147 const QString & morphStr = pMorph->toStr();
1148 edgeWordCount += pMorph->m_corpusCount;
1149 stateDL += morphPtrDLMap[morphStr];
1152 double edgePtrDL = base2log(mSum/edgeWordCount);
1153 stateDL+=edgePtrDL;
1154 edgePtrDLMap.insert(pEdge,edgePtrDL);
1156 allStatesDL+=stateDL;
1158 std::cout << "states DL: " << allStatesDL << std::endl;
1160 return morphemesDL + allStatesDL;