CMiniLexicon::FindMajorSignatures(): use log file routines
[linguistica.git] / FSA.cpp
blob628d51716831a1a1f00e1ac8146d347042f7f4fe
1 // Implementation of finite-state automaton class
2 // Copyright © 2009 The University of Chicago
3 #include "FSA.h"
5 #include <algorithm>
6 #include <functional>
7 #include <numeric>
8 #include <QTableWidgetItem>
9 #include <QPixmap>
10 #include "MiniLexicon.h"
11 #include "Signature.h"
12 #include "Parse.h"
13 #include "SignatureCollection.h"
14 #include "StemCollection.h"
15 #include "WordCollection.h"
16 #include "SuffixCollection.h"
17 #include "Suffix.h"
18 #include "Stem.h"
19 #include "log2.h"
21 #include "PrefixCollection.h"
22 #include "Prefix.h"
24 #ifdef USE_GRAPHVIZ
25 #include <graphviz/gvc.h>
26 #endif
28 #define TEST_FUNC_MAX 32
30 #define FSA_DEBUG 1
32 double FSAMorpheme::GetDL(int characterCount) const
34 if (this->str == "NULL")
35 return 0.0;
36 return double(this->str.size()) * base2log(characterCount);
39 bool FSAMorpheme::operator==(const FSAMorpheme & other) const
41 return this->toStr() == other.toStr();
44 namespace linguistica {
45 namespace join_helper {
46 class cat_with_delim : public std::binary_function<
47 QString, const FSAMorpheme*, QString> {
48 const QString& delim;
49 public:
50 cat_with_delim(const QString& d) : delim(d) { }
51 QString operator()(const QString& lhs,
52 const FSAMorpheme* rhs) const
54 QString result = lhs;
55 result.append(delim);
56 result.append(rhs->toStr());
57 return result;
63 QString join(const FSAMorphemeList& v, const QString& delim)
65 using linguistica::join_helper::cat_with_delim;
66 using std::accumulate;
68 FSAMorphemeList::const_iterator iter = v.begin();
69 if (iter == v.end())
70 // empty list
71 return QString();
73 const QString first = (*iter)->toStr();
74 return accumulate(++iter, v.end(), first, cat_with_delim(delim));
77 FSAstate::FSAstate(FSA* pFSA)
78 : m_EdgesOut(), m_EdgesIn(),
79 m_MaxDepth(-1), //m_stateLabel(),
80 m_PathList(), m_DiscoverCount(0),
81 m_stateId(pFSA->m_nextStateId++)
82 { pFSA->AddState(this); }
84 /*int FSAstate::getWordCount() const
86 int sumIn=0,sumOut=0;
88 for(int j=0; j<this->GetEdgesIn()->size();j++) {
89 FSAedge* pEdge = this->GetEdgesIn()->at(j);
90 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
92 sumIn += pEdge->GetMorphemes()->at(k)->m_corpusCount;
96 for(int j=0; j<this->GetEdgesOut()->size();j++) {
97 FSAedge* pEdge = this->GetEdgesOut()->at(j);
98 for(int k=0; k<pEdge->GetMorphemes()->size(); k++)
99 sumOut += pEdge->GetMorphemes()->at(k)->m_corpusCount;
102 Q_ASSERT(sumIn==0 || sumOut==0 || sumOut==sumIn);
103 Q_ASSERT(sumOut+sumIn > 0);
105 if(sumIn)
106 return sumIn;
107 else
108 return sumOut;
112 void FSAstate::AddEdgeOut(FSAedge* pEdge)
114 this->m_EdgesOut.push_back(pEdge);
117 void FSAstate::AddEdgeIn(FSAedge* pEdge)
119 this->m_EdgesIn.push_back(pEdge);
121 //----------------------------------------------------------------//
122 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState)
123 : m_Morphemes(), m_FromState(pStartState), m_ToState(pEndState), m_FSA(pFSA)
125 pFSA->AddEdge(this);
128 FSAedge::FSAedge(FSA* pFSA, FSAstate* pStartState, FSAstate* pEndState, CParse* pLabels)
129 : m_Morphemes(), // initialized below
130 m_FromState(pStartState),
131 m_ToState(pEndState)
133 pFSA->AddEdge(this);
134 pLabels->CreateQStringList(m_Morphemes);
137 //void FSAedge::AddMorpheme(Morpheme* pMorph)
138 void FSAedge::AddMorpheme(const QString& str, int addedCount)
140 Q_ASSERT(addedCount);
141 //if Morpheme not included in FSA, update
142 if(this->m_FSA->m_morphemes.contains(str))
144 FSAMorpheme * pMorph = this->m_FSA->m_morphemes[str];
145 pMorph->m_corpusCount+=addedCount;
146 } else {
147 Q_ASSERT(1); //should not be called now, we are not adding stems or suffixes
148 FSAMorpheme * pMorph = new FSAMorpheme(str,addedCount);
149 this->m_FSA->m_morphemes.insert(str, pMorph);
152 //Q_ASSERT(!this->m_FSA->m_morphemeEdges.values(str).contains(this));
153 //this->m_FSA->m_morphemeEdges.insert(str,this);
154 this->m_Morphemes.push_back( new FSAMorpheme(str,addedCount) );
157 void FSAedge::RemoveMorpheme(FSAMorphemeList::iterator iter)
159 FSAMorpheme* pMorph = *iter;
160 QString str = pMorph->toStr();
161 int count = pMorph->m_corpusCount;
162 //this->GetMorphemes()->remove(pMorph);
163 this->GetMorphemes()->erase(iter);
165 // Do cleanup at FSA level
166 FSAMorpheme * FsaMorph = this->m_FSA->m_morphemes[str];
167 Q_ASSERT(FsaMorph);
168 Q_ASSERT(FsaMorph->m_corpusCount >= count);
169 //std::cout << "Removing: " << pMorph->toStr().toStdString() << " FSA count: " << FsaMorph->m_corpusCount << " This edge count:" << count << std::endl;
170 if(count == FsaMorph->m_corpusCount ) //remove from FSA
171 this->m_FSA->m_morphemes.remove(str);
172 else
173 this->m_FSA->m_morphemes[str]->m_corpusCount-=count;
175 delete pMorph;
178 FSAedge* FSA::DoParallelSplit(FSAedge* pEdge, FSAMorphemeList& morphsToMove)
180 Q_ASSERT(!morphsToMove.empty());
181 //if(morphsToMove.empty()) return NULL;
183 if(morphsToMove.size() == pEdge->GetMorphemes()->size())
185 #ifdef FSA_DEBUG
186 FSAMorphemeList copy1 = *pEdge->GetMorphemes();
187 FSAMorphemeList copy2 = morphsToMove;
188 copy1.sort();
189 copy2.sort();
190 Q_ASSERT(copy1==copy2);
191 #endif
192 return NULL;
194 #ifdef FSA_DEBUG
195 FSAMorphemeList* pEdgeMorphemes = pEdge->GetMorphemes();
196 foreach (FSAMorpheme* pMorph, morphsToMove) {
197 FSAMorphemeList::iterator m_it =
198 std::find(pEdgeMorphemes->begin(),pEdgeMorphemes->end(),pMorph);
199 Q_ASSERT(m_it!=pEdgeMorphemes->end());
201 #endif
202 //just move dynamically allocated FSAMorph object from one edge to another.
203 // No other bookkeeping needed, but watch for this if defn any of FSA classes
204 // change. Note that morpheme counts are tracked at FSA level, but split does
205 // not change these counts.
206 FSAedge* pNewEdge = new FSAedge(this, pEdge->GetFromState(), pEdge->GetToState());
207 foreach (FSAMorpheme *pMorph,morphsToMove) {
208 pEdge->GetMorphemes()->remove(pMorph);
209 pNewEdge->GetMorphemes()->push_back(pMorph);
211 return pNewEdge;
214 void FSA::DoMultParallelSplit( FSAedge* pEdge,
215 std::list<FSAMorphemeList>::iterator first,
216 std::list<FSAMorphemeList>::iterator last )
218 std::list<FSAMorphemeList>::iterator list_it = first;
219 #ifdef FSA_DEBUG
220 FSAMorphemeList all;
221 while(list_it!=last)
223 foreach(FSAMorpheme* pMorph,*list_it++)
224 all.push_back(pMorph);
226 Q_ASSERT(all.size()==pEdge->GetMorphemes()->size());
227 all.sort();
228 FSAMorphemeList copy = *pEdge->GetMorphemes();
229 copy.sort();
230 Q_ASSERT(all==copy);
231 list_it=first; //reset iterator
232 #endif
233 //remove 1 non-empty list, some morphs will stay on pEdge
234 while((*list_it++).size()==0)
236 while(list_it!=last)
238 //this->DoParallelSplit(pEdge,*list_it++);
239 FSAMorphemeList& morphList = *list_it++;
241 if(morphList.size()==0) continue; //ignore empty lists
243 FSAedge* pNewEdge = new FSAedge(this, pEdge->GetFromState(), pEdge->GetToState());
244 foreach (FSAMorpheme *pMorph,morphList) {
245 pEdge->GetMorphemes()->remove(pMorph);
246 pNewEdge->GetMorphemes()->push_back(pMorph);
252 //breaks an edge in
253 FSAedge* FSA::DoSeriesSplit(FSAedge* pEdge,unsigned int len, bool suffix)
255 Q_ASSERT(!pEdge->GetMorphemes()->empty());
257 QString firstMorph = pEdge->GetMorphemes()->front()->toStr();
258 QString commonMorphStr = ( suffix ? firstMorph.right(len) : firstMorph.left(len) );
260 FSAstate* pNewState = new FSAstate(this);
261 FSAedge* pEdge1 = new FSAedge(this, pEdge->GetFromState(),pNewState);
262 FSAedge* pEdge2 = new FSAedge(this, pNewState,pEdge->GetToState());
264 FSAedge *commonEdge, *multiEdge;
265 if(suffix) {
266 multiEdge=pEdge1;
267 commonEdge=pEdge2;
268 } else {
269 commonEdge=pEdge1;
270 multiEdge=pEdge2;
272 unsigned int totalCount = 0;
274 FSAMorphemeList::iterator m_iter = pEdge->GetMorphemes()->begin();
275 while(m_iter != pEdge->GetMorphemes()->end())
277 FSAMorpheme * pMorph = *m_iter;
278 QString morphStr = pMorph->toStr();
279 QString curCommon = ( suffix ? morphStr.right(len) : morphStr.left(len) );
280 Q_ASSERT( curCommon == commonMorphStr );
281 totalCount += pMorph->m_corpusCount;
283 unsigned int len_other = morphStr.length()-len;
284 QString cur_multi = ( suffix ? morphStr.left(len_other) : morphStr.right(len_other) );
285 //add stem morphemes
286 multiEdge->AddMorpheme(cur_multi,pMorph->m_corpusCount);
287 pEdge->RemoveMorpheme(m_iter++);
289 //add affix morpheme
290 Q_ASSERT(pEdge->GetMorphemes()->empty());
291 commonEdge->AddMorpheme(commonMorphStr,totalCount);
293 this->RemoveEdge(pEdge);
294 return pEdge1;
298 FSA::FSA()
299 : m_States(), m_Edges(), m_StartStates(),
300 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
301 m_lexicon(), m_charCount(0),
302 m_morphemes(),
303 m_nextStartStateId(1), m_nextStateId(1) { }
305 FSA::FSA(const FSA& fsa)
306 : m_States(fsa.m_States), m_Edges(fsa.m_Edges), m_StartStates(fsa.m_StartStates),
307 m_FSAPathList(fsa.m_FSAPathList), m_searched(false), m_MaxPathLength(0),
308 m_lexicon(), m_charCount(0),
309 m_morphemes(fsa.m_morphemes),
310 m_nextStartStateId(1), m_nextStateId(1) { }
312 FSA::FSA(CMiniLexicon* pMiniLexicon)
313 : m_States(new FSAStateList),
314 m_Edges(new FSAedgeList),
315 m_StartStates(new FSAStateList),
316 m_FSAPathList(), m_searched(false), m_MaxPathLength(0),
317 m_lexicon(pMiniLexicon),
318 m_charCount(pMiniLexicon->GetNumberOfCharacterTypes()),
319 m_morphemes(),
320 m_nextStartStateId(1), m_nextStateId(1)
322 //m_StartState->setMaxDepth(0);
323 // this->AddState(m_StartState);
325 //int MinimumNumberOfStemsForDisplay = 2;
326 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
327 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
329 for (int i = 0; i < pMiniLexicon->GetSignatures()->GetCount(); i++)
331 CSignature* pSig = pMiniLexicon->GetSignatures()->GetAt(i);
333 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
334 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
336 AddSignature(pSig);
340 FSA::~FSA()
342 while(!m_Edges->empty())
344 FSAedge* pEdge = m_Edges->front();
345 m_Edges->pop_front();
346 while(!(pEdge->GetMorphemes()->empty()))
348 delete pEdge->GetMorphemes()->front();
349 pEdge->GetMorphemes()->pop_front();
351 delete pEdge;
354 while(!m_States->empty())
356 FSAstate* pState = m_States->front();
357 m_States->pop_front();
358 while(!pState->m_PathList.empty())
360 delete pState->m_PathList.front();
361 pState->m_PathList.pop_front();
363 delete pState;
366 QMap<QString, FSAMorpheme*>::iterator i = m_morphemes.begin();
367 while (i != m_morphemes.end()) {
368 delete i.value();
369 ++i;
372 delete this->m_States;
373 delete this->m_Edges;
374 delete this->m_StartStates;
377 void FSA::AddSignature(CSignature* pSig)
379 //std::cout << "---- Signature " <<pSig->GetNumberOfStems()<<" "<< pSig->GetNumberOfAffixes()<< std::endl;
381 //add start state
382 FSAstate* pStart = new FSAstate(this);
383 pStart->setMaxDepth(0);
384 this->m_StartStates->push_back(pStart);
386 FSAstate* pMiddle = new FSAstate(this);
387 FSAstate* pEnd = new FSAstate(this);
390 CStringSurrogate ssStem;
391 FSAedge* pEdge1 = new FSAedge(this, pStart, pMiddle);
392 for (int i = 0; i < pSig->GetNumberOfStems(); i++)
394 CStem* pStem = pSig->GetStemPtrList()->at(i);
395 QString str = pStem->Display();
397 int corpusCount = pStem->GetCorpusCount();
398 Q_ASSERT(corpusCount>0);
399 if(str.size()>0)
401 //std::cout << "\tStem:"<<str.toStdString() <<"."<< corpusCount <<std::endl;
402 pEdge1->AddMorpheme( str,corpusCount); //TODO
406 FSAedge* pEdge2 = new FSAedge(this, pMiddle, pEnd);
407 for (int i = 0; i < pSig->GetNumberOfAffixes(); i++)
409 CSuffix* pSuffix= pSig->GetSuffixPtrList()->at(i);
410 QString suffixStr = pSuffix->Display();
411 int corpusCount = 0;
413 //for(int j=0;j<pEdge1->GetMorphemes()->size();j++)
414 FSAMorphemeList::iterator j_iter = pEdge1->GetMorphemes()->begin();
415 while(j_iter != pEdge1->GetMorphemes()->end())
417 FSAMorpheme *pMorph = *j_iter++;
418 int stemCount = pMorph->m_corpusCount;
419 Q_ASSERT(stemCount>0);
420 QString analyzedWord = pMorph->toStr();
421 if (suffixStr!="NULL")
422 analyzedWord.append(suffixStr);
424 CWordCollection* wc = this->m_lexicon->GetWords();
426 CStem* pStem = ( *wc^=analyzedWord);
427 Q_ASSERT(pStem);
429 corpusCount+=(pStem->GetCorpusCount());
432 if(suffixStr.size()>0)
434 //std::cout << "\t\tSuffix "<< suffixStr.toStdString() <<"."<<corpusCount<<std::endl;
435 pEdge2->AddMorpheme(suffixStr, corpusCount);
439 //----------------------------------------------------------------//
442 void FSA::AddState(FSAstate* pState)
444 m_States->push_back(pState);
447 void FSA::AddEdge(FSAedge* pEdge)
449 m_Edges->push_back(pEdge);
450 pEdge->GetFromState()->AddEdgeOut(pEdge);
451 pEdge->GetToState()->AddEdgeIn(pEdge);
454 namespace linguistica {
455 namespace fsa_helper {
457 struct contains_edge : std::unary_function<const FSAedgeList*, bool> {
458 FSAedge* const e;
459 contains_edge(FSAedge* edge) : e(edge) { }
460 bool operator()(const FSAedgeList* l) const
462 return std::find(l->begin(), l->end(), e) != l->end();
468 void FSA::RemoveEdge(FSAedge* pEdge)
470 using std::find_if;
472 // remove all morphemes from edge, this does bookkeeping at FSA level
473 // currently unused, TestFunc only removes edge when Morph count is zero
474 if (pEdge->GetMorphemes()->size() > 0)
476 FSAMorphemeList::iterator iter = pEdge->GetMorphemes()->begin();
477 while(iter != pEdge->GetMorphemes()->end())
478 pEdge->RemoveMorpheme(iter++);
481 this->m_Edges->remove(pEdge);
482 pEdge->GetFromState()->m_EdgesOut.remove(pEdge);
483 pEdge->GetToState()->m_EdgesIn.remove(pEdge);
485 //remove paths that contain this edge
486 // currently not used -- paths will have been cleared by ResetDisplay
487 const linguistica::fsa_helper::contains_edge contains_pEdge(pEdge);
489 FSApathList::iterator path_it = this->m_FSAPathList.begin();
490 const FSApathList::iterator paths_end = this->m_FSAPathList.end();
492 while ((path_it = find_if(path_it,paths_end,contains_pEdge)) != paths_end)
494 FSAedgeList* const pPath = *path_it;
495 path_it = this->m_FSAPathList.erase(path_it);
496 delete pPath;
498 // end -- remove paths
500 delete pEdge;
503 void FSA::FSATestFunc()
505 CSuffixCollection* lexSuffixes = this->m_lexicon->GetSuffixes();
506 lexSuffixes->Sort(COUNT);
507 CStemCollection* lexStems = this->m_lexicon->GetStems();
509 for (int suf_idx=0; suf_idx<lexSuffixes->GetCount(); ++suf_idx)
511 if(suf_idx>=TEST_FUNC_MAX) {
512 //std::cout << "BREAK" << std::endl;
513 break;
515 CSuffix* pSuffix = lexSuffixes->GetAtSort(suf_idx);
516 //std::cout << "Suffix:" << pSuffix->Display().toStdString() << std::endl;
517 QString suffixStr = pSuffix->Display();
519 if(suffixStr == "NULL") continue;
521 //make copy now, list will be altered in while-loop
522 FSAedgeList edges = FSAedgeList(*this->m_Edges);
524 FSAedgeList::iterator edge_it = edges.begin();
525 while(edge_it != edges.end())
527 FSAedge* pEdge = *edge_it;
528 ++edge_it; //increment now, pEdge may be deleted
530 FSAMorphemeList morphemesToReanalyze;
531 FSAMorphemeList * morphemes = pEdge->GetMorphemes();
532 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
533 foreach (FSAMorpheme* pMorph, *morphemes)
535 QString edgeStr = pMorph->toStr();
536 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
538 if (edgeStr.endsWith(suffixStr) && edgeStr!=suffixStr)
540 QString stemStr = edgeStr.left(edgeStr.length() - suffixStr.length());
542 CStem * pStem = (*lexStems^=stemStr);
543 //if stemStr in Lexicon
544 if(pStem)
546 //if stem takes this suffix
547 CSignature * pSig = pStem->GetSuffixSignature();
548 for (int k = 0;k<pSig->GetSuffixPtrList()->count();k++)
550 CSuffix* pSuffix= pSig->GetSuffixPtrList()->at(k);
551 if(pSuffix->Display() == suffixStr )
553 morphemesToReanalyze.push_back(pMorph);
554 break;
558 } //else skip this morpheme
559 }//edge-morphemes
560 if(!morphemesToReanalyze.empty()) {
561 if( morphemes->size()==morphemesToReanalyze.size() ) {
562 //just do series split
563 this->DoSeriesSplit(pEdge,suffixStr.length(),true);
564 } else {
565 FSAedge* pNewEdge = this->DoParallelSplit(pEdge,morphemesToReanalyze);
566 this->DoSeriesSplit(pNewEdge,suffixStr.length(),true);
569 }//edges
570 }//suffix morphemes
573 void FSA::AddPrefixes(CMiniLexicon* pMiniLexicon) //Should be Prefix MiniLex
575 CPrefixCollection* lexPrefixes = pMiniLexicon->GetPrefixes();
576 Q_ASSERT(lexPrefixes);
577 lexPrefixes->Sort(COUNT);
578 //CStemCollection* lexStems = this->m_lexicon->GetStems();
579 CStemCollection* lexStems = pMiniLexicon->GetStems();
581 for (int suf_idx=0; suf_idx<lexPrefixes->GetCount(); ++suf_idx)
583 //if(suf_idx>=TEST_FUNC_MAX) { //Not used for prefixes
584 //std::cout << "BREAK" << std::endl;
585 // break;
588 CPrefix* pPrefix = lexPrefixes->GetAtSort(suf_idx);
589 std::cout << "Prefix:" << pPrefix->Display().toStdString() << std::endl;
590 QString prefixStr = pPrefix->Display();
592 if(prefixStr == "NULL") continue;
594 //make copy now, list will be altered in while-loop
595 FSAedgeList edges = FSAedgeList(*this->m_Edges);
597 FSAedgeList::iterator edge_it = edges.begin();
598 while(edge_it != edges.end())
600 FSAedge* pEdge = *edge_it;
601 ++edge_it; //increment now, pEdge may be deleted
603 FSAMorphemeList morphemesToReanalyze;
604 FSAMorphemeList * morphemes = pEdge->GetMorphemes();
605 //std::cout << "At edge: " << join(*morphemes,QString(' ')).toStdString() << std::endl;
606 foreach (FSAMorpheme* pMorph, *morphemes)
608 QString edgeStr = pMorph->toStr();
609 //std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
611 if (edgeStr.startsWith(prefixStr) && edgeStr!=prefixStr)
613 QString stemStr = edgeStr.right(edgeStr.length() - prefixStr.length());
614 std::cout<<" Morpheme:"<<edgeStr.toStdString()<<std::endl;
616 CStem * pStem = (*lexStems^=stemStr);
617 //if stemStr in Lexicon
618 //if(pStem)
619 if(pStem && pStem->GetPrefixSignature())
621 //if stem takes this prefix
622 CSignature * pSig = pStem->GetPrefixSignature();
623 Q_ASSERT(pSig->GetPrefixPtrList()->count());
624 for (int k = 0;k<pSig->GetPrefixPtrList()->count();k++)
626 CPrefix* pPrefix= pSig->GetPrefixPtrList()->at(k);
627 if(pPrefix->Display() == prefixStr )
629 morphemesToReanalyze.push_back(pMorph);
630 break;
635 } //else skip this morpheme
637 }//edge-morphemes
638 if(!morphemesToReanalyze.empty()) {
639 if( morphemes->size()==morphemesToReanalyze.size() ) {
640 //just do series split
641 this->DoSeriesSplit(pEdge,prefixStr.length(),false);
642 } else {
643 FSAedge* pNewEdge = this->DoParallelSplit(pEdge,morphemesToReanalyze);
644 this->DoSeriesSplit(pNewEdge,prefixStr.length(),false);
647 }//edges
648 }//prefix morphemes
651 void FSA::DoSearch()
653 if(this->m_searched)
654 return;
656 this->m_searched = true;
658 // get 'depth' of FSA
659 this-> m_MaxPathLength=0;
660 FSAStateList queue;
661 //queue.append(this->m_StartState);
662 for ( FSAStateList::iterator i=this->m_StartStates->begin();
663 i != this->m_StartStates->end(); ++i)
665 FSAstate* pStartState=*i; //this->m_StartStates->at(i);
667 //add empty path
668 FSAedgeList* pPath = new FSAedgeList();
669 pStartState->addPath(pPath);
671 queue.push_back(pStartState);
673 while(!queue.empty())
675 //FSAstate* pState = queue.takeFirst();
676 FSAstate* pState = queue.front();
677 queue.pop_front();
679 FSAedgeList* edgeList = pState->GetEdgesOut();
681 //if at leaf, add paths to FSA and do not search outbound edges
682 if(edgeList->size() == 0)
684 //this->m_FSAPathList += *(pState->GetPathList());
685 for(FSApathList::iterator j=pState->GetPathList()->begin();
686 j != pState->GetPathList()->end(); ++j)
687 this->m_FSAPathList.push_back(*j);
689 else
691 //iter over outbound edges
692 for (FSAedgeList::iterator j=edgeList->begin(); j!=edgeList->end(); ++j)
694 FSAedge* pEdge = *j;
695 FSAstate* pChildState = pEdge->GetToState();
696 pChildState->m_DiscoverCount++;
698 //add paths to pChildState
699 for(FSApathList::iterator l=pState->GetPathList()->begin();
700 l != pState->GetPathList()->end(); ++l)
702 FSAedgeList* pPath = *l;
704 FSAedgeList* pNewPath = new FSAedgeList(*pPath);
705 pNewPath->push_back(pEdge);
706 pChildState->addPath(pNewPath);
709 int maxChildDepth = pChildState->getMaxDepth();
711 //set length of max path to child
712 if(maxChildDepth==-1 || pState->getMaxDepth()+1 > maxChildDepth)
714 pChildState->setMaxDepth(pState->getMaxDepth()+1);
715 maxChildDepth = pChildState->getMaxDepth();
718 if (maxChildDepth > m_MaxPathLength)
720 m_MaxPathLength=maxChildDepth;
723 Q_ASSERT(pChildState->m_DiscoverCount <= pChildState->GetEdgesIn()->size());
725 if(pChildState->m_DiscoverCount == pChildState->GetEdgesIn()->size())
726 queue.push_back(pChildState);
731 std::cout<< "FSA DL:"<<this->ComputeDL() << std::endl;
734 int FSA::GetRobustness(FSAedgeList * pPath)
736 QStringList pathWordsWorking;
737 int pathCharCount = 0;
738 FSAMorphemeList* pEdge0Morphemes = pPath->front()->GetMorphemes();
739 FSAMorphemeList::iterator j_iter = pEdge0Morphemes->begin();
740 //for(int j=0;j<pEdge0Morphemes->size();j++)
741 while(j_iter!=pEdge0Morphemes->end())
743 FSAMorpheme* pMorph = *j_iter++;
744 pathCharCount+=pMorph->toStr().length();
745 pathWordsWorking.push_back(pMorph->toStr());
748 FSAedgeList::iterator i = pPath->begin();
749 ++i; //skip first edge, already got these morphemes (above)
750 while( i != pPath->end() )
752 FSAedge* pEdge = *i++;
753 QStringList newWords;
754 FSAMorphemeList* mList = pEdge->GetMorphemes();
755 FSAMorphemeList::iterator j_iter = mList->begin();
756 //for(int j=0;j<mList->size();j++)
757 while (j_iter!=mList->end())
759 QString morph = (*j_iter++)->toStr();
760 //std::cout<< "At morpheme:"<< morph.toStdString() <<std::endl;
761 pathCharCount += morph.length();
762 for(int k=0;k<pathWordsWorking.size();k++)
764 QString newWord = pathWordsWorking.at(k) + morph;
765 //std::cout<< " created: "<< newWord.toStdString() <<std::endl;
766 newWords.append(newWord);
769 pathWordsWorking.clear();
770 for(int j=0;j<newWords.size();j++)
771 pathWordsWorking.append(newWords.at(j));
772 }//end edge
774 int charsReplacedCount = 0;
775 for(int i=0;i<pathWordsWorking.size();i++)
777 charsReplacedCount+=pathWordsWorking.at(i).length();
780 //std::cout<< " charsReplaced: "<< charsReplacedCount << " pathCharCount: "<< pathCharCount << std::endl;
781 return charsReplacedCount-pathCharCount;
784 void FSAstate::OutputXfst (QTextStream& outf)
786 FSAstate* pState=this;
787 if ( pState->GetEdgesIn()->size() == 0)//start state
789 outf << "\ndefine "<< this->getStateName() << " 0; "<<endl;
791 else
793 outf << "define " << this->getStateName() << " ";
794 if (pState->GetEdgesIn()->size()>1) outf << " [ ";
796 for (FSAedgeList::iterator j_iter=pState->GetEdgesIn()->begin();
797 j_iter != pState->GetEdgesIn()->end(); ++j_iter)
799 if (j_iter != pState->GetEdgesIn()->begin())
800 outf << "|";
801 FSAedge* pEdge = *j_iter;
802 outf << pEdge->GetFromState()->getStateName();
804 if (pEdge->GetMorphemes()->size()==1)
806 FSAMorpheme* pm = pEdge->GetMorphemes()->front();
807 const QString ss = pm->toStr();
808 if (pm->toStr()=="NULL")
809 outf << " 0 ";
810 else
811 outf << " {" << ss.toAscii() << "} ";
813 else
815 //outf << QString("S%1 ").arg(pEdge->GetFromState()->m_stateId);
816 outf << " [ ";
817 FSAMorphemeList::iterator k_iter = pEdge->GetMorphemes()->begin();
818 //for(int k=0;k<pEdge->GetMorphemes()->size();k++)
819 while(k_iter != pEdge->GetMorphemes()->end())
821 if (k_iter!=pEdge->GetMorphemes()->begin()) outf << "|";
822 FSAMorpheme* pm = *k_iter++;
823 const QString ss = pm->toStr();
824 if (pm->toStr()=="NULL")
825 outf << " 0 ";
826 else
827 outf << " {" << ss.toAscii() << "} ";
829 outf << " ] ";
832 if (pState->GetEdgesIn()->size()>1) outf << " ]";
833 outf << ";" << endl;
837 void FSAstate::SearchEdgeXfst(QTextStream& outf, std::set<FSAstate*>& discovered)
839 for(FSAedgeList::iterator i=this->GetEdgesIn()->begin();
840 i != this->GetEdgesIn()->end(); i++)
842 FSAstate* pStateIn = (*i)->GetFromState();
843 if(discovered.count(pStateIn)==0) //if state not discovered yet
844 pStateIn->SearchEdgeXfst(outf,discovered);
846 this->OutputXfst(outf);
849 void FSA::OutputFSAXfst ( const QString& filename )
851 QFile file( filename );
853 FSAStateList topoSortedStates;
854 std::set<FSAstate*> discovered;
856 //find states with no outbound edges
857 FSAStateList endStates;
858 for (FSAStateList::iterator i=this->GetStates()->begin();
859 i != this->GetStates()->end();++i)
861 if ((*i)->GetEdgesOut()->size()==0)
862 endStates.push_back(*i);
866 if( file.open( IO_WriteOnly ) )
868 QTextStream outf( &file ); //Should be ascii file, not unicode
870 outf << "# " << endl;
871 outf << "# File: " << filename << endl;
872 outf << "# " << endl;
874 //for (int i=0;i<this->GetStates()->size();i++)
875 for (FSAStateList::iterator i = endStates.begin();
876 i!=endStates.end(); ++i)
878 (*i)->SearchEdgeXfst(outf,discovered);
881 outf << endl;
882 //outf << "union net" << endl << endl;
883 //outf << "print words" << endl << endl;
884 for (FSAStateList::iterator i = endStates.begin();
885 i!=endStates.end(); ++i)
887 FSAstate* pState = *i;
888 outf << endl;
889 outf << "push " << pState->getStateName() << endl;
890 outf << "print words" << endl << endl;
891 outf << "pop stack" << endl << endl;
892 outf << endl;
895 file.close();
899 void FSA::FSAListDisplay(Q3ListView* pView,
900 QMap<class QString, class QString>* /* out filter not supported yet */,
901 bool /* “analyzed words only” flag for future use */)
903 pView->setRootIsDecorated( FALSE );
904 pView->setSorting(1);
905 // Remove all previous columns
906 while( pView->columns() ) pView->removeColumn( 0 );
907 pView->clear();
909 // get 'depth' of FSA
911 this->DoSearch();
913 pView->setSorting(0); //sort on 0th column
915 // Add Column headers
916 //std::cout<<"Graph Max: "<<m_MaxPathLength <<std::endl;
917 QStringList headers;
918 for (int i = 0;i<this->m_MaxPathLength;i++)
920 pView->addColumn( QString( "Edge %1" ).arg(i));
922 pView->addColumn( QString( "Robustness" ));
923 //m_pLexicon->GetDocument()->setStatusBar1( "Displaying FSA" );
925 //associate each start state with it's parent list view item (row).
926 // If start state only has one outbound path then there will
927 // be only one lv item, otherwise lv items/paths will be associated
928 // with a placeholder parent path
929 QMap<FSAstate*,FSAListViewItem*> lvMap;
930 //create placeholder lvItem for start states with >=1 path
931 for(FSApathList::iterator i=this->m_FSAPathList.begin();
932 i != this->m_FSAPathList.end(); ++i)
934 FSAedgeList * pPath= *i;
935 FSAstate * pStartState = pPath->front()->GetFromState();
936 if(!lvMap.contains(pStartState) && pStartState->GetEdgesOut()->size()>1)
938 FSAListViewItem * lvtop = new FSAListViewItem(pView,pPath);
939 lvtop->setVisible(TRUE);
940 lvtop->setOpen(TRUE);
941 lvtop->setText(0,"All paths: ");
942 lvMap.insert(pStartState,lvtop);
946 //store totaled robustness values for subgraph starting at each start state
947 QMap<FSAstate*,int> robustnessMap;
949 //Add Item for each start state
951 for (int i=0;i<this->m_StartStates->size();i++)
953 FSAstate* pStartState = m_StartStates->at(i);
955 FSAListViewItem * pStartStateItem= new FSAListViewItem(pView, this->m_FSAPathList.at(0)); //LEAK
956 pStartStateItem->setVisible(TRUE);
957 pStartStateItem->setOpen(TRUE);
959 lvMap.insert(pStartState,pStartStateItem);
962 //add every path as row to list view
963 for(FSApathList::iterator i=this->m_FSAPathList.begin();
964 i != this->m_FSAPathList.end(); ++i)
966 FSAedgeList* pPath = *i;
968 int robustness = FSA::GetRobustness(pPath);
969 QString robustnessStr = QString("%1").arg(robustness,16);
971 FSAListViewItem *lvItem;
972 //get parent list view item
973 FSAstate* pStartState=pPath->front()->GetFromState();
974 FSAListViewItem * pParent=lvMap.value(pStartState);
975 if(pParent==NULL)
977 lvItem = new FSAListViewItem(pView, pPath);
978 lvMap.insert(pStartState,lvItem);
980 else
982 lvItem = new FSAListViewItem(pParent,pPath);
984 int oldVal = robustnessMap.value(pStartState);
985 robustnessMap.insert(pStartState,robustness+oldVal); //score of lvItem,pPath
988 lvItem->setVisible(TRUE);
989 lvItem->setOpen(TRUE);
991 int col_idx=0;
992 // first column
993 for(FSAedgeList::iterator i=pPath->begin(); i!=pPath->end(); ++i)
995 FSAedge* pEdge = *i;
996 FSAMorphemeList* labelList = pEdge->GetMorphemes();
997 QString label = join(*labelList, QString(' '));
998 if(label.length() > 64)
999 label= QString("%1 ... %2 (%3)")
1000 .arg(labelList->front()->toStr())
1001 .arg(labelList->back()->toStr())
1002 .arg(labelList->size());
1004 //populate table
1005 lvItem->setText(col_idx,label+" ");
1007 //if this edge spans multiple columns,
1008 //then columns / use whitespace
1009 col_idx = pEdge->GetToState()->getMaxDepth();
1011 //iterate
1012 //FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
1013 //pEdge = ((edgeList->isEmpty())? NULL : edgeList->first());
1015 lvItem->setText(m_MaxPathLength,robustnessStr);
1018 QMap<FSAstate*,FSAListViewItem*>::iterator lvm_it = lvMap.begin();
1019 while (lvm_it != lvMap.end())
1021 FSAstate* pState = lvm_it.key();
1022 if(pState->GetEdgesOut()->size() > 1)
1024 int robVal = robustnessMap.value(pState);
1025 QString robustnessStr = QString("%1").arg(robVal,16);
1026 lvm_it.value()->setText(m_MaxPathLength,robustnessStr);
1029 lvm_it++;
1033 //--------------------------------------------------------//
1035 FSAListViewItem::~FSAListViewItem()
1037 if(m_pImage) delete m_pImage;
1040 QPixmap* FSAListViewItem::GetQPixmap()
1042 if (m_pImage == 0)
1043 build_pixmap();
1045 return m_pImage;
1048 #ifdef USE_GRAPHVIZ
1049 void FSAListViewItem::build_pixmap()
1051 // Init graph
1052 const char outputName[] = "tmp.png";
1053 QMap<FSAstate*,Agnode_t*> nodeMap;
1055 // generate new image
1056 GVC_t* gvc = gvContext();
1057 graph_t* g = agopen(const_cast<char*>("tmpgraph"), AGDIGRAPHSTRICT);
1059 // left-to-right orientation
1060 agraphattr(g, const_cast<char*>("rankdir"), const_cast<char*>(""));
1061 agset(g, const_cast<char*>("rankdir"), const_cast<char*>("LR"));
1063 /*QString& start_state_name = this->GetLVStartState()->m_stateLabel;
1064 if (start_state_name.isEmpty())
1065 //start_state_name = "S";
1066 start_state_name = this->GetLVStartState()->getStateName();*/
1068 // create new Agnode_t for start state
1069 nodeMap.insert(this->GetLVStartState(),
1070 agnode(g, this->GetLVStartState()->getStateName().toUtf8().data()));
1072 QList<FSAstate*> queue;
1073 queue.append(this->GetLVStartState());
1075 //now do search
1076 //int j=1;
1077 while (!queue.isEmpty()) {
1078 FSAstate* pCurState = queue.takeFirst();
1080 FSAedgeList& edges = *pCurState->GetEdgesOut();
1081 for (FSAedgeList::iterator i = edges.begin(); i != edges.end(); ++i) {
1082 FSAedge* pEdge = *i;
1083 FSAstate* pNextState = pEdge->GetToState();
1085 //label node, if does not already have a label
1086 /*if (pNextState->m_stateLabel.isEmpty())
1088 //QString("N%1").arg(j++);
1089 ++j;
1090 pNextState->m_stateLabel = QString("S%1").arg(pNextState->m_stateId);
1093 if (!nodeMap.contains(pNextState)) {
1094 //QString label_text = pNextState->m_stateLabel;
1095 QString label_text = pNextState->getStateName();
1096 nodeMap.insert(pNextState,
1097 agnode(g, label_text.toUtf8().data()));
1098 queue.append(pNextState);
1101 // make edge label
1102 FSAMorphemeList* labelList = pEdge->GetMorphemes();
1103 QString label = join(*labelList, QString(' '));
1104 if(label.length() > 64)
1105 label= QString("%1 ... %2 (%3)")
1106 .arg(labelList->front()->toStr())
1107 .arg(labelList->back()->toStr())
1108 .arg(labelList->size());
1110 // make edge
1111 Agedge_t* e1 = agedge(g, nodeMap.value(pCurState),
1112 nodeMap.value(pNextState));
1113 agedgeattr(g, const_cast<char*>("label"),
1114 const_cast<char*>(""));
1115 agset(e1, const_cast<char*>("label"),
1116 const_cast<char*>(label.toUtf8().data()));
1118 // make bold if:
1119 if (
1120 // start state has >=2 outbound paths
1121 GetLVStartState()->GetEdgesOut()->size() >= 2 &&
1122 // edge in path
1123 std::find(this->m_pPath->begin(),this->m_pPath->end(),pEdge)
1124 != this->m_pPath->end()
1127 agedgeattr(g, const_cast<char*>("style"),
1128 const_cast<char*>(""));
1129 agset(e1, const_cast<char*>("style"),
1130 const_cast<char*>("bold"));
1135 gvLayout(gvc, g, const_cast<char*>("dot"));
1136 gvRenderFilename(gvc, g, const_cast<char*>("png"),
1137 const_cast<char*>(outputName));
1139 // cleanup - free Layout
1140 gvFreeLayout(gvc, g);
1142 // free graph
1143 agclose(g);
1145 // close output file and free context
1146 gvFreeContext(gvc);
1148 // save image
1149 delete m_pImage;
1150 QPixmap * pp = new QPixmap();//"dog.jpg");//QPixmap*uoutputName);
1151 //std::cout<<" Null:"<<(pp->isNull()?"Yes":"No")<<std::endl;
1153 m_pImage = pp;
1155 #endif
1157 void FSAListViewItem::DisplayEdgePath(Q3TextEdit* m_commandLine)
1159 int i = 0;
1160 FSAedge* pEdge=this->m_pPath->front();//GetEdge();
1161 int tablength = 0; //FIX
1162 while( pEdge ){
1163 m_commandLine->insert( QString( "Edge %1:\n\n" ).arg(i++) );
1164 FSAMorphemeList* morphemes = pEdge->GetMorphemes();
1166 //display as table
1167 int nColumns = 5;
1168 FSAMorphemeList::iterator j_iter = morphemes->begin();
1169 //for (int j = 0; j < morphemes->size(); )
1170 while(j_iter!=morphemes->end())
1172 //read up to 5 morphemes and print to line
1173 for( int k=0; k < nColumns && j_iter != morphemes->end(); k++ )
1175 //QString text = morphemes->at(j++)->toStr();//.stripWhiteSpace();
1176 QString text = (*j_iter++)->toStr();//.stripWhiteSpace();
1177 m_commandLine->insert( text + "\t" );
1179 // Make tab length depend on the longest
1180 // word and font size
1181 int wordlength = text.length() * m_commandLine->pointSize();
1182 if( tablength < wordlength ) tablength = wordlength;
1184 m_commandLine->insert( "\n" );
1186 m_commandLine->insert( "\n" );
1188 //m_commandLine->insert(join(*morphemes, QString(' ')));
1189 FSAedgeList * edgeList = pEdge->GetToState()->GetEdgesOut();
1190 if(!edgeList->empty())
1191 pEdge=edgeList->front();
1192 else
1193 pEdge=NULL;
1194 m_commandLine->insert("\n\n");
1196 m_commandLine->setTabStopWidth( tablength );
1200 void FSA::ResetDisplay()
1202 this->m_searched=false;
1203 this->m_FSAPathList.clear();
1205 for (FSAStateList::iterator i=this->GetStates()->begin();
1206 i != this->GetStates()->end(); ++i)
1208 FSAstate* pState = *i;
1209 pState->GetPathList()->clear();
1210 //CHEAP!!
1211 pState->m_DiscoverCount=0;
1212 pState->m_stateLabel=QString("");
1218 double FSA::ComputeDL()
1220 double morphemesDL=0;
1222 QMap<QString,FSAMorpheme*>::iterator m_it;
1223 int mSum=0; // sum morphCounter
1224 for (m_it = this->m_morphemes.begin(); m_it!=m_morphemes.end(); m_it++)
1225 mSum += m_it.value()->m_corpusCount;
1227 QMap<QString,double> morphPtrDLMap;
1228 for (m_it = this->m_morphemes.begin(); m_it!=m_morphemes.end(); m_it++)
1230 FSAMorpheme* pMorph = m_it.value();
1232 morphemesDL += pMorph->GetDL(this->m_charCount);
1234 //lenght of pointer to morpheme
1235 double cnt = pMorph->m_corpusCount; // morphCounter.value(mi.key());
1236 double pDL = base2log(mSum/cnt);
1237 //std::cout << mi.key().toStdString() << " \tDL: " << pDL << std::endl;
1238 morphPtrDLMap.insert(pMorph->toStr(), pDL);
1241 std::cout << "morphemes DL: " << morphemesDL << std::endl;
1243 double allStatesDL=0;
1245 QMap<FSAedge*,double> edgePtrDLMap;
1246 for (FSAStateList::iterator i = this->m_States->begin();
1247 i != this->m_States->end(); ++i)
1249 FSAstate* pState = *i;
1250 double stateDL = 0;
1251 for (FSAedgeList::iterator j_iter=pState->m_EdgesOut.begin();
1252 j_iter != pState->m_EdgesOut.end(); ++j_iter)
1254 FSAedge* pEdge = *j_iter;
1255 int edgeWordCount = 0;
1257 for(FSAMorphemeList::iterator k_iter = pEdge->GetMorphemes()->begin();
1258 k_iter != pEdge->GetMorphemes()->end();
1259 k_iter++)
1261 FSAMorpheme* pMorph = *k_iter;
1262 const QString & morphStr = pMorph->toStr();
1263 edgeWordCount += pMorph->m_corpusCount;
1264 stateDL += morphPtrDLMap[morphStr];
1267 double edgePtrDL = base2log(mSum/edgeWordCount);
1268 stateDL+=edgePtrDL;
1269 edgePtrDLMap.insert(pEdge,edgePtrDL);
1271 allStatesDL+=stateDL;
1273 std::cout << "states DL: " << allStatesDL << std::endl;
1275 return morphemesDL + allStatesDL;
1278 /* This vn based on constructor, not used, using other vn based on TestFunc
1280 void FSA::AddPrefixes(CMiniLexicon* pMiniLexicon)
1282 //m_StartState->setMaxDepth(0);
1283 // this->AddState(m_StartState);
1285 //int MinimumNumberOfStemsForDisplay = 2;
1286 //if (pMiniLexicon->GetSignatures()->GetCount() < 20 ) MinimumNumberOfStemsForDisplay = 1;
1287 //int MinimumNumberOfAffixesForDisplay = 1; //make this adjustable by user @@@@
1289 for (int i = 0; i < pMiniLexicon->GetSignatures()->GetCount(); i++)
1291 CSignature* pSig = pMiniLexicon->GetSignatures()->GetAt(i);
1293 //if (pSig->GetNumberOfStems() < MinimumNumberOfStemsForDisplay ) continue;
1294 //if (pSig->Size() < MinimumNumberOfAffixesForDisplay ) continue;
1295 AddPrefixSignature(pSig);
1297 return;
1301 void FSA::AddPrefixSignature(CSignature* pSig)
1303 CStringSurrogate ssStem;
1304 //FSAedge* pEdge1 = new FSAedge(this, pStart, pMiddle);
1305 for (int i = 0; i < pSig->GetNumberOfStems(); i++)
1307 CStem* pStem = pSig->GetStemPtrList()->at(i);
1308 QString stemStr = pStem->Display();
1310 int corpusCount = pStem->GetCorpusCount();
1311 Q_ASSERT(corpusCount>0);
1312 if(stemStr.size()>0)
1314 std::cout << "\tStem:"<<stemStr.toStdString() <<"."<< corpusCount <<std::endl;
1315 //pEdge1->AddMorpheme( stmStrr,corpusCount); //TODO
1316 Q_ASSERT(pSig->GetNumberOfAffixes());
1317 for (int j = 0; j < pSig->GetNumberOfAffixes(); j++)
1319 CPrefix* pPrefix= pSig->GetPrefixPtrList()->at(j);
1320 QString prefixStr = pPrefix->Display();
1321 if(prefixStr=="NULL") continue; //skip for now
1323 QString analyzedWord = (prefixStr == "NULL" ? "":prefixStr)+stemStr;
1324 std::cout << "\t\tWord:"<<analyzedWord.toStdString() <<"."<< corpusCount <<std::endl;
1325 //QList<FSAedge*> edges= this->m_morphemeEdges.values(analyzedWord);
1326 foreach(FSAedge * pEdge,edges)
1328 FSAMorphemeList * pEdgeMorphemes = pEdge->GetMorphemes();
1329 FSAMorphemeList::iterator m_it = pEdgeMorphemes->begin();
1330 while(m_it != pEdge->GetMorphemes()->end())
1332 if((*m_it)->toStr() == analyzedWord) break;
1333 m_it++;
1335 FSAMorphemeList morphemesToReanalyze;
1336 morphemesToReanalyze.push_back(*m_it);
1337 FSAedge* pNewEdge = this->DoParallelSplit(pEdge,morphemesToReanalyze);
1338 this->DoSeriesSplit(pNewEdge,prefixStr.length(),false);
1344 } */