Got rid of some obsolete stuff from the glade files
[dasher.git] / Src / DasherCore / PinYinConversionHelper.cpp
blob 0477d386afaf6cdb285f9bae1ef955a46916697b
2 #ifndef _WIN32
3 #include "config.h"
4 #endif
6 #include "LanguageModelling/LanguageModel.h"
7 #include "LanguageModelling/PPMLanguageModel.h"
8 #include <ChiCEInterface.h>
9 #include <iostream>
10 #include <fstream>
11 #include "PinYinConversionHelper.h"
13 #include <SCENodeNew.h>
15 using namespace Dasher;
17 CPinYinConversionHelper::CPinYinConversionHelper(Dasher::CEventHandler *pEventHandler, CSettingsStore *pSettingsStore, Dasher::CAlphIO *pCAlphIO){
19 //TESTING FOR UTF-8 CHINESE IN C++ STRING
21 FILE *fp;
22 char* SimpChAlphabet = (char*)malloc(62);
23 fp = fopen("test.txt", "rw");
24 fread(SimpChAlphabet, 1, 62, fp);
25 fclose(fp);
28 const std::string SimpChAlphabet = "Chinese / 简体中文 (simplified chinese, in pin yin groups)";
31 Dasher::CAlphIO::AlphInfo oAlphInfo = pCAlphIO->GetInfo(SimpChAlphabet);
33 m_pAlphabet = new CAlphabet(oAlphInfo);
35 // std::cout<<SimpChAlphabet<<std::endl;
36 // for (int i = 0; i < 20; i++)
37 // std::cout<<m_pAlphabet->GetDisplayText(i)<<std::endl;
39 // TODO: Need to figure out what this does - it was previously set to true in PopulateChildren
41 CSymbolAlphabet alphabet(m_pAlphabet->GetNumberTextSymbols());
42 alphabet.SetSpaceSymbol(m_pAlphabet->GetSpaceSymbol()); // FIXME - is this right, or do we have to do some kind of translation?
43 alphabet.SetAlphabetPointer(m_pAlphabet); // Horrible hack, but ignore for now.
45 m_pLanguageModel = new CPPMLanguageModel(pEventHandler, pSettingsStore, alphabet);
48 //Train the PPM Model for Chinese Alphabet
49 TrainChPPM();
53 //Old Code
54 m_bTraceNeeded = true;//reset trace bool
56 //clears the process phrase flags
57 for(int i(0); i<MAX_HZ_NUM; i++)
58 m_bPhrasesProcessed[i]=0;
61 BuildDataBase();
62 CEInitialise();
66 bool CPinYinConversionHelper::Convert(const std::string &strSource, SCENode ** pRoot, int * childCount, int CMid) {
68 SCENodeNew *pConversionList;
69 int iHZCount;
71 if(CEConvert (strSource.c_str(), &pConversionList, &iHZCount, CMid)){
72 SCENodeNew *pHead(pConversionList);
74 std::vector<SCENodeNew *> vHeads;
76 while(pHead) {
77 vHeads.push_back(pHead);
78 pHead = pHead->pChild;
81 SCENode *pTail = NULL;
82 SCENode *pNextTail = NULL;
84 for(std::vector<SCENodeNew *>::reverse_iterator it(vHeads.rbegin()); it != vHeads.rend(); ++it) {
85 SCENodeNew *pCurrentNode(*it);
87 SCENode *pPreviousNode = NULL;
89 while(pCurrentNode) {
90 SCENode *pNewNode = new SCENode;
92 if(pTail)
93 pNewNode->SetChild(pTail);
95 if(pPreviousNode) {
96 pPreviousNode->SetNext(pNewNode);
98 else {
99 pNextTail = pNewNode;
100 pNextTail->Ref();
103 if(pPreviousNode)
104 pPreviousNode->Unref();
106 pPreviousNode = pNewNode;
107 pCurrentNode = pCurrentNode->pNext;
110 if(pPreviousNode)
111 pPreviousNode->Unref();
113 if(pTail)
114 pTail->Unref();
116 pTail = pNextTail;
120 *pRoot = pTail;
123 // // TODO: Now need to convert...
125 // *pRoot= pStart;
127 // // Connect up the rest of the nodes to make a lattice
128 // SCENode *pHead(pStart);
130 // while(pHead) {
131 // SCENode *pNewChild(pHead->GetChild());
132 // SCENode *pCurrent(pHead->GetNext());
134 // while(pCurrent) {
135 // pCurrent->SetChild(pNewChild);
136 // pCurrent = pCurrent->GetNext();
137 // }
139 // pHead = pHead->GetChild();
140 // }
142 return 1;
144 else{
145 *pRoot = 0;
146 return 0;
150 unsigned int CPinYinConversionHelper::GetSumPYProbs(Dasher::CLanguageModel::Context context, SCENode * pPYCandStart, int norm){
152 std::vector <unsigned int > Probs;
153 unsigned int sumProb=0;
155 m_pLanguageModel->GetProbs(context, Probs, norm);
157 SCENode * pCurrentNode = pPYCandStart;
159 while(pCurrentNode){
161 std::vector <symbol >Symbols;
162 std::string HZ = static_cast<std::string>(pCurrentNode->pszConversion);
163 // Distribute the remaining space evenly
165 m_pAlphabet->GetSymbols(&Symbols, &HZ, 0);
167 if(Symbols.size()!=0)
168 sumProb += Probs[Symbols[0]];
169 pCurrentNode = pCurrentNode->GetNext();
173 return sumProb;
176 void CPinYinConversionHelper::GetProbs(Dasher::CLanguageModel::Context context, std::vector < unsigned int >&Probs, int norm){
// Assigns a display size (NodeSize) to the nodes of the sibling list that
// starts at pStart and is walked with GetNext().
//
// Parameters
//   pStart        - first node of the list; also the node whose GetChild()
//                   list is passed to GetSumPYProbs().
//   context       - language-model context used for GetProbs() and cloned
//                   for every node that has a symbol.
//   normalization - total that the sizes are scaled to; also the starting
//                   value of iRemaining.
//   uniform       - divided by 1000 when computing uniform_add
//                   (presumably a per-mille fraction - TODO confirm).
//   iNChildren    - starting value of iLeft, the divisor used when the
//                   remainder is shared out in the last pass.
//
// Visible steps
//   1. nonuniform_norm is derived from normalization, uniform and the
//      alphabet's symbol count and is used as the norm for GetProbs().
//   2. First pass: pszConversion of each node is mapped to symbols; a node
//      with no symbol gets Symbol = -1, otherwise Symbol, SumPYProbStore and
//      the running total sumProb are updated.
//   3. Second pass: NodeSize is computed as
//      Probs[Symbol] * SumPYProbStore * normalization / sumProb, sizes below
//      1 are raised to 1 and iRemaining is reduced by NodeSize.
//   4. Third pass: iRemaining / iLeft is added to each node in turn.
//
// NOTE(review): this listing has lost its brace-only and comment-delimiter
// lines.  The exact nesting of the if/else in the second pass, and whether
// the std::cout diagnostic sections are live or were inside /* */, cannot
// be determined from here - confirm against the repository before changing
// any code in this function.
182 void CPinYinConversionHelper::AssignSizes(SCENode * pStart, CLanguageModel::Context context, long normalization, int uniform, int iNChildren){
184 SCENode *pNode = pStart;
186 std::vector <unsigned int > Probs;
188 int iSymbols = m_pAlphabet->GetNumberSymbols();
189 int iLeft(iNChildren);
190 int iRemaining(normalization);
192 int uniform_add;
193 int nonuniform_norm;
194 int control_space;
195 int iNorm = normalization;
198 //IGNORE CONTROL MODE FOR NOW
199 // if(!GetBoolParameter(BP_CONTROL_MODE)) {
200 control_space = 0;
// NOTE(review): both statements below use (iSymbols - 2); the division is by
// zero if the alphabet reports exactly two symbols.
201 uniform_add = ((iNorm * uniform) / 1000) / (iSymbols - 2); // Subtract 2 from no symbols to lose control/root nodes
202 nonuniform_norm = iNorm - (iSymbols - 2) * uniform_add;
203 // }
// The control-mode branch below has its matching `if` commented out above,
// so it was presumably inside a block comment in the original file.
205 else {
206 control_space = int (iNorm * 0.05);
207 uniform_add = (((iNorm - control_space) * uniform / 1000) / (iSymbols - 2)); // Subtract 2 from no symbols to lose control/root nodes
208 nonuniform_norm = iNorm - control_space - (iSymbols - 2) * uniform_add;
213 // context = m_pLanguageModel->CreateEmptyContext();
215 //Testing Code for PYCHelper GetPYSumProbs
// Diagnostic: prints GetSumPYProbs() for pStart and each node reached
// through pChild, using an empty context.
218 CLanguageModel::Context iContext = m_pLanguageModel->CreateEmptyContext();
220 SCENode * pTemp = pStart;
221 while(pTemp){
222 std::cout<<"test sum probs"<<GetSumPYProbs(iContext, pTemp, nonuniform_norm)<<std::endl;
223 std::cout<<"test norm"<<nonuniform_norm<<std::endl;
224 pTemp=pTemp->pChild;
// Probabilities of every symbol in the caller's context.
228 m_pLanguageModel->GetProbs(context, Probs, nonuniform_norm);
// Diagnostic: dumps the whole probability vector.
231 std::vector<unsigned int>::iterator it;
232 for(it = Probs.begin();it!=Probs.end(); it++)
233 std::cout<<*it<<",";
235 std::cout<<"end"<<std::endl;
238 // unsigned int sum;
241 unsigned long long int sumProb=0;
243 std::vector <symbol >Symbols;
244 std::string HZ;
245 CLanguageModel::Context iCurrentContext;
248 //std::cout<<"start"<<std::endl;
// First pass: resolve each node's symbol and accumulate the total weight.
249 while(pNode){
251 Symbols.clear();
252 HZ = static_cast<std::string>(pNode->pszConversion);
253 m_pAlphabet->GetSymbols(&Symbols, &HZ, 0);
255 if(Symbols.size()!=0){
256 pNode->Symbol = Symbols[0];
257 //sumProb += Probs[Symbols[0]];
// NOTE(review): the cloned context is not released anywhere in the visible
// code - check whether the language model expects a matching release call.
260 iCurrentContext=m_pLanguageModel->CloneContext(context);
261 m_pLanguageModel->EnterSymbol(iCurrentContext, pNode->Symbol);
// The child list examined is always pStart's, not pNode's.
263 if(pStart->GetChild()){
264 pNode->SumPYProbStore = GetSumPYProbs(iCurrentContext, pStart->GetChild(), nonuniform_norm);
265 //std::cout<<"sumpyprobstore"<<pNode->SumPYProbStore<<std::endl;
267 else
268 pNode->SumPYProbStore = 1;
270 sumProb += (Probs[pNode->Symbol]*(pNode->SumPYProbStore));
271 //std::cout<<"Probs[symbol]"<<Probs[Symbols[0]]<<std::endl;
272 //std::cout<<"sumProbs"<<sumProb<<std::endl;
274 else
// -1 marks a node whose conversion string produced no symbol.
275 pNode->Symbol = -1;
277 pNode = pNode->GetNext();
// Second pass: turn the weights into sizes.
282 pNode = pStart;
283 while(pNode){
// Symbols / HZ are recomputed here but not used afterwards in this pass;
// only pNode->Symbol from the first pass is read.
285 std::vector <symbol >Symbols;
286 std::string HZ = static_cast<std::string>(pNode->pszConversion);
289 m_pAlphabet->GetSymbols(&Symbols, &HZ, 0);
292 if(pNode->Symbol!=-1){
293 if(sumProb!=0){
295 // iCurrentContext=m_pLanguageModel->CloneContext(context);
296 //m_pLanguageModel->EnterSymbol(iCurrentContext, pNode->Symbol);
// The cast widens the product to 64 bits before multiplying by normalization.
299 pNode->NodeSize =static_cast<unsigned long long int>(Probs[pNode->Symbol])*(pNode->SumPYProbStore)*normalization/sumProb;
// Diagnostic output for the node just sized.
302 std::cout<<"HZ"<<pNode->pszConversion<<std::endl;
303 std::cout<<"Probs"<<Probs[pNode->Symbol]<<std::endl;
304 std::cout<<"SumProbStore"<<pNode->SumPYProbStore<<std::endl;
305 std::cout<<"above"<<Probs[pNode->Symbol]*(pNode->SumPYProbStore)<<std::endl;
306 std::cout<<"sumprob"<<sumProb<<std::endl;
307 std::cout<<"nodesize"<<pNode->NodeSize<<std::endl;
311 else{
312 pNode->NodeSize = 0;//hopefully this will be not be displayed
// Sizes below 1 are raised to 1.
315 if(pNode->NodeSize < 1)
316 pNode->NodeSize = 1;
318 iRemaining -= pNode->NodeSize;
320 pNode = pNode->GetNext();
// Third pass: share out whatever is left of the normalisation total.
323 pNode = pStart;
325 while(pNode){
// NOTE(review): divides by iLeft, which starts at iNChildren and is
// decremented per node; a list longer than iNChildren would divide by zero.
327 int iDiff(iRemaining / iLeft);
329 pNode->NodeSize += iDiff;
331 iRemaining -= iDiff;
332 --iLeft;
333 pNode = pNode->GetNext();
// Diagnostic: two dumps of the final sizes, walking the list through pNext.
336 pNode = pStart;
337 while(pNode){
338 std::cout<<"size"<<pNode->NodeSize<<std::endl;
339 pNode = pNode ->pNext;
343 pNode = pStart;
344 while(pNode){
345 std::cout<<pNode->NodeSize<<",";
346 pNode = pNode->pNext;
349 std::cout<<std::endl;
352 //std::cout<<catStr<<std::endl;
354 //for(int i=0; i<Symbols.size(); i++)
355 // std::cout<<Symbols[i]<<",";
357 //std::cout<<std::endl;
359 //for(int i=0; i<Symbols.size();i++)
360 // std::cout<<m_pAlphabet->GetText(Symbols[i]);
362 //std::cout<<std::endl;
364 // pSizes[i] = m_pNCManager->GetLongParameter(LP_NORMALIZATION)*(100+5*freq[i])/(100*iNChildren+5*totalFreq);
368 bool CPinYinConversionHelper::GetPhraseList(int HZIndex, SCENode ** psOutput, int CMid){
369 // SCENode * pStart;
371 // if(CEGetPhraseList(HZIndex, &pStart, CMid)){
373 // *psOutput= pStart;
375 // return 1;
376 // }
377 // else{
378 // *psOutput = 0;
379 // return 0;
380 // }
382 return 0;
385 void CPinYinConversionHelper::BuildDataBase(){
387 std::vector<std::vector<std::vector<std::vector<std::vector<int> > > > >IndexStack;
388 std::vector<std::vector<std::vector<std::vector<int> > > >CandStack;
389 std::vector<std::vector<std::vector<int> > > subIndexStack;
390 std::vector<std::vector<int> > subCandStack;
391 std::vector<int> cell;
393 vContextData.clear();
394 cell.push_back(0);
395 subCandStack.push_back(cell);
398 for(int i(0); i<MAX_HZ_NUM; i++){
399 for(int j(0); j<MAX_CARE_CAND; j++){
400 for(int k(0); k<i+1; k++)
401 subIndexStack.push_back(subCandStack);
402 CandStack.push_back(subIndexStack);
403 subIndexStack.clear();
405 IndexStack.push_back(CandStack);
406 CandStack.clear();
409 for(int k(0); k<MAX_CM_NUM; k++)
410 vContextData.push_back(IndexStack);
414 void CPinYinConversionHelper::ClearData(int CMid){
416 std::vector<int> cell;
417 cell.push_back(0);
419 for(int i(0); i<MAX_HZ_NUM; i++){
420 for(int j(0); j<MAX_CARE_CAND/*(m_pRoot[i]->pChild->IsHeadAndCandNum)*/; j++){
421 for(int k(0); k<i+1; k++){
422 vContextData[CMid][i][j][k].clear();
423 vContextData[CMid][i][j][k].push_back(cell);
431 std::vector<std::vector<std::vector<std::vector<std::vector<int> > > > > * CPinYinConversionHelper::GetDP(int CMid){
433 return &vContextData[CMid];
439 //THIS FUNCTION IS CALLED WHEN A SET OF CHILDREN IS BEING POPULATED
440 //AND BEFORE CALCULATING EACH NODE'S SCORE. THE POSITION IN THE
441 //SENTENCE IS GIVEN TO THE PY HELPER->LIBRARY AND A LIST OF PHRASES
442 //CORRESPONDING TO THE CHARACTER IS RETURNED. THESE PHRASES ARE
443 //PROCESSED INTO THE CONTEXT DATA IN PY HELPER
445 void CPinYinConversionHelper::ProcessPhrase(HZIDX HZIndex){
447 SCENode * pPhraseList;
448 SCENode * pNode;
450 // bool stop=0;
452 int iIdx(0);
453 int i;
454 int score[m_iHZCount-HZIndex];
456 CANDIDX CandIndex[m_iHZCount-HZIndex]; //list to store candidates
457 //returned from HZlookup,
458 //used to allocate data
459 std::string strtemp;
461 std::vector<int> cell;
463 if(!(GetPhraseList(HZIndex, &pPhraseList, m_iCMID)))
464 return;
466 if(pPhraseList->AcCharCount>4)
467 pNode = pPhraseList->GetNext();
468 else
469 pNode = pPhraseList;
471 while((pNode)&&(iIdx<=MAX_CARE_PHRASE)){
473 //this section needs research. What scores would be a good estimate.
475 switch(pNode->AcCharCount){
476 case 2:
477 score[0] = 2;
478 score[1] = 3;
479 break;
480 case 3:
481 score[0] = 3;
482 score[1] = 4;
483 score[2] = 5;
484 break;
485 case 4:
486 score[0] = 4;
487 score[1] = 5;
488 score[2] = 6;
489 score[3] = 7;
490 break;
491 default:
492 for(int j(0); j< m_iHZCount-HZIndex; j++)
493 score[j] = 5+j;
494 break;
498 for(i=0 ; (i<pNode->AcCharCount); i++){
499 strtemp=pNode->pszConversion;
501 //TESTING
502 //std::cout<<"accharcount"<<pNode->AcCharCount<<std::endl;
503 //std::cout<<"the cut string is"<<strtemp.substr(3*i,3)<<std::endl;
504 //std::cout<<"list to look from
505 //is"<<m_pRoot[HZIndex+i]->pChild->pszConversion<<std::endl;
508 CandIndex[i] = HZLookup(HZIndex+i, strtemp.substr(3*i, 3));
510 //TESTING
511 //std::cout<<"the lookup is"<<CandIndex[i]<<std::endl;
513 if(CandIndex[i]==-1)
514 break;
515 else{
516 cell.push_back(score[i]);
517 for(int j(0); j< i; j++)
518 cell.push_back(CandIndex[i-j-1]);
521 //say the phrase is XYZ(this) push back in each cell in the
522 //order: score, Z, Y, X so as to match with vTrace in
523 //calculatescore
525 if(!(HZIndex + i> MAX_HZ_NUM -1))
526 (*(GetDP(m_iCMID)))[HZIndex +i][CandIndex[i]][HZIndex].push_back(cell);
527 cell.clear();
532 pNode = pNode ->GetNext();
533 iIdx ++;
535 m_bPhrasesProcessed[HZIndex]=1;
540 CANDIDX CPinYinConversionHelper::HZLookup(HZIDX HZIndex, const std::string &strSource){
543 // this was done before candindex was put into node member, change
544 // if have time
546 // int iIdx(0);
547 if((HZIndex > m_iHZCount-1)||strSource.size()!=3)
548 return -1;
550 // TODO: Reimplement -----
552 // SCENode * pNode = m_pRoot[HZIndex]->pChild;
554 // while(pNode&&(iIdx<=MAX_CARE_CAND)){
556 // if(strSource== pNode->pszConversion)
557 // return iIdx;
558 // pNode = pNode->pNext;
559 // iIdx++;
560 // }
562 // -----
564 return -1;
567 //CALCULATES SCORE OF A CERTAIN CANDIDATE HZ CHARACTER NODE TO BE
568 //POPULATED, FROM THE CONTEXT DATA IN PY HELPER. FINDS VTRACE TO MATCH
569 //CONTEXT SEQUENCE STORED IN THE LAST LEVEL OF DATABASE
571 int CPinYinConversionHelper::CalculateScore(CDasherNode * pNode, CANDIDX CandIndex){
572 CDasherNode *pIterateDNode(pNode);
573 SCENode *pTemp;
575 HZIDX HZIndex;
576 int score=0;
578 bool addtick=1; //bool to signal add score
582 //THIS SECTION IS TO FIND VTRACE, IN THE SAME WAY AS FINDING INPUT
583 //PY STRING
585 if(m_bTraceNeeded){
586 vTrace.clear();
588 while(pIterateDNode&&(pIterateDNode->m_pNodeManager->GetID() == 2)) {
590 pTemp=static_cast<SCENode*>(pIterateDNode->m_pUserData);
592 if(!pTemp)
593 pIterateDNode=0;
594 else{
595 vTrace.push_back(pTemp->CandIndex);
596 pIterateDNode= pIterateDNode->Parent();
600 //THE FOLLOWING IS TESTING FOR VTRACE
601 // if(vTrace.size()!=0){
602 // std::cout<<"signal"<<std::endl;
603 // for(std::vector<int>::iterator it(vTrace.begin());it!=vTrace.end();it++)
604 // std::cout<< *it <<std::endl;
608 m_bTraceNeeded= false;
613 if(CandIndex>=MAX_CARE_CAND)
614 return 0;
618 pTemp=static_cast<SCENode*>(pNode->m_pUserData);
620 if(pTemp)
621 HZIndex = pTemp->AcCharCount;
622 else
623 HZIndex = 0;
626 //THE DATA IS CONSTRUCTED OF UNITS OF SINGLE CELLS STORING CONTEXT SEQUENCE
627 //AND A CORRESPONDING SCORE
629 //LEVEL 1 : HZ INDEX : NUMBER OF CHARACTERS CONVERTED
630 //LEVEL 2 : CAND INDEX : NUMBER OF CANDIDATES WITH EACH POSITION
631 //LEVEL 3 : SUB HZ INDEX :(COULD BE REDUNDANT) CORRESPONDES TO
632 // INDEX OF CHARACTERS WHICH WERE CONSISTED IN PHRASES
633 //LEVEL 4 : SUB CAND INDEX : WHICH CAND WAS IN THE PHRASE
634 //LEVLE 5 : CELL: STORING PHASES IN HZ INDEX AND ASSIGNED SCORE
635 // IN THE WAY: 1.SCORE 2.Z 3.Y 4.X FOR PHRASE XYZ
636 // PREVIOUSLY PROCESSED
638 for(std::vector<std::vector<std::vector<int> > >::iterator itIndex((*(GetDP(m_iCMID)))[HZIndex][CandIndex].begin()); itIndex!=(*(GetDP(m_iCMID)))[HZIndex][CandIndex].end();itIndex++){
640 for(std::vector<std::vector<int> >::iterator itCand(itIndex->begin()); itCand!=itIndex->end(); itCand++){
642 //IF HAS LEFT CONTEXT INFORMATION, MATCH VTRACE WITH CELL
643 //POSITION 1
645 if((*itCand).size() !=1){
646 for(unsigned int i(0); i<(*itCand).size()-1; i++)
647 if((*itCand)[i+1]!=vTrace[i]){
648 addtick = 0;
649 break;
651 if(addtick)
652 score+=(*itCand)[0];
653 addtick =1;
655 else
656 score+=(*itCand)[0];
660 return score;
664 void CPinYinConversionHelper::TrainChPPM(){
666 for(int i =0; i<10;i++)
667 ProcessFile(i);
671 void CPinYinConversionHelper::ProcessFile(int index){
674 CLanguageModel::Context trainContext;
675 trainContext = m_pLanguageModel->CreateEmptyContext();
677 FILE * fp;
679 char strPath[200];
681 const char* Alph="ABCDEFGHJKLMNPR";
683 char str[4];
684 std::string HZ;
685 // int i, j, iLen;
689 long pos=0;
690 char cget;
692 int trialcount=0;
695 strcpy (strPath, (char *) getenv ("HOME"));
696 strcat (strPath, "/training/corpus/character/");
697 strcat (strPath, "C");
698 strncat (strPath, Alph+index, 1);
699 strcat (strPath, ".txt");
701 printf("strPath is %s\n", strPath);
703 fp = fopen (strPath, "rb");
705 if (!fp)
706 printf("cannot open file or incorrect directory\n");
708 while(!feof(fp)){
709 pos = ftell(fp);
710 cget=fgetc(fp);
712 // printf("OXE$4= %d\n", (unsigned char)0xE4); **228**
713 // printf("OXE9= %d\n", (unsigned char)0xE9); **233**
714 //printf("BEFORE print the integer code for unsigned char %d\n", (unsigned char) cget);
715 while (((unsigned char)cget < (unsigned char) 0xE4 || (unsigned char) cget > (unsigned char) 0xE9)&&!feof(fp)){
717 if((unsigned char) cget > (unsigned char) 0xE9){
719 fseek(fp, pos, SEEK_SET);
720 fread(str, sizeof(char)*3, 1, fp);
721 str[3]='\0';
722 pos = ftell(fp);
723 cget = fgetc(fp);
724 //printf("UNICODE SYMBOL/NUMBER |%s|\n", str);
727 else if(cget == 32){
728 pos = ftell(fp);
729 cget=fgetc(fp);
730 //fputc(32, op);
732 else if(cget ==10){
733 pos = ftell(fp);
734 cget=fgetc(fp);
735 //fputc(10, op);
737 else{
738 //printf("NON-UNICODE character |%c|\n", cget);
740 pos = ftell(fp);
741 cget=fgetc(fp);
742 trialcount ++;
746 if(!feof(fp)){
748 fseek(fp, pos, SEEK_SET);
749 fread(str, sizeof(char)*3, 1, fp);
750 str[3]='\0';
751 //printf("HZ |%s|\n",str);
752 // printf("first byte %d ", (unsigned char)str[0]);
753 // printf("seconde byte %d\n", (unsigned char)str[1]);
754 // printf("third byte %d\n", (unsigned char)str[2]);
756 HZ = static_cast<std::string>(str);
758 // std::cout<<"HZ is "<<HZ<<std::endl;
760 std::vector<symbol> Sym;
761 m_pAlphabet->GetSymbols(&Sym, &HZ, 0);
763 if(Sym.size()!=0)
764 m_pLanguageModel->LearnSymbol(trainContext, Sym[0]);
765 else
766 std::cout<<HZ<<"not found!"<<std::endl;
786 while(!feof(fp)){
787 // pos = ftell(fp);
788 cget=fgetc(fp);
790 if(!feof(fp)){
791 fread(str, sizeof(char)*3, 1, fp);
792 str[3]='\0';
793 printf("HZ |%s|\n",str);
794 // printf("first byte %d ", (unsigned char)str[0]);
795 // printf("seconde byte %d\n", (unsigned char)str[1]);
796 // printf("third byte %d\n", (unsigned char)str[2]);
798 HZ = static_cast<std::string>(str);
800 std::cout<<"HZ is "<<HZ<<std::endl;
802 std::vector<symbol> Sym;
803 m_pAlphabet->GetSymbols(&Sym, &HZ, 0);
805 if(Sym.size()!=0)
806 m_pLanguageModel->LearnSymbol(trainContext, Sym[0]);
807 //else
808 // std::cout<<"not found!"<<std::endl;