6 #include "LanguageModelling/LanguageModel.h"
7 #include "LanguageModelling/PPMLanguageModel.h"
8 #include <ChiCEInterface.h>
11 #include "PinYinConversionHelper.h"
13 #include <SCENodeNew.h>
15 using namespace Dasher
;
// Constructor. From the statements visible in this listing it:
//  - asks pCAlphIO for the AlphInfo of the alphabet named by SimpChAlphabet
//    and builds m_pAlphabet (a new CAlphabet) from it;
//  - builds a local CSymbolAlphabet sized by
//    m_pAlphabet->GetNumberTextSymbols(), copies the space symbol into it and
//    gives it a pointer back to m_pAlphabet;
//  - creates m_pLanguageModel as a new CPPMLanguageModel from pEventHandler,
//    pSettingsStore and that CSymbolAlphabet;
//  - sets m_bTraceNeeded to true and zeroes m_bPhrasesProcessed[0..MAX_HZ_NUM).
// NOTE(review): the line numbers embedded in this listing skip, so statements
// may be missing here; confirm against the complete file.
// NOTE(review): m_pAlphabet and m_pLanguageModel are allocated with new; no
// matching delete is visible in this listing.
17 CPinYinConversionHelper::CPinYinConversionHelper(Dasher::CEventHandler
*pEventHandler
, CSettingsStore
*pSettingsStore
, Dasher::CAlphIO
*pCAlphIO
){
19 //TESTING FOR UTF-8 CHINESE IN C++ STRING
// NOTE(review): SimpChAlphabet is declared twice (char* here, std::string
// below), so this malloc/fopen/fread experiment is presumably disabled in the
// full file - confirm. "rw" is not one of the standard fopen mode strings.
22 char* SimpChAlphabet = (char*)malloc(62);
23 fp = fopen("test.txt", "rw");
24 fread(SimpChAlphabet, 1, 62, fp);
// Name of the alphabet passed to CAlphIO::GetInfo.
28 const std::string SimpChAlphabet
= "Chinese / 简体中文 (simplified chinese, in pin yin groups)";
31 Dasher::CAlphIO::AlphInfo oAlphInfo
= pCAlphIO
->GetInfo(SimpChAlphabet
);
33 m_pAlphabet
= new CAlphabet(oAlphInfo
);
35 // std::cout<<SimpChAlphabet<<std::endl;
36 // for (int i = 0; i < 20; i++)
37 // std::cout<<m_pAlphabet->GetDisplayText(i)<<std::endl;
39 // TODO: Need to figure out what this does - it was previously set to true in PopulateChildren
41 CSymbolAlphabet
alphabet(m_pAlphabet
->GetNumberTextSymbols());
42 alphabet
.SetSpaceSymbol(m_pAlphabet
->GetSpaceSymbol()); // FIXME - is this right, or do we have to do some kind of translation?
43 alphabet
.SetAlphabetPointer(m_pAlphabet
); // Horrible hack, but ignore for now.
45 m_pLanguageModel
= new CPPMLanguageModel(pEventHandler
, pSettingsStore
, alphabet
);
48 //Train the PPM Model for Chinese Alphabet
54 m_bTraceNeeded
= true;//reset trace bool
56 //clears the process phrase flags
57 for(int i(0); i
<MAX_HZ_NUM
; i
++)
58 m_bPhrasesProcessed
[i
]=0;
// Convert: passes strSource (as a C string) and CMid to CEConvert, which fills
// pConversionList and iHZCount. When CEConvert returns a true value, the
// resulting SCENodeNew structure is copied into newly allocated SCENode
// objects:
//  - head nodes are pushed onto vHeads while following pChild;
//  - vHeads is then walked in reverse (rbegin -> rend);
//  - for each SCENodeNew reached via pNext, a new SCENode receives the fields
//    pszConversion, IsHeadAndCandNum, CandIndex, Symbol, IsComplete,
//    AcCharCount, NodeSize, HZFreq and HZProb, gets pTail as its child, and is
//    chained to the previous copy with SetNext;
//  - pPreviousNode->Unref() is called as copies are handed on.
// NOTE(review): the loop and branch lines around these statements (and the
// return statements, and any use of pRoot / childCount) are not visible in
// this listing, so the exact control flow must be confirmed against the
// complete file. In particular, confirm pPreviousNode is checked for NULL
// before SetNext/Unref, since it is initialised to NULL here.
66 bool CPinYinConversionHelper::Convert(const std::string
&strSource
, SCENode
** pRoot
, int * childCount
, int CMid
) {
68 SCENodeNew
*pConversionList
;
71 if(CEConvert (strSource
.c_str(), &pConversionList
, &iHZCount
, CMid
)){
72 SCENodeNew
*pHead(pConversionList
);
74 std::vector
<SCENodeNew
*> vHeads
;
77 vHeads
.push_back(pHead
);
78 pHead
= pHead
->pChild
;
81 SCENode
*pTail
= NULL
;
82 SCENode
*pNextTail
= NULL
;
84 for(std::vector
<SCENodeNew
*>::reverse_iterator
it(vHeads
.rbegin()); it
!= vHeads
.rend(); ++it
) {
85 SCENodeNew
*pCurrentNode(*it
);
87 SCENode
*pPreviousNode
= NULL
;
90 SCENode
*pNewNode
= new SCENode
;
// Field-by-field copy from the SCENodeNew into the new SCENode.
92 pNewNode
->pszConversion
= pCurrentNode
->pszConversion
;
93 pNewNode
->IsHeadAndCandNum
= pCurrentNode
->IsHeadAndCandNum
;
94 pNewNode
->CandIndex
= pCurrentNode
->CandIndex
;
95 pNewNode
->Symbol
= pCurrentNode
->Symbol
;
96 // pNewNode->SumPYProbStore = pCurrentNode->SumPYProbStore;
97 pNewNode
->IsComplete
= pCurrentNode
->IsComplete
;
98 pNewNode
->AcCharCount
= pCurrentNode
->AcCharCount
; /*accumulative character count*/
99 pNewNode
->NodeSize
= pCurrentNode
->NodeSize
;
100 pNewNode
->HZFreq
= pCurrentNode
->HZFreq
;
101 pNewNode
->HZProb
= pCurrentNode
->HZProb
;
104 pNewNode
->SetChild(pTail
);
107 pPreviousNode
->SetNext(pNewNode
);
110 pNextTail
= pNewNode
;
115 pPreviousNode
->Unref();
117 pPreviousNode
= pNewNode
;
118 pCurrentNode
= pCurrentNode
->pNext
;
122 pPreviousNode
->Unref();
134 // // TODO: Now need to convert...
138 // // Connect up the rest of the nodes to make a lattice
139 // SCENode *pHead(pStart);
142 // SCENode *pNewChild(pHead->GetChild());
143 // SCENode *pCurrent(pHead->GetNext());
146 // pCurrent->SetChild(pNewChild);
147 // pCurrent = pCurrent->GetNext();
150 // pHead = pHead->GetChild();
// GetSumPYProbs: fills Probs via m_pLanguageModel->GetProbs(context, Probs,
// norm), then walks the candidate list starting at pPYCandStart (advancing
// with GetNext()). Each node's pszConversion is converted to symbols with
// m_pAlphabet->GetSymbols; when at least one symbol results, Probs[Symbols[0]]
// is added to sumProb.
// NOTE(review): the loop header and the return statement are not visible in
// this listing; presumably the function returns sumProb - confirm.
161 unsigned int CPinYinConversionHelper::GetSumPYProbs(Dasher::CLanguageModel::Context context
, SCENode
* pPYCandStart
, int norm
){
163 std::vector
<unsigned int > Probs
;
164 unsigned int sumProb
=0;
166 m_pLanguageModel
->GetProbs(context
, Probs
, norm
);
168 SCENode
* pCurrentNode
= pPYCandStart
;
172 std::vector
<symbol
>Symbols
;
173 std::string HZ
= static_cast<std::string
>(pCurrentNode
->pszConversion
);
174 // Distribute the remaining space evenly
176 m_pAlphabet
->GetSymbols(&Symbols
, &HZ
, 0);
// Only the first symbol of the conversion string contributes to the sum.
178 if(Symbols
.size()!=0)
179 sumProb
+= Probs
[Symbols
[0]];
180 pCurrentNode
= pCurrentNode
->GetNext();
// GetProbs: takes a language-model context, an output vector Probs and a
// normalisation value.
// NOTE(review): only the signature is visible in this listing; the body (if
// any) must be checked in the complete file.
187 void CPinYinConversionHelper::GetProbs(Dasher::CLanguageModel::Context context
, std::vector
< unsigned int >&Probs
, int norm
){
// AssignSizes: assigns NodeSize to each node of the sibling list starting at
// pStart (advanced with GetNext()). Steps visible in this listing:
//  1. uniform_add and nonuniform_norm are derived from normalization, uniform
//     (in parts per 1000) and the alphabet's symbol count minus 2.
//  2. Probs is filled by m_pLanguageModel->GetProbs(context, Probs,
//     nonuniform_norm).
//  3. First pass: each node's pszConversion is converted to symbols; if any
//     exist, Symbol is set to the first one, a clone of context has that
//     symbol entered, and SumPYProbStore is set to GetSumPYProbs over
//     pStart->GetChild() when that child exists (the listing also shows an
//     assignment of 1, presumably the fallback). Probs[Symbol]*SumPYProbStore
//     is accumulated into sumProb.
//  4. Second pass: for nodes with Symbol != -1, NodeSize is
//     Probs[Symbol]*SumPYProbStore*normalization/sumProb; a NodeSize = 0
//     assignment is also visible (presumably the other branch). NodeSize is
//     subtracted from iRemaining.
//  5. iRemaining / iLeft is added to NodeSize of nodes in a further pass.
// NOTE(review): loop/branch lines and several declarations are not visible
// here (the embedded line numbers skip); confirm control flow in the complete
// file.
// NOTE(review): no guard against sumProb == 0 or iLeft == 0 is visible before
// the divisions; no release of the contexts made by CloneContext /
// CreateEmptyContext is visible either.
// NOTE(review): step 3 calls GetChild() on pStart rather than on the node
// being visited - confirm that is intended.
193 void CPinYinConversionHelper::AssignSizes(SCENode
* pStart
, Dasher::CLanguageModel::Context context
, long normalization
, int uniform
, int iNChildren
){
195 SCENode
*pNode
= pStart
;
197 std::vector
<unsigned int > Probs
;
199 int iSymbols
= m_pAlphabet
->GetNumberSymbols();
200 int iLeft(iNChildren
);
201 int iRemaining(normalization
);
206 int iNorm
= normalization
;
209 //IGNORE CONTROL MODE FOR NOW
210 // if(!GetBoolParameter(BP_CONTROL_MODE)) {
212 uniform_add
= ((iNorm
* uniform
) / 1000) / (iSymbols
- 2); // Subtract 2 from no symbols to lose control/root nodes
213 nonuniform_norm
= iNorm
- (iSymbols
- 2) * uniform_add
;
// NOTE(review): the next three statements reserve 5% of iNorm as
// control_space and recompute the two values above; given the
// "IGNORE CONTROL MODE" remark they are presumably disabled - confirm.
217 control_space = int (iNorm * 0.05);
218 uniform_add = (((iNorm - control_space) * uniform / 1000) / (iSymbols - 2)); // Subtract 2 from no symbols to lose control/root nodes
219 nonuniform_norm = iNorm - control_space - (iSymbols - 2) * uniform_add;
224 // context = m_pLanguageModel->CreateEmptyContext();
226 //Testing Code for PYCHelper GetPYSumProbs
229 CLanguageModel::Context iContext = m_pLanguageModel->CreateEmptyContext();
231 SCENode * pTemp = pStart;
233 std::cout<<"test sum probs"<<GetSumPYProbs(iContext, pTemp, nonuniform_norm)<<std::endl;
234 std::cout<<"test norm"<<nonuniform_norm<<std::endl;
239 m_pLanguageModel
->GetProbs(context
, Probs
, nonuniform_norm
);
242 std::vector<unsigned int>::iterator it;
243 for(it = Probs.begin();it!=Probs.end(); it++)
246 std::cout<<"end"<<std::endl;
// 64-bit accumulator for the weighted probabilities of all siblings.
252 unsigned long long int sumProb
=0;
254 std::vector
<symbol
>Symbols
;
256 CLanguageModel::Context iCurrentContext
;
259 //std::cout<<"start"<<std::endl;
263 HZ
= static_cast<std::string
>(pNode
->pszConversion
);
264 m_pAlphabet
->GetSymbols(&Symbols
, &HZ
, 0);
266 if(Symbols
.size()!=0){
267 pNode
->Symbol
= Symbols
[0];
268 //sumProb += Probs[Symbols[0]];
271 iCurrentContext
=m_pLanguageModel
->CloneContext(context
);
272 m_pLanguageModel
->EnterSymbol(iCurrentContext
, pNode
->Symbol
);
274 if(pStart
->GetChild()){
275 pNode
->SumPYProbStore
= GetSumPYProbs(iCurrentContext
, pStart
->GetChild(), nonuniform_norm
);
276 //std::cout<<"sumpyprobstore"<<pNode->SumPYProbStore<<std::endl;
279 pNode
->SumPYProbStore
= 1;
281 sumProb
+= (Probs
[pNode
->Symbol
]*(pNode
->SumPYProbStore
));
282 //std::cout<<"Probs[symbol]"<<Probs[Symbols[0]]<<std::endl;
283 //std::cout<<"sumProbs"<<sumProb<<std::endl;
288 pNode
= pNode
->GetNext();
296 std::vector <symbol >Symbols;
297 std::string HZ = static_cast<std::string>(pNode->pszConversion);
300 m_pAlphabet->GetSymbols(&Symbols, &HZ, 0);
303 if(pNode
->Symbol
!=-1){
306 // iCurrentContext=m_pLanguageModel->CloneContext(context);
307 //m_pLanguageModel->EnterSymbol(iCurrentContext, pNode->Symbol);
// The cast widens the product to 64 bits before multiplying by normalization.
310 pNode
->NodeSize
=static_cast<unsigned long long int>(Probs
[pNode
->Symbol
])*(pNode
->SumPYProbStore
)*normalization
/sumProb
;
313 std::cout<<"HZ"<<pNode->pszConversion<<std::endl;
314 std::cout<<"Probs"<<Probs[pNode->Symbol]<<std::endl;
315 std::cout<<"SumProbStore"<<pNode->SumPYProbStore<<std::endl;
316 std::cout<<"above"<<Probs[pNode->Symbol]*(pNode->SumPYProbStore)<<std::endl;
317 std::cout<<"sumprob"<<sumProb<<std::endl;
318 std::cout<<"nodesize"<<pNode->NodeSize<<std::endl;
323 pNode
->NodeSize
= 0;//hopefully this will not be displayed
326 if(pNode
->NodeSize
< 1)
329 iRemaining
-= pNode
->NodeSize
;
331 pNode
= pNode
->GetNext();
// Leftover space is split using integer division by iLeft.
338 int iDiff(iRemaining
/ iLeft
);
340 pNode
->NodeSize
+= iDiff
;
344 pNode
= pNode
->GetNext();
349 std::cout<<"size"<<pNode->NodeSize<<std::endl;
350 pNode = pNode ->pNext;
356 std::cout<<pNode->NodeSize<<",";
357 pNode = pNode->pNext;
360 std::cout<<std::endl;
363 //std::cout<<catStr<<std::endl;
365 //for(int i=0; i<Symbols.size(); i++)
366 // std::cout<<Symbols[i]<<",";
368 //std::cout<<std::endl;
370 //for(int i=0; i<Symbols.size();i++)
371 // std::cout<<m_pAlphabet->GetText(Symbols[i]);
373 //std::cout<<std::endl;
375 // pSizes[i] = m_pNCManager->GetLongParameter(LP_NORMALIZATION)*(100+5*freq[i])/(100*iNChildren+5*totalFreq);
// GetPhraseList: takes a character index, an output pointer for a phrase list
// and a CMid. The only body lines visible in this listing are a commented-out
// call to CEGetPhraseList and the commented-out assignment of its result to
// *psOutput.
// NOTE(review): the active body and the return value are not visible here;
// confirm what is returned and whether *psOutput is ever set, since
// ProcessPhrase dereferences the pointer it passes in.
379 bool CPinYinConversionHelper::GetPhraseList(int HZIndex
, SCENode
** psOutput
, int CMid
){
382 // if(CEGetPhraseList(HZIndex, &pStart, CMid)){
384 // *psOutput= pStart;
// BuildDataBase: rebuilds vContextData from scratch. vContextData is cleared,
// then a nested vector structure is assembled from the inside out:
//  - cell is an empty vector<int>; subCandStack holds one such cell;
//  - for i in [0, MAX_HZ_NUM) and j in [0, MAX_CARE_CAND), subIndexStack
//    receives i+1 copies of subCandStack, is pushed onto CandStack and then
//    cleared;
//  - CandStack is pushed onto IndexStack;
//  - finally MAX_CM_NUM copies of IndexStack are pushed onto vContextData.
// NOTE(review): the closing braces are not visible in this listing, so which
// loop each push_back belongs to is inferred from the embedded line numbers -
// confirm. No CandStack.clear() is visible; if there is none, CandStack keeps
// accumulating entries across values of i.
396 void CPinYinConversionHelper::BuildDataBase(){
398 std::vector
<std::vector
<std::vector
<std::vector
<std::vector
<int> > > > >IndexStack
;
399 std::vector
<std::vector
<std::vector
<std::vector
<int> > > >CandStack
;
400 std::vector
<std::vector
<std::vector
<int> > > subIndexStack
;
401 std::vector
<std::vector
<int> > subCandStack
;
402 std::vector
<int> cell
;
404 vContextData
.clear();
406 subCandStack
.push_back(cell
);
409 for(int i(0); i
<MAX_HZ_NUM
; i
++){
410 for(int j(0); j
<MAX_CARE_CAND
; j
++){
411 for(int k(0); k
<i
+1; k
++)
412 subIndexStack
.push_back(subCandStack
);
413 CandStack
.push_back(subIndexStack
);
414 subIndexStack
.clear();
416 IndexStack
.push_back(CandStack
);
420 for(int k(0); k
<MAX_CM_NUM
; k
++)
421 vContextData
.push_back(IndexStack
);
// ClearData: resets the stored context data for one CMid. For every
// i in [0, MAX_HZ_NUM), j in [0, MAX_CARE_CAND) and k in [0, i], the vector
// vContextData[CMid][i][j][k] is cleared and then given a single empty cell.
// NOTE(review): no bounds check on CMid is visible in this listing.
425 void CPinYinConversionHelper::ClearData(int CMid
){
427 std::vector
<int> cell
;
430 for(int i(0); i
<MAX_HZ_NUM
; i
++){
431 for(int j(0); j
<MAX_CARE_CAND
/*(m_pRoot[i]->pChild->IsHeadAndCandNum)*/; j
++){
432 for(int k(0); k
<i
+1; k
++){
433 vContextData
[CMid
][i
][j
][k
].clear();
434 vContextData
[CMid
][i
][j
][k
].push_back(cell
);
// GetDP: returns a pointer to the vContextData entry for the given CMid (a
// four-level nested vector whose innermost elements are vector<int> cells).
// The pointer refers to storage inside vContextData.
// NOTE(review): CMid is used as an index with no bounds check visible here.
442 std::vector
<std::vector
<std::vector
<std::vector
<std::vector
<int> > > > > * CPinYinConversionHelper::GetDP(int CMid
){
444 return &vContextData
[CMid
];
450 //THIS FUNCTION IS CALLED WHEN A SET OF CHILDREN IS BEING POPULATED
451 //AND BEFORE CALCULATING EACH NODE'S SCORE. THE POSITION IN THE
452 //SENTENCE IS GIVEN TO THE PY HELPER->LIBRARY AND A LIST OF PHRASES
453 //CORRESPONDING TO THE CHARACTER IS RETURNED. THESE PHRASES ARE
454 //PROCESSED INTO THE CONTEXT DATA IN PY HELPER
// ProcessPhrase: obtains a phrase list for position HZIndex through
// GetPhraseList(HZIndex, &pPhraseList, m_iCMID) and records each phrase in the
// context data returned by GetDP(m_iCMID). Steps visible in this listing:
//  - phrases are visited while pNode is non-null and iIdx <= MAX_CARE_PHRASE;
//  - a switch on pNode->AcCharCount precedes the scoring (its cases are not
//    visible here);
//  - for each i below pNode->AcCharCount, the 3-byte substring at offset 3*i
//    of pszConversion is looked up with HZLookup(HZIndex+i, ...) and stored in
//    CandIndex[i]; a cell is filled with score[i] followed by CandIndex[i-1]
//    down to CandIndex[0];
//  - when HZIndex+i does not exceed MAX_HZ_NUM-1, the cell is appended to
//    (*GetDP(m_iCMID))[HZIndex+i][CandIndex[i]][HZIndex];
//  - m_bPhrasesProcessed[HZIndex] is set to 1.
// NOTE(review): score and CandIndex are arrays sized by the run-time value
// m_iHZCount-HZIndex, i.e. variable-length arrays, which are a compiler
// extension rather than standard C++.
// NOTE(review): i runs up to AcCharCount while those arrays have
// m_iHZCount-HZIndex elements, and CandIndex[i] is used as a vector index; no
// range check on either is visible here - confirm. Whether cell is cleared
// between iterations is also not visible.
456 void CPinYinConversionHelper::ProcessPhrase(HZIDX HZIndex
){
458 SCENode
* pPhraseList
;
465 int score
[m_iHZCount
-HZIndex
];
467 CANDIDX CandIndex
[m_iHZCount
-HZIndex
]; //list to store candidates
468 //returned from HZlookup,
469 //used to allocate data
472 std::vector
<int> cell
;
474 if(!(GetPhraseList(HZIndex
, &pPhraseList
, m_iCMID
)))
477 if(pPhraseList
->AcCharCount
>4)
478 pNode
= pPhraseList
->GetNext();
482 while((pNode
)&&(iIdx
<=MAX_CARE_PHRASE
)){
484 //this section needs research. What scores would be a good estimate.
486 switch(pNode
->AcCharCount
){
503 for(int j(0); j
< m_iHZCount
-HZIndex
; j
++)
509 for(i
=0 ; (i
<pNode
->AcCharCount
); i
++){
510 strtemp
=pNode
->pszConversion
;
513 //std::cout<<"accharcount"<<pNode->AcCharCount<<std::endl;
514 //std::cout<<"the cut string is"<<strtemp.substr(3*i,3)<<std::endl;
515 //std::cout<<"list to look from
516 //is"<<m_pRoot[HZIndex+i]->pChild->pszConversion<<std::endl;
// Each character is taken as a 3-byte slice of the conversion string.
519 CandIndex
[i
] = HZLookup(HZIndex
+i
, strtemp
.substr(3*i
, 3));
522 //std::cout<<"the lookup is"<<CandIndex[i]<<std::endl;
527 cell
.push_back(score
[i
]);
528 for(int j(0); j
< i
; j
++)
529 cell
.push_back(CandIndex
[i
-j
-1]);
532 //say the phrase is XYZ(this) push back in each cell in the
533 //order: score, Z, Y, X so as to match with vTrace in
536 if(!(HZIndex
+ i
> MAX_HZ_NUM
-1))
537 (*(GetDP(m_iCMID
)))[HZIndex
+i
][CandIndex
[i
]][HZIndex
].push_back(cell
);
543 pNode
= pNode
->GetNext();
546 m_bPhrasesProcessed
[HZIndex
]=1;
// HZLookup: takes a character position and a string holding one character.
// The only active line visible in this listing is a guard that fires when
// HZIndex is beyond m_iHZCount-1 or strSource is not exactly 3 bytes long.
// The search itself appears only as commented-out code under
// "TODO: Reimplement".
// NOTE(review): the statements controlled by the guard and the return values
// are not visible here; ProcessPhrase uses the result as a vector index, so
// confirm what is returned on the failure path.
551 CANDIDX
CPinYinConversionHelper::HZLookup(HZIDX HZIndex
, const std::string
&strSource
){
554 // this was done before candindex was put into node member, change
558 if((HZIndex
> m_iHZCount
-1)||strSource
.size()!=3)
561 // TODO: Reimplement -----
563 // SCENode * pNode = m_pRoot[HZIndex]->pChild;
565 // while(pNode&&(iIdx<=MAX_CARE_CAND)){
567 // if(strSource== pNode->pszConversion)
569 // pNode = pNode->pNext;
578 //CALCULATES SCORE OF A CERTAIN CANDIDATE HZ CHARACTER NODE TO BE
579 //POPULATED, FROM THE CONTEXT DATA IN PY HELPER. FINDS VTRACE TO MATCH
580 //CONTEXT SEQUENCE STORED IN THE LAST LEVEL OF DATABASE
// CalculateScore: steps visible in this listing:
//  - starting at pNode and moving up through Parent(), while the node's
//    m_pNodeManager->GetID() equals 2, the CandIndex of each node's
//    m_pUserData (cast to SCENode*) is appended to vTrace;
//  - m_bTraceNeeded is set to false;
//  - CandIndex is compared against MAX_CARE_CAND (the controlled statement is
//    not visible here);
//  - HZIndex is taken from AcCharCount of pNode's own SCENode;
//  - every cell under (*GetDP(m_iCMID))[HZIndex][CandIndex] is visited; for
//    cells holding more than one element, elements 1..size-1 are compared
//    against vTrace[0..size-2].
// NOTE(review): how the score is accumulated and returned is not visible in
// this listing (the embedded line numbers skip). No check that vTrace has
// enough entries for vTrace[i], nor that HZIndex is in range, is visible -
// confirm in the complete file.
582 int CPinYinConversionHelper::CalculateScore(CDasherNode
* pNode
, CANDIDX CandIndex
){
583 CDasherNode
*pIterateDNode(pNode
);
589 bool addtick
=1; //bool to signal add score
593 //THIS SECTION IS TO FIND VTRACE, IN THE SAME WAY AS FINDING INPUT
599 while(pIterateDNode
&&(pIterateDNode
->m_pNodeManager
->GetID() == 2)) {
601 pTemp
=static_cast<SCENode
*>(pIterateDNode
->m_pUserData
);
606 vTrace
.push_back(pTemp
->CandIndex
);
607 pIterateDNode
= pIterateDNode
->Parent();
611 //THE FOLLOWING IS TESTING FOR VTRACE
612 // if(vTrace.size()!=0){
613 // std::cout<<"signal"<<std::endl;
614 // for(std::vector<int>::iterator it(vTrace.begin());it!=vTrace.end();it++)
615 // std::cout<< *it <<std::endl;
619 m_bTraceNeeded
= false;
624 if(CandIndex
>=MAX_CARE_CAND
)
629 pTemp
=static_cast<SCENode
*>(pNode
->m_pUserData
);
632 HZIndex
= pTemp
->AcCharCount
;
637 //THE DATA IS CONSTRUCTED OF UNITS OF SINGLE CELLS STORING CONTEXT SEQUENCE
638 //AND A CORRESPONDING SCORE
640 //LEVEL 1 : HZ INDEX : NUMBER OF CHARACTERS CONVERTED
641 //LEVEL 2 : CAND INDEX : NUMBER OF CANDIDATES WITH EACH POSITION
642 //LEVEL 3 : SUB HZ INDEX :(COULD BE REDUNDANT) CORRESPONDS TO
643 // INDEX OF CHARACTERS WHICH WERE CONSISTED IN PHRASES
644 //LEVEL 4 : SUB CAND INDEX : WHICH CAND WAS IN THE PHRASE
645 //LEVEL 5 : CELL: STORING PHASES IN HZ INDEX AND ASSIGNED SCORE
646 // IN THE WAY: 1.SCORE 2.Z 3.Y 4.X FOR PHRASE XYZ
647 // PREVIOUSLY PROCESSED
649 for(std::vector
<std::vector
<std::vector
<int> > >::iterator
itIndex((*(GetDP(m_iCMID
)))[HZIndex
][CandIndex
].begin()); itIndex
!=(*(GetDP(m_iCMID
)))[HZIndex
][CandIndex
].end();itIndex
++){
651 for(std::vector
<std::vector
<int> >::iterator
itCand(itIndex
->begin()); itCand
!=itIndex
->end(); itCand
++){
653 //IF HAS LEFT CONTEXT INFORMATION, MATCH VTRACE WITH CELL
656 if((*itCand
).size() !=1){
// Element 0 of a cell is skipped; elements 1.. are matched against vTrace.
657 for(unsigned int i(0); i
<(*itCand
).size()-1; i
++)
658 if((*itCand
)[i
+1]!=vTrace
[i
]){
// TrainChPPM: runs a loop over i in [0, 1), i.e. a single iteration (the TODO
// notes the bound was reduced from 10).
// NOTE(review): the loop body is not visible in this listing; presumably it
// calls ProcessFile(i) - confirm.
675 void CPinYinConversionHelper::TrainChPPM(){
677 // TODO: Changed to 1 from 10
678 for(int i
=0; i
<1;i
++)
// ProcessFile: feeds one training file to m_pLanguageModel. Steps visible in
// this listing:
//  - an empty language-model context (trainContext) is created;
//  - the path is built as $HOME + "/training/corpus/character/" + "C" +
//    one letter + ".txt", the letter being Alph[index] from
//    "ABCDEFGHJKLMNPR"; the path is printed and the file opened in "rb" mode;
//  - a loop runs while the current byte (cget) is outside 0xE4..0xE9 and end
//    of file has not been reached; bytes above 0xE9 lead to a seek back to
//    pos and a 3-byte read into str;
//  - after that loop, a seek to pos and a 3-byte read give str, which becomes
//    HZ and is converted with m_pAlphabet->GetSymbols; Sym[0] is passed to
//    LearnSymbol(trainContext, ...), and "not found!" is printed on the other
//    branch.
// NOTE(review): declarations of strPath, fp, cget, pos and str, the outer
// loop, and the branch/brace lines are not visible here (the embedded line
// numbers skip) - confirm control flow in the complete file.
// NOTE(review): no null check on getenv("HOME") before strcpy, no bound on
// strPath or on index into Alph, no fclose and no release of trainContext are
// visible in this listing.
// NOTE(review): the statements numbered 803-820 repeat the read/learn
// sequence with extra printing; presumably they are disabled - confirm.
683 void CPinYinConversionHelper::ProcessFile(int index
){
686 CLanguageModel::Context trainContext
;
687 trainContext
= m_pLanguageModel
->CreateEmptyContext();
693 const char* Alph
="ABCDEFGHJKLMNPR";
707 strcpy (strPath
, (char *) getenv ("HOME"));
708 strcat (strPath
, "/training/corpus/character/");
709 strcat (strPath
, "C");
710 strncat (strPath
, Alph
+index
, 1);
711 strcat (strPath
, ".txt");
713 printf("strPath is %s\n", strPath
);
715 fp
= fopen (strPath
, "rb");
718 printf("cannot open file or incorrect directory\n");
724 // printf("OXE$4= %d\n", (unsigned char)0xE4); **228**
725 // printf("OXE9= %d\n", (unsigned char)0xE9); **233**
726 //printf("BEFORE print the integer code for unsigned char %d\n", (unsigned char) cget);
727 while (((unsigned char)cget
< (unsigned char) 0xE4 || (unsigned char) cget
> (unsigned char) 0xE9)&&!feof(fp
)){
729 if((unsigned char) cget
> (unsigned char) 0xE9){
731 fseek(fp
, pos
, SEEK_SET
);
732 fread(str
, sizeof(char)*3, 1, fp
);
736 //printf("UNICODE SYMBOL/NUMBER |%s|\n", str);
750 //printf("NON-UNICODE character |%c|\n", cget);
760 fseek(fp
, pos
, SEEK_SET
);
761 fread(str
, sizeof(char)*3, 1, fp
);
763 //printf("HZ |%s|\n",str);
764 // printf("first byte %d ", (unsigned char)str[0]);
765 // printf("seconde byte %d\n", (unsigned char)str[1]);
766 // printf("third byte %d\n", (unsigned char)str[2]);
768 HZ
= static_cast<std::string
>(str
);
770 // std::cout<<"HZ is "<<HZ<<std::endl;
772 std::vector
<symbol
> Sym
;
773 m_pAlphabet
->GetSymbols(&Sym
, &HZ
, 0);
776 m_pLanguageModel
->LearnSymbol(trainContext
, Sym
[0]);
778 std::cout
<<HZ
<<"not found!"<<std::endl
;
803 fread(str, sizeof(char)*3, 1, fp);
805 printf("HZ |%s|\n",str);
806 // printf("first byte %d ", (unsigned char)str[0]);
807 // printf("seconde byte %d\n", (unsigned char)str[1]);
808 // printf("third byte %d\n", (unsigned char)str[2]);
810 HZ = static_cast<std::string>(str);
812 std::cout<<"HZ is "<<HZ<<std::endl;
814 std::vector<symbol> Sym;
815 m_pAlphabet->GetSymbols(&Sym, &HZ, 0);
818 m_pLanguageModel->LearnSymbol(trainContext, Sym[0]);
820 // std::cout<<"not found!"<<std::endl;