6 #include "LanguageModelling/LanguageModel.h"
7 #include "LanguageModelling/PPMLanguageModel.h"
8 #include <ChiCEInterface.h>
11 #include "PinYinConversionHelper.h"
13 #include <SCENodeNew.h>
15 using namespace Dasher
;
// Constructor. Looks up the simplified-Chinese alphabet by name through
// pCAlphIO, wraps it in a CAlphabet (m_pAlphabet), builds a PPM language model
// over it (m_pLanguageModel), then resets the trace flag and the per-character
// "phrases processed" flags.
//   pEventHandler, pSettingsStore - forwarded to the CPPMLanguageModel constructor.
//   pCAlphIO                      - queried with GetInfo() for the alphabet description.
17 CPinYinConversionHelper::CPinYinConversionHelper(Dasher::CEventHandler
*pEventHandler
, CSettingsStore
*pSettingsStore
, Dasher::CAlphIO
*pCAlphIO
){
19 //TESTING FOR UTF-8 CHINESE IN C++ STRING
// NOTE(review): the next three statements declare a char* named SimpChAlphabet,
// which would clash with the std::string of the same name declared right after
// them, and "rw" is not a standard fopen() mode. Presumably this is disabled
// test code whose comment delimiters are not visible here - confirm.
22 char* SimpChAlphabet = (char*)malloc(62);
23 fp = fopen("test.txt", "rw");
24 fread(SimpChAlphabet, 1, 62, fp);
// Name of the alphabet definition to load (UTF-8 text in the literal).
28 const std::string SimpChAlphabet
= "Chinese / 简体中文 (simplified chinese, in pin yin groups)";
31 Dasher::CAlphIO::AlphInfo oAlphInfo
= pCAlphIO
->GetInfo(SimpChAlphabet
);
33 m_pAlphabet
= new CAlphabet(oAlphInfo
);
35 // std::cout<<SimpChAlphabet<<std::endl;
36 // for (int i = 0; i < 20; i++)
37 // std::cout<<m_pAlphabet->GetDisplayText(i)<<std::endl;
39 // TODO: Need to figure out what this does - it was previously set to true in PopulateChildren
// Symbol alphabet handed to the language model: sized from the number of text
// symbols, with the space symbol and a back-pointer to m_pAlphabet set on it.
41 CSymbolAlphabet
alphabet(m_pAlphabet
->GetNumberTextSymbols());
42 alphabet
.SetSpaceSymbol(m_pAlphabet
->GetSpaceSymbol()); // FIXME - is this right, or do we have to do some kind of translation?
43 alphabet
.SetAlphabetPointer(m_pAlphabet
); // Horrible hack, but ignore for now.
45 m_pLanguageModel
= new CPPMLanguageModel(pEventHandler
, pSettingsStore
, alphabet
);
48 //Train the PPM Model for Chinese Alphabet
54 m_bTraceNeeded
= true;//reset trace bool
56 //clears the process phrase flags
57 for(int i(0); i
<MAX_HZ_NUM
; i
++)
58 m_bPhrasesProcessed
[i
]=0;
// Converts the string strSource by calling CEConvert() and, when that call
// reports success, rebuilds the returned SCENodeNew chains as SCENode objects
// linked through SetChild()/SetNext().
//   strSource  - input text, passed to CEConvert() as a C string.
//   pRoot      - not referenced in the lines visible here.
//   childCount - not referenced in the lines visible here.
//   CMid       - forwarded unchanged to CEConvert().
// NOTE(review): iHZCount is passed to CEConvert() by address but its
// declaration is not visible in this function - presumably a member; confirm.
66 bool CPinYinConversionHelper::Convert(const std::string
&strSource
, SCENode
** pRoot
, int * childCount
, int CMid
) {
68 SCENodeNew
*pConversionList
;
71 if(CEConvert (strSource
.c_str(), &pConversionList
, &iHZCount
, CMid
)){
72 SCENodeNew
*pHead(pConversionList
);
74 std::vector
<SCENodeNew
*> vHeads
;
// Record the current head and step down through pChild
// (presumably repeated in a loop whose header is not visible here).
77 vHeads
.push_back(pHead
);
78 pHead
= pHead
->pChild
;
81 SCENode
*pTail
= NULL
;
82 SCENode
*pNextTail
= NULL
;
// Visit the recorded heads from last to first.
84 for(std::vector
<SCENodeNew
*>::reverse_iterator
it(vHeads
.rbegin()); it
!= vHeads
.rend(); ++it
) {
85 SCENodeNew
*pCurrentNode(*it
);
87 SCENode
*pPreviousNode
= NULL
;
90 SCENode
*pNewNode
= new SCENode
;
// Each newly created node gets the current pTail as its child.
93 pNewNode
->SetChild(pTail
);
// Chain the new node after the previous one, then release the local
// reference to the previous node before moving on along pNext.
96 pPreviousNode
->SetNext(pNewNode
);
104 pPreviousNode
->Unref();
106 pPreviousNode
= pNewNode
;
107 pCurrentNode
= pCurrentNode
->pNext
;
111 pPreviousNode
->Unref();
123 // // TODO: Now need to convert...
127 // // Connect up the rest of the nodes to make a lattice
128 // SCENode *pHead(pStart);
131 // SCENode *pNewChild(pHead->GetChild());
132 // SCENode *pCurrent(pHead->GetNext());
135 // pCurrent->SetChild(pNewChild);
136 // pCurrent = pCurrent->GetNext();
139 // pHead = pHead->GetChild();
// Accumulates language-model probabilities over the candidate list starting at
// pPYCandStart: for each node the first symbol of its pszConversion text is
// looked up and Probs[symbol] is added to sumProb.
//   context      - language-model context passed to GetProbs().
//   pPYCandStart - first node of the list, traversed with GetNext().
//   norm         - normalisation value passed to GetProbs().
// The return statement is not visible here; presumably sumProb is returned.
150 unsigned int CPinYinConversionHelper::GetSumPYProbs(Dasher::CLanguageModel::Context context
, SCENode
* pPYCandStart
, int norm
){
152 std::vector
<unsigned int > Probs
;
153 unsigned int sumProb
=0;
155 m_pLanguageModel
->GetProbs(context
, Probs
, norm
);
157 SCENode
* pCurrentNode
= pPYCandStart
;
161 std::vector
<symbol
>Symbols
;
162 std::string HZ
= static_cast<std::string
>(pCurrentNode
->pszConversion
);
163 // Distribute the remaining space evenly
165 m_pAlphabet
->GetSymbols(&Symbols
, &HZ
, 0);
// Nodes whose text yields no symbol contribute nothing to the sum.
167 if(Symbols
.size()!=0)
168 sumProb
+= Probs
[Symbols
[0]];
169 pCurrentNode
= pCurrentNode
->GetNext();
// Takes a language-model context, a vector of probabilities by reference and a
// normalisation value.
// NOTE(review): no statements of the body are visible here - confirm whether
// this function is intentionally empty.
176 void CPinYinConversionHelper::GetProbs(Dasher::CLanguageModel::Context context
, std::vector
< unsigned int >&Probs
, int norm
){
// Assigns NodeSize to the nodes of the sibling list starting at pStart.
// Steps visible in this function:
//   1. derive uniform_add and nonuniform_norm from normalization and uniform;
//   2. fetch symbol probabilities for `context` from the language model;
//   3. first pass: resolve each node's Symbol from its pszConversion text, store
//      SumPYProbStore and accumulate sumProb += Probs[Symbol] * SumPYProbStore;
//   4. second pass: NodeSize = Probs[Symbol] * SumPYProbStore * normalization / sumProb
//      for nodes whose Symbol is not -1, subtracting each size from iRemaining;
//   5. add iRemaining / iLeft to node sizes.
// Diagnostic values are also written to std::cout.
//   pStart        - first node of the list, traversed with GetNext().
//   context       - language-model context used for the probabilities.
//   normalization - total the node sizes are scaled to.
//   uniform       - share of the norm spread uniformly, in 1/1000 units.
//   iNChildren    - initial value of iLeft, the divisor in step 5.
182 void CPinYinConversionHelper::AssignSizes(SCENode
* pStart
, CLanguageModel::Context context
, long normalization
, int uniform
, int iNChildren
){
184 SCENode
*pNode
= pStart
;
186 std::vector
<unsigned int > Probs
;
188 int iSymbols
= m_pAlphabet
->GetNumberSymbols();
189 int iLeft(iNChildren
);
190 int iRemaining(normalization
);
195 int iNorm
= normalization
;
198 //IGNORE CONTROL MODE FOR NOW
199 // if(!GetBoolParameter(BP_CONTROL_MODE)) {
201 uniform_add
= ((iNorm
* uniform
) / 1000) / (iSymbols
- 2); // Subtract 2 from no symbols to lose control/root nodes
202 nonuniform_norm
= iNorm
- (iSymbols
- 2) * uniform_add
;
// NOTE(review): the next three statements repeat the computation with 5% of
// iNorm set aside as control_space; presumably they belong to the control-mode
// branch that is disabled above - confirm.
206 control_space = int (iNorm * 0.05);
207 uniform_add = (((iNorm - control_space) * uniform / 1000) / (iSymbols - 2)); // Subtract 2 from no symbols to lose control/root nodes
208 nonuniform_norm = iNorm - control_space - (iSymbols - 2) * uniform_add;
213 // context = m_pLanguageModel->CreateEmptyContext();
215 //Testing Code for PYCHelper GetPYSumProbs
218 CLanguageModel::Context iContext = m_pLanguageModel->CreateEmptyContext();
220 SCENode * pTemp = pStart;
222 std::cout<<"test sum probs"<<GetSumPYProbs(iContext, pTemp, nonuniform_norm)<<std::endl;
223 std::cout<<"test norm"<<nonuniform_norm<<std::endl;
228 m_pLanguageModel
->GetProbs(context
, Probs
, nonuniform_norm
);
231 std::vector<unsigned int>::iterator it;
232 for(it = Probs.begin();it!=Probs.end(); it++)
235 std::cout<<"end"<<std::endl;
// 64-bit accumulator for the weighted probability total used as the divisor below.
241 unsigned long long int sumProb
=0;
243 std::vector
<symbol
>Symbols
;
245 CLanguageModel::Context iCurrentContext
;
248 //std::cout<<"start"<<std::endl;
252 HZ
= static_cast<std::string
>(pNode
->pszConversion
);
253 m_pAlphabet
->GetSymbols(&Symbols
, &HZ
, 0);
255 if(Symbols
.size()!=0){
256 pNode
->Symbol
= Symbols
[0];
257 //sumProb += Probs[Symbols[0]];
// Work on a copy of the caller's context extended by this node's symbol.
260 iCurrentContext
=m_pLanguageModel
->CloneContext(context
);
261 m_pLanguageModel
->EnterSymbol(iCurrentContext
, pNode
->Symbol
);
// NOTE(review): the child is taken from pStart rather than from pNode -
// confirm this is intended.
263 if(pStart
->GetChild()){
264 pNode
->SumPYProbStore
= GetSumPYProbs(iCurrentContext
, pStart
->GetChild(), nonuniform_norm
);
265 //std::cout<<"sumpyprobstore"<<pNode->SumPYProbStore<<std::endl;
268 pNode
->SumPYProbStore
= 1;
270 sumProb
+= (Probs
[pNode
->Symbol
]*(pNode
->SumPYProbStore
));
271 //std::cout<<"Probs[symbol]"<<Probs[Symbols[0]]<<std::endl;
272 //std::cout<<"sumProbs"<<sumProb<<std::endl;
277 pNode
= pNode
->GetNext();
285 std::vector <symbol >Symbols;
286 std::string HZ = static_cast<std::string>(pNode->pszConversion);
289 m_pAlphabet->GetSymbols(&Symbols, &HZ, 0);
292 if(pNode
->Symbol
!=-1){
295 // iCurrentContext=m_pLanguageModel->CloneContext(context);
296 //m_pLanguageModel->EnterSymbol(iCurrentContext, pNode->Symbol);
// Scale the node's weighted probability to `normalization`; the cast makes the
// multiplication happen in unsigned 64-bit arithmetic.
299 pNode
->NodeSize
=static_cast<unsigned long long int>(Probs
[pNode
->Symbol
])*(pNode
->SumPYProbStore
)*normalization
/sumProb
;
302 std::cout<<"HZ"<<pNode->pszConversion<<std::endl;
303 std::cout<<"Probs"<<Probs[pNode->Symbol]<<std::endl;
304 std::cout<<"SumProbStore"<<pNode->SumPYProbStore<<std::endl;
305 std::cout<<"above"<<Probs[pNode->Symbol]*(pNode->SumPYProbStore)<<std::endl;
306 std::cout<<"sumprob"<<sumProb<<std::endl;
307 std::cout<<"nodesize"<<pNode->NodeSize<<std::endl;
312 pNode
->NodeSize
= 0;//hopefully this will not be displayed
315 if(pNode
->NodeSize
< 1)
318 iRemaining
-= pNode
->NodeSize
;
320 pNode
= pNode
->GetNext();
// Share out whatever is left of the normalisation total: each node gets
// iRemaining / iLeft added to its size.
327 int iDiff(iRemaining
/ iLeft
);
329 pNode
->NodeSize
+= iDiff
;
333 pNode
= pNode
->GetNext();
338 std::cout<<"size"<<pNode->NodeSize<<std::endl;
339 pNode = pNode ->pNext;
345 std::cout<<pNode->NodeSize<<",";
346 pNode = pNode->pNext;
349 std::cout<<std::endl;
352 //std::cout<<catStr<<std::endl;
354 //for(int i=0; i<Symbols.size(); i++)
355 // std::cout<<Symbols[i]<<",";
357 //std::cout<<std::endl;
359 //for(int i=0; i<Symbols.size();i++)
360 // std::cout<<m_pAlphabet->GetText(Symbols[i]);
362 //std::cout<<std::endl;
364 // pSizes[i] = m_pNCManager->GetLongParameter(LP_NORMALIZATION)*(100+5*freq[i])/(100*iNChildren+5*totalFreq);
// Takes a character index, an output pointer for a node list and a CMid value.
// The only body lines visible here are commented out: they called
// CEGetPhraseList(HZIndex, &pStart, CMid) and stored pStart in *psOutput.
// NOTE(review): the live return value is not visible - confirm what callers receive.
368 bool CPinYinConversionHelper::GetPhraseList(int HZIndex
, SCENode
** psOutput
, int CMid
){
371 // if(CEGetPhraseList(HZIndex, &pStart, CMid)){
373 // *psOutput= pStart;
// Rebuilds vContextData as a five-level nested vector of int.
// vContextData is cleared, one template (IndexStack) is assembled from nested
// loops over MAX_HZ_NUM, MAX_CARE_CAND and k < i+1, and a copy of that
// template is pushed once per k < MAX_CM_NUM.
385 void CPinYinConversionHelper::BuildDataBase(){
387 std::vector
<std::vector
<std::vector
<std::vector
<std::vector
<int> > > > >IndexStack
;
388 std::vector
<std::vector
<std::vector
<std::vector
<int> > > >CandStack
;
389 std::vector
<std::vector
<std::vector
<int> > > subIndexStack
;
390 std::vector
<std::vector
<int> > subCandStack
;
391 std::vector
<int> cell
;
393 vContextData
.clear();
// The innermost list starts out holding a single empty cell.
395 subCandStack
.push_back(cell
);
398 for(int i(0); i
<MAX_HZ_NUM
; i
++){
399 for(int j(0); j
<MAX_CARE_CAND
; j
++){
// Entry i gets i+1 sub-index slots per candidate.
400 for(int k(0); k
<i
+1; k
++)
401 subIndexStack
.push_back(subCandStack
);
402 CandStack
.push_back(subIndexStack
);
403 subIndexStack
.clear();
405 IndexStack
.push_back(CandStack
);
// One copy of the assembled template per k < MAX_CM_NUM.
409 for(int k(0); k
<MAX_CM_NUM
; k
++)
410 vContextData
.push_back(IndexStack
);
// Resets the stored data for one CMid: every vContextData[CMid][i][j][k] list
// (i < MAX_HZ_NUM, j < MAX_CARE_CAND, k <= i) is emptied and then given a
// single empty cell.
//   CMid - first-level index into vContextData; not range-checked here.
414 void CPinYinConversionHelper::ClearData(int CMid
){
416 std::vector
<int> cell
;
419 for(int i(0); i
<MAX_HZ_NUM
; i
++){
420 for(int j(0); j
<MAX_CARE_CAND
/*(m_pRoot[i]->pChild->IsHeadAndCandNum)*/; j
++){
421 for(int k(0); k
<i
+1; k
++){
422 vContextData
[CMid
][i
][j
][k
].clear();
423 vContextData
[CMid
][i
][j
][k
].push_back(cell
);
// Returns a pointer to the vContextData entry for CMid (the remaining four
// nesting levels). The pointer refers to storage inside vContextData.
//   CMid - first-level index into vContextData; not range-checked here.
431 std::vector
<std::vector
<std::vector
<std::vector
<std::vector
<int> > > > > * CPinYinConversionHelper::GetDP(int CMid
){
433 return &vContextData
[CMid
];
439 //THIS FUNCTION IS CALLED WHEN A SET OF CHILDREN IS BEING POPULATED
440 //AND BEFORE CALCULATING EACH NODE'S SCORE. THE POSITION IN THE
441 //SENTENCE IS GIVEN TO THE PY HELPER->LIBRARY AND A LIST OF PHRASES
442 //CORRESPONDING TO THE CHARACTER IS RETURNED. THESE PHRASES ARE
443 //PROCESSED INTO THE CONTEXT DATA IN PY HELPER
//
// Visible steps: fetch the phrase list for HZIndex with GetPhraseList(); for
// each phrase node (while iIdx <= MAX_CARE_PHRASE) look up every 3-byte
// character of its pszConversion with HZLookup(); build a cell holding the
// score followed by the preceding candidate indices in reverse order; append
// that cell to the data returned by GetDP(m_iCMID). Finally mark
// m_bPhrasesProcessed[HZIndex].
//   HZIndex - position of the character whose phrases are processed.
445 void CPinYinConversionHelper::ProcessPhrase(HZIDX HZIndex
){
447 SCENode
* pPhraseList
;
// NOTE(review): score and CandIndex are arrays sized by a run-time expression,
// which standard C++ does not allow (compiler extension) - confirm the target
// compilers accept it.
454 int score
[m_iHZCount
-HZIndex
];
456 CANDIDX CandIndex
[m_iHZCount
-HZIndex
]; //list to store candidates
457 //returned from HZlookup,
458 //used to allocate data
461 std::vector
<int> cell
;
463 if(!(GetPhraseList(HZIndex
, &pPhraseList
, m_iCMID
)))
466 if(pPhraseList
->AcCharCount
>4)
467 pNode
= pPhraseList
->GetNext();
471 while((pNode
)&&(iIdx
<=MAX_CARE_PHRASE
)){
473 //this section needs research. What scores would be a good estimate.
475 switch(pNode
->AcCharCount
){
492 for(int j(0); j
< m_iHZCount
-HZIndex
; j
++)
498 for(i
=0 ; (i
<pNode
->AcCharCount
); i
++){
499 strtemp
=pNode
->pszConversion
;
502 //std::cout<<"accharcount"<<pNode->AcCharCount<<std::endl;
503 //std::cout<<"the cut string is"<<strtemp.substr(3*i,3)<<std::endl;
504 //std::cout<<"list to look from
505 //is"<<m_pRoot[HZIndex+i]->pChild->pszConversion<<std::endl;
// Character i of the phrase is taken as the 3 bytes starting at offset 3*i.
508 CandIndex
[i
] = HZLookup(HZIndex
+i
, strtemp
.substr(3*i
, 3));
511 //std::cout<<"the lookup is"<<CandIndex[i]<<std::endl;
516 cell
.push_back(score
[i
]);
517 for(int j(0); j
< i
; j
++)
518 cell
.push_back(CandIndex
[i
-j
-1]);
521 //say the phrase is XYZ(this) push back in each cell in the
522 //order: score, Z, Y, X so as to match with vTrace in
// Only store the cell while HZIndex + i stays within MAX_HZ_NUM entries.
// NOTE(review): CandIndex[i] is used as an index with no visible validity
// check - confirm HZLookup() cannot return an out-of-range value.
525 if(!(HZIndex
+ i
> MAX_HZ_NUM
-1))
526 (*(GetDP(m_iCMID
)))[HZIndex
+i
][CandIndex
[i
]][HZIndex
].push_back(cell
);
532 pNode
= pNode
->GetNext();
535 m_bPhrasesProcessed
[HZIndex
]=1;
// Looks up the candidate index of the character strSource at position HZIndex.
// The visible code only validates the arguments (HZIndex must not exceed
// m_iHZCount - 1 and strSource must be exactly 3 bytes long); the search over
// the candidate list is commented out pending reimplementation.
// NOTE(review): the values returned on the valid and invalid paths are not
// visible here - confirm.
540 CANDIDX
CPinYinConversionHelper::HZLookup(HZIDX HZIndex
, const std::string
&strSource
){
543 // this was done before candindex was put into node member, change
547 if((HZIndex
> m_iHZCount
-1)||strSource
.size()!=3)
550 // TODO: Reimplement -----
552 // SCENode * pNode = m_pRoot[HZIndex]->pChild;
554 // while(pNode&&(iIdx<=MAX_CARE_CAND)){
556 // if(strSource== pNode->pszConversion)
558 // pNode = pNode->pNext;
567 //CALCULATES SCORE OF A CERTAIN CANDIDATE HZ CHARACTER NODE TO BE
568 //POPULATED, FROM THE CONTEXT DATA IN PY HELPER. FINDS VTRACE TO MATCH
569 //CONTEXT SEQUENCE STORED IN THE LAST LEVEL OF DATABASE
//
//   pNode     - Dasher node being scored; its m_pUserData is read as an SCENode.
//   CandIndex - candidate index of the node; compared against MAX_CARE_CAND
//               before being used to index the context data.
// The accumulation and return of the score are not visible here.
571 int CPinYinConversionHelper::CalculateScore(CDasherNode
* pNode
, CANDIDX CandIndex
){
572 CDasherNode
*pIterateDNode(pNode
);
578 bool addtick
=1; //bool to signal add score
582 //THIS SECTION IS TO FIND VTRACE, IN THE SAME WAY AS FINDING INPUT
// Walk up the parent chain while the node's manager reports ID 2, recording
// the CandIndex of each node's SCENode in vTrace.
588 while(pIterateDNode
&&(pIterateDNode
->m_pNodeManager
->GetID() == 2)) {
590 pTemp
=static_cast<SCENode
*>(pIterateDNode
->m_pUserData
);
595 vTrace
.push_back(pTemp
->CandIndex
);
596 pIterateDNode
= pIterateDNode
->Parent();
600 //THE FOLLOWING IS TESTING FOR VTRACE
601 // if(vTrace.size()!=0){
602 // std::cout<<"signal"<<std::endl;
603 // for(std::vector<int>::iterator it(vTrace.begin());it!=vTrace.end();it++)
604 // std::cout<< *it <<std::endl;
608 m_bTraceNeeded
= false;
613 if(CandIndex
>=MAX_CARE_CAND
)
618 pTemp
=static_cast<SCENode
*>(pNode
->m_pUserData
);
621 HZIndex
= pTemp
->AcCharCount
;
626 //THE DATA IS CONSTRUCTED OF UNITS OF SINGLE CELLS STORING CONTEXT SEQUENCE
627 //AND A CORRESPONDING SCORE
629 //LEVEL 1 : HZ INDEX : NUMBER OF CHARACTERS CONVERTED
630 //LEVEL 2 : CAND INDEX : NUMBER OF CANDIDATES WITH EACH POSITION
631 //LEVEL 3 : SUB HZ INDEX :(COULD BE REDUNDANT) CORRESPONDS TO
632 // INDEX OF CHARACTERS WHICH WERE CONSISTED IN PHRASES
633 //LEVEL 4 : SUB CAND INDEX : WHICH CAND WAS IN THE PHRASE
634 //LEVEL 5 : CELL: STORING PHRASES IN HZ INDEX AND ASSIGNED SCORE
635 // IN THE WAY: 1.SCORE 2.Z 3.Y 4.X FOR PHRASE XYZ
636 // PREVIOUSLY PROCESSED
638 for(std::vector
<std::vector
<std::vector
<int> > >::iterator
itIndex((*(GetDP(m_iCMID
)))[HZIndex
][CandIndex
].begin()); itIndex
!=(*(GetDP(m_iCMID
)))[HZIndex
][CandIndex
].end();itIndex
++){
640 for(std::vector
<std::vector
<int> >::iterator
itCand(itIndex
->begin()); itCand
!=itIndex
->end(); itCand
++){
642 //IF HAS LEFT CONTEXT INFORMATION, MATCH VTRACE WITH CELL
// A cell of size 1 holds only a score; longer cells carry context entries in
// positions 1..size-1, compared against vTrace[0..size-2].
// NOTE(review): vTrace[i] is read with no visible check that vTrace is at
// least that long - confirm.
645 if((*itCand
).size() !=1){
646 for(unsigned int i(0); i
<(*itCand
).size()-1; i
++)
647 if((*itCand
)[i
+1]!=vTrace
[i
]){
// Runs a loop for i = 0..9.
// NOTE(review): the loop body is not visible here; presumably it calls
// ProcessFile(i) to train on each corpus file - confirm.
664 void CPinYinConversionHelper::TrainChPPM(){
666 for(int i
=0; i
<10;i
++)
// Trains the language model on one corpus file.
// The path is built as $HOME/training/corpus/character/C<letter>.txt, where
// <letter> is the character at position `index` of "ABCDEFGHJKLMNPR". The file
// is opened in binary mode and scanned byte by byte: bytes outside 0xE4..0xE9
// are skipped, otherwise 3 bytes are read as one character, converted to a
// symbol with GetSymbols() and fed to LearnSymbol() on a fresh context.
// Characters with no symbol are reported on std::cout.
//   index - selects the letter of the file name; not range-checked in the
//           lines visible here.
671 void CPinYinConversionHelper::ProcessFile(int index
){
674 CLanguageModel::Context trainContext
;
675 trainContext
= m_pLanguageModel
->CreateEmptyContext();
681 const char* Alph
="ABCDEFGHJKLMNPR";
// NOTE(review): getenv("HOME") is used without a visible null check, and the
// size of strPath is not visible here - confirm the buffer is large enough.
695 strcpy (strPath
, (char *) getenv ("HOME"));
696 strcat (strPath
, "/training/corpus/character/");
697 strcat (strPath
, "C");
698 strncat (strPath
, Alph
+index
, 1);
699 strcat (strPath
, ".txt");
701 printf("strPath is %s\n", strPath
);
703 fp
= fopen (strPath
, "rb");
706 printf("cannot open file or incorrect directory\n");
712 // printf("OXE$4= %d\n", (unsigned char)0xE4); **228**
713 // printf("OXE9= %d\n", (unsigned char)0xE9); **233**
714 //printf("BEFORE print the integer code for unsigned char %d\n", (unsigned char) cget);
// Skip bytes outside the 0xE4..0xE9 lead-byte range until end of file.
715 while (((unsigned char)cget
< (unsigned char) 0xE4 || (unsigned char) cget
> (unsigned char) 0xE9)&&!feof(fp
)){
717 if((unsigned char) cget
> (unsigned char) 0xE9){
// Seek back to the saved position and read the 3-byte sequence as a whole.
719 fseek(fp
, pos
, SEEK_SET
);
720 fread(str
, sizeof(char)*3, 1, fp
);
724 //printf("UNICODE SYMBOL/NUMBER |%s|\n", str);
738 //printf("NON-UNICODE character |%c|\n", cget);
748 fseek(fp
, pos
, SEEK_SET
);
749 fread(str
, sizeof(char)*3, 1, fp
);
751 //printf("HZ |%s|\n",str);
752 // printf("first byte %d ", (unsigned char)str[0]);
753 // printf("seconde byte %d\n", (unsigned char)str[1]);
754 // printf("third byte %d\n", (unsigned char)str[2]);
756 HZ
= static_cast<std::string
>(str
);
758 // std::cout<<"HZ is "<<HZ<<std::endl;
760 std::vector
<symbol
> Sym
;
761 m_pAlphabet
->GetSymbols(&Sym
, &HZ
, 0);
764 m_pLanguageModel
->LearnSymbol(trainContext
, Sym
[0]);
766 std::cout
<<HZ
<<"not found!"<<std::endl
;
// NOTE(review): the statements below repeat the read/convert/learn sequence
// above with extra printing; presumably they sit in a disabled block whose
// delimiters are not visible here - confirm.
791 fread(str, sizeof(char)*3, 1, fp);
793 printf("HZ |%s|\n",str);
794 // printf("first byte %d ", (unsigned char)str[0]);
795 // printf("seconde byte %d\n", (unsigned char)str[1]);
796 // printf("third byte %d\n", (unsigned char)str[2]);
798 HZ = static_cast<std::string>(str);
800 std::cout<<"HZ is "<<HZ<<std::endl;
802 std::vector<symbol> Sym;
803 m_pAlphabet->GetSymbols(&Sym, &HZ, 0);
806 m_pLanguageModel->LearnSymbol(trainContext, Sym[0]);
808 // std::cout<<"not found!"<<std::endl;