5 #include "CannaConversionHelper.h"
7 #include <canna/jrkanji.h>
13 #include <iostream> //For testing 23 June 2005
15 using namespace Dasher
;
// Constructor: records the size-assignment mode and the candidate ordering,
// then initialises Canna, creates a working context and mounts dictionaries.
//   pNCManager, pAlphabet - forwarded unchanged to the CConversionManager base.
//   Type  - stored in iType  (0 = uniform, 1 = 1/(n+1), 2 = 1/(n+1)/n).
//   Order - stored in iOrder (0 = canna, 1 = Unicode, 2 = Shift_JIS).
// NOTE(review): no check of the RkInitialize()/malloc() results and no free()
// of buf is visible in this listing - confirm failure handling and release.
17 CCannaConversionHelper::CCannaConversionHelper(CNodeCreationManager
*pNCManager
, const CAlphInfo
*pAlphabet
, int Type
, int Order
)
18 : CConversionManager(pNCManager
, pAlphabet
) {
24 iType
= Type
;// 0 = uniform, 1 = 1/(n+1),2 = 1/(n+1)/n
25 iOrder
= Order
; // 0 = canna, 1 = Unicode , 2 = Shift_JIS
// iconv descriptor converting from UTF8 to SJIS (iconv_open takes tocode, fromcode).
27 icon
= iconv_open("SJIS", "UTF8");
32 ret
= RkInitialize("");
// Scratch buffer of BUFSIZE bytes that receives the dictionary list below.
36 buf
= (char *)malloc(sizeof(char) * BUFSIZE
);
40 /* Create Context ID */
41 context_id
= RkCreateContext();
43 /* Load Dictionaries */
44 dicnum
= RkGetDicList(context_id
, buf
, BUFSIZE
); // Find all useable dictionaries
// Mount each of the dicnum dictionaries by name; p is stepped past one
// NUL-terminated name per iteration (p += strlen(p) + 1).
// NOTE(review): the initialisation of p and the condition guarding the
// "Error loading:" message are not visible here - presumably p starts at buf
// and the message is printed when RkMountDic reports failure; confirm.
46 for(int i
= 0; i
< dicnum
; i
++) {
47 ret
= RkMountDic(context_id
, p
, 0); // Mount a dictionary
49 std::cout
<< "Error loading:" << p
<< std::endl
;
51 p
+= (strlen(p
) + 1); // Move to next dictionary name
57 //std::cout << "Init Canna OK." << endl;
// Destructor: closes the Canna working context identified by context_id
// (the id obtained from RkCreateContext() in the constructor).
// NOTE(review): no iconv_close() for the `icon` descriptor opened in the
// constructor is visible in this listing - confirm it is released.
60 CCannaConversionHelper::~CCannaConversionHelper() {
61 RkCloseContext(context_id
); // Close working context
62 //std::cout << "Finalizing Canna OK." << endl;
// Converts a UTF-8 source string into a tree of Kanji candidates using Canna.
//   strSource - input text; its bytes are fed to iconv as UTF8.
//   pRoot     - out-parameter; receives pDummyRoot->GetChild() once all
//               phrases have been processed.
// Visible steps: convert the input to EUC-JP, ask Canna to split it into
// phrases (RkBgnBun), then walk the phrases from last to first, converting
// each phrase's candidates back to UTF-8 and handing them to
// ProcessCandidate() together with the subtree built for the later phrases.
// NOTE(review): the statement executed for an empty strSource and the
// function's return statements are not visible in this listing.
// NOTE(review): no free() for buf/str_utf8, no iconv_close() for either
// descriptor assigned to cd, and no release of pDummyRoot is visible in this
// listing - confirm these resources are released on every path.
70 bool CCannaConversionHelper::Convert(const std::string
&strSource
, SCENode
** pRoot
) {
72 if(strSource
.size() == 0)
// The const is cast away only so the pointer can be passed to iconv's char** parameter.
76 char *pQuery
= (char *)strSource
.c_str();
// buf receives the EUC-JP form of the input and later Canna's candidate
// list; str_utf8 receives each candidate converted back to UTF-8.
78 unsigned char *buf
= (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE
);
79 unsigned char *str_utf8
= (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE
);
81 char *inbuf
= (char *)pQuery
;
82 char *outbuf
= (char *)buf
;
83 size_t inbytesleft
= strSource
.length();
84 size_t outbytesleft
= BUFSIZE
;
88 // NOTE: As far as I can tell, this requires NFC rather than NFD
89 // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
// UTF8 -> EUC-JP conversion of the query into buf.
// NOTE(review): iconv() advances inbuf/outbuf; the code that terminates the
// output and re-points inbuf before RkBgnBun is not visible here - presumably
// inbuf is reset to buf; confirm. The iconv() return value is not checked in
// the visible code.
91 iconv_t cd
= iconv_open("EUC-JP", "UTF8");
92 iconv(cd
, &inbuf
, &inbytesleft
, &outbuf
, &outbytesleft
);
97 /* Divide given string into phrases */
98 nbun
= RkBgnBun(context_id
, // context ID
99 inbuf
, // given string
100 strlen(inbuf
), // length of given string
101 (RK_XFER
<< RK_XFERBITS
) | RK_KFER
); // mode
105 // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
// NOTE(review): the condition guarding this message is not visible here -
// presumably a failure value of nbun; confirm.
107 std::cerr
<< "Error - Canna conversion failed, possibly could not connect to server." << std::endl
;
// Placeholder root; its child pointer holds the subtree built so far.
110 SCENode
*pDummyRoot(new SCENode
);
111 // pDummyRoot->pChild = NULL;
113 /* Convert each phrase into Kanji */
// cd is reassigned to an EUC-JP -> UTF8 descriptor for the candidate strings.
114 cd
= iconv_open("UTF8", "EUC-JP");
// Phrases are visited from the last (nbun-1) down to 0, so the subtree built
// for the following phrases is available as pTail for the current phrase.
115 for(int i
= nbun
-1; i
>= 0; --i
) {
116 SCENode
*pTail
= pDummyRoot
->GetChild();
// Detach the previous subtree so this phrase's candidates start a new child list.
121 pDummyRoot
->SetChild(NULL
);
123 RkGoTo(context_id
, i
); // Move to a specific phrase
124 int len
= RkGetKanjiList(context_id
, buf
, BUFSIZE
); // Get a list of Kanji candidates
126 // Use UTF-8 for Dasher
127 char *p
= (char *)buf
;
129 std::vector
<std::string
> vCandidates
;
// One iteration per candidate; p is stepped past one NUL-terminated entry
// each time (p += strlen(p) + 1 below).
// NOTE(review): the statement that points inbuf at the current candidate is
// not visible here - presumably inbuf = p; confirm.
131 for(int j
= 0; j
< len
; ++j
) {
133 //std::cout << "Canna:" << j << "[" << inbuf << "] ";
134 outbuf
= (char *)str_utf8
;
135 inbytesleft
= strlen((char *)inbuf
);
136 outbytesleft
= BUFSIZE
;
137 //for( int k=0; k<20; k++ ){
138 // std::cout << (int) inbuf[k] << " ";
140 //std::cout << inbytesleft << " ->";
141 iconv(cd
, &inbuf
, &inbytesleft
, &outbuf
, &outbytesleft
);
// Only non-empty converted strings are kept.
// NOTE(review): the NUL termination of str_utf8 is not visible here - confirm.
144 if(strlen((char *)str_utf8
))
145 vCandidates
.push_back((char *)str_utf8
);
147 //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
148 p
+= (strlen(p
) + 1);
// Candidates are handed over in reverse of the order Canna listed them.
151 for(std::vector
<std::string
>::reverse_iterator
it(vCandidates
.rbegin()); it
!= vCandidates
.rend(); ++it
) {
152 ProcessCandidate(*it
, pDummyRoot
, pTail
);
159 RkEndBun(context_id
, 0); // Close phrase division
// Hand the built tree (the dummy root's child) back to the caller.
165 *pRoot
= pDummyRoot
->GetChild();
// Inserts one candidate string into the node tree below pRoot, one UTF-8
// character per tree level.
//   strCandidate - candidate text; split into symbols by UTF-8 lead byte.
//   pRoot        - node under which the candidate is inserted.
//   pTail        - if non-null, attached as the child of a newly created node
//                  when iIdx has reached the end of strCandidate.
// For each symbol the current node's child list is searched for a node whose
// pszConversion equals the symbol; if none is found a new SCENode is created
// and placed at the front of that child list. The walk then descends into the
// matched or newly created child.
// NOTE(review): the declarations of iIdx and iLength, the statements assigning
// iLength in each branch, the advance of iIdx and the action taken on a match
// are not visible in this listing.
174 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate
, SCENode
*pRoot
, SCENode
*pTail
) {
176 SCENode
*pCurrentNode(pRoot
);
180 // TODO: Need phrase-based conversion
181 while(iIdx
< strCandidate
.size()) {
185 // TODO: Really dodgy UTF-8 parser - find a library routine to do this
// Lead-byte classification by bit mask: 0xxxxxxx, 110xxxxx, 1110xxxx,
// 11110xxx, 111110xx. Presumably each branch sets iLength to the matching
// sequence length (1 to 5 bytes) - confirm.
186 if((static_cast<int>(strCandidate
[iIdx
]) & 0x80) == 0)
188 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xE0) == 0xC0)
190 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xF0) == 0xE0)
192 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xF8) == 0xF0)
194 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xFC) == 0xF8)
// The bytes of the current character.
199 std::string
strSymbol(strCandidate
.substr(iIdx
, iLength
));
203 SCENode
*pCurrentChild(pCurrentNode
->GetChild()); // TODO: Initialise
// Scan the sibling list for an existing node carrying this symbol.
// NOTE(review): the statement executed on a match is not visible - presumably
// a break that leaves pCurrentChild pointing at the match; confirm.
205 while(pCurrentChild
) {
206 if(strSymbol
== pCurrentChild
->pszConversion
)
208 pCurrentChild
= pCurrentChild
->GetNext();
211 if(!pCurrentChild
) { // Need a new child
212 pCurrentChild
= new SCENode
;
// The new node goes to the front: existing children become its siblings.
213 if(pCurrentNode
->GetChild())
214 pCurrentChild
->SetNext(pCurrentNode
->GetChild());
// The iIdx >= size() test implies iIdx has been advanced past the current
// symbol by this point (the advance is not visible in this listing).
215 if(pTail
&& (iIdx
>= strCandidate
.size()))
216 pCurrentChild
->SetChild(pTail
);
// The node holds its own heap-allocated copy of the symbol.
218 pCurrentChild
->pszConversion
= new char[strSymbol
.size() + 1];
219 strcpy(pCurrentChild
->pszConversion
, strSymbol
.c_str());
221 pCurrentNode
->SetChild(pCurrentChild
);
// NOTE(review): presumably SetChild() takes its own reference, so the
// reference from `new` is dropped here - confirm against SCENode.
222 pCurrentChild
->Unref();
// Descend one level for the next symbol.
225 pCurrentNode
= pCurrentChild
;
// Assigns NodeSize to the sibling list starting at *pStart and, depending on
// iOrder, relinks the siblings by decoded character code.
//   pStart        - address of the first sibling; read here, and used as the
//                   list head when the first two nodes are swapped.
//   context       - not referenced in the visible lines.
//   normalization - total that the sizes are derived from.
//   uniform       - not referenced in the visible lines.
//   iNChildren    - number of siblings; used as divisor and loop bound.
// Size modes (iType):
//   0 - each node gets (remaining total) / (remaining count).
//   1 - NodeSize = normalization / (iK * (1 + iN)), iK = sum_{k=1..n} 1/(1+k).
//   2 - NodeSize = normalization / (iK * (1 + iN) * iN),
//       iK = sum_{k=1..n} 1/((1+k)*k).
// NOTE(review): the loops over pNode, the declarations/updates of iN, iK,
// iCheck, iRemaining, pTmp_a and pTmp_b, and the branch containing
// DASHER_ASSERT(false) are not visible in this listing; iN is presumably the
// 1-based position of the node - confirm.
231 void CCannaConversionHelper::AssignSizes(SCENode
**pStart
, Dasher::CLanguageModel::Context context
, long normalization
, int uniform
, int iNChildren
) {
233 SCENode
*pNode(*pStart
);
236 if(this->iType
== 0) {
237 int iRemaining
= iNChildren
;
// The long normalization is narrowed to int here.
238 int iLeft
= normalization
;
// Give this node an equal share of what is left, then deduct that share.
241 pNode
->NodeSize
= iLeft
/ iRemaining
;
242 iLeft
-= pNode
->NodeSize
;
243 iCheck
+= pNode
->NodeSize
;
245 pNode
= pNode
->GetNext();
249 else if(this->iType
== 1) {
// iK accumulates 1/(1+k) for k = 1..iNChildren and scales the sizes below.
252 for(int k
= 1; k
<= iNChildren
; k
++) iK
+= 1/(1+(double)k
);
254 pNode
->NodeSize
= normalization
/ (iK
*(1+iN
));
255 pNode
= pNode
->GetNext();
259 else if(this->iType
== 2){
// iK accumulates 1/((1+k)*k) for k = 1..iNChildren and scales the sizes below.
262 for(int k
= 1; k
<= iNChildren
; k
++) iK
+= 1/((1+(double)k
)*(double)k
);
264 pNode
->NodeSize
= normalization
/ (iK
*(1+iN
)*iN
);
265 pNode
= pNode
->GetNext();
// Presumably reached only for an unrecognised iType - confirm.
271 DASHER_ASSERT(false);
275 int iCode_a
, iCode_b
;
281 //Change the order of Kanji candidates//
// NOTE(review): as shown, this guard admits only iOrder > 1, yet the body
// tests iOrder == 1 (the iUTF8Decode branches). If those tests are nested
// inside this guard they can never be taken - confirm whether the intended
// condition was iOrder > 0.
282 if(this->iOrder
> 1 ){
// Up to iNChildren passes; each visible comparison looks at adjacent nodes
// and relinks them when the first decodes to a larger code than the second.
283 for(int i
=1; i
<=iNChildren
;i
++){
// Head case: compares the current node with its immediate successor.
285 if(pNode
-> GetNext()) {
286 SCENode
*pNext
= pNode
->GetNext();
287 SCENode
*pNext2
= pNext
->GetNext();
288 if (this->iOrder
== 1){
289 iCode_a
= this->iUTF8Decode(pNode
->pszConversion
);
290 iCode_b
= this->iUTF8Decode(pNext
->pszConversion
);
292 else if (this->iOrder
== 2){
293 iCode_a
= this->iShiftJISDecode(pNode
->pszConversion
);
294 iCode_b
= this->iShiftJISDecode(pNext
->pszConversion
);
298 if(iCode_a
> iCode_b
){
// Extra reference on pNext2 for the duration of the relinking; presumably
// SetNext() releases the node it replaces - confirm against SCENode.
300 if(pNext2
) pNext2
->Ref();
// pNext is linked in front of the old head, which now points at pNext2.
// NOTE(review): the update of *pStart to the new head is not visible here.
303 pNext
->SetNext(*pStart
);
304 (*pStart
)->SetNext(pNext2
);
308 if(pNext2
) pNext2
->Unref();
// Interior case: compares the two nodes that follow pNode.
318 if(pNode
->GetNext() && (pNode
->GetNext())->GetNext()) {
319 pTmp_a
= pNode
->GetNext();
320 pTmp_b
= pTmp_a
->GetNext();
322 if (this->iOrder
== 1){
323 iCode_a
= this->iUTF8Decode(pTmp_a
->pszConversion
);
324 iCode_b
= this->iUTF8Decode(pTmp_b
->pszConversion
);
326 else if (this->iOrder
== 2){
327 iCode_a
= this->iShiftJISDecode(pTmp_a
->pszConversion
);
328 iCode_b
= this->iShiftJISDecode(pTmp_b
->pszConversion
);
331 if(iCode_a
> iCode_b
){
// pTmp is the node after the pair; it gets an extra reference while the
// pair is relinked.
332 SCENode
*pTmp
= pTmp_b
->GetNext();
333 if(pTmp
) pTmp
->Ref();
// Relink pNode -> a -> b -> pTmp as pNode -> b -> a -> pTmp.
336 pNode
->SetNext(pTmp_b
);
337 pTmp_b
->SetNext(pTmp_a
);
338 pTmp_a
->SetNext(pTmp
);
341 if(pTmp
) pTmp
->Unref();
344 pNode
= pNode
->GetNext();