3 #include "CannaConversionHelper.h"
5 #include <canna/jrkanji.h>
11 #include <iostream> //For testing 23 June 2005
// Constructor: initialises the Canna RK library, creates a conversion
// context and mounts every dictionary that RkGetDicList reports for it.
// NOTE(review): several statements of this function (local declarations,
// error checks, closing braces) are not visible in this view; the comments
// below describe only the statements that are.
14 CCannaConversionHelper::CCannaConversionHelper() {
// Initialise the RK library with an empty string argument; the result is
// stored in ret.
22 ret
= RkInitialize("");
// Scratch buffer of BUFSIZE chars; it receives the dictionary list below.
// NOTE(review): no matching free() is visible here - confirm it is released.
26 buf
= (char *)malloc(sizeof(char) * BUFSIZE
);
30 /* Create Context ID */
31 context_id
= RkCreateContext();
33 /* Load Dictionaries */
34 dicnum
= RkGetDicList(context_id
, buf
, BUFSIZE
); // Find all useable dictionaries
// Mount dicnum dictionaries, one per iteration, using the name p points at.
// NOTE(review): the initialisation of p is not visible; presumably it starts
// at buf - confirm.
36 for(int i
= 0; i
< dicnum
; i
++) {
37 ret
= RkMountDic(context_id
, p
, 0); // Mount a dictionary
// Diagnostic printed to std::cout with the dictionary name.
// NOTE(review): the condition guarding this message is not visible;
// presumably it tests ret - confirm.
39 std::cout
<< "Error loading:" << p
<< std::endl
;
// Skip the current name and the byte that follows it (strlen(p) + 1).
41 p
+= (strlen(p
) + 1); // Move to next dictionary name
47 //std::cout << "Init Canna OK." << endl;
// Destructor: closes the RK context stored in context_id.
// NOTE(review): no other library shutdown call is visible in this view -
// confirm whether further teardown happens elsewhere.
50 CCannaConversionHelper::~CCannaConversionHelper() {
51 RkCloseContext(context_id
); // Close working context
52 //std::cout << "Finalizing Canna OK." << endl;
// Converts strSource with Canna and publishes the resulting candidate tree
// through *pRoot.
//
// Visible steps:
//   1. transcode the source from "UTF8" to "EUC-JP" with iconv;
//   2. split it into phrases with RkBgnBun (count stored in nbun);
//   3. for each phrase, last to first, fetch the candidate list with
//      RkGetKanjiList, transcode candidates back to UTF-8 and add them under
//      a dummy root node via ProcessCandidate;
//   4. end the session with RkEndBun and store pDummyRoot->GetChild() in
//      *pRoot.
//
// Parameters:
//   strSource  - text to convert.
//   pRoot      - out-parameter receiving the first child of the dummy root.
//   childCount - not referenced in the visible lines.
//   CMid       - not referenced in the visible lines.
//
// NOTE(review): the return statements, several local declarations and all
// closing braces are not visible in this view, so the exact return values
// and statement nesting cannot be stated here.
58 bool CCannaConversionHelper::Convert(const std::string
&strSource
, SCENode
** pRoot
, int * childCount
, int CMid
) {
// Empty-input check. NOTE(review): the statement it guards is not visible;
// presumably an early return - confirm.
60 if(strSource
.size() == 0)
// const is cast away from c_str() because iconv takes a non-const char **.
64 char *pQuery
= (char *)strSource
.c_str();
// Two BUFSIZE-byte work buffers: buf takes the EUC-JP text (and later the
// candidate list), str_utf8 takes candidates converted back to UTF-8.
// NOTE(review): no free() for either buffer is visible here - confirm.
66 unsigned char *buf
= (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE
);
67 unsigned char *str_utf8
= (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE
);
// iconv cursors and remaining-byte counters for the UTF-8 -> EUC-JP pass.
69 char *inbuf
= (char *)pQuery
;
70 char *outbuf
= (char *)buf
;
71 size_t inbytesleft
= strSource
.length();
72 size_t outbytesleft
= BUFSIZE
;
76 // NOTE: As far as I can tell, this requires NFC rather than NFD
77 // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
// NOTE(review): the results of iconv_open and iconv are not checked in the
// visible lines.
79 iconv_t cd
= iconv_open("EUC-JP", "UTF8");
80 iconv(cd
, &inbuf
, &inbytesleft
, &outbuf
, &outbytesleft
);
// NOTE(review): iconv advances the pointers it is given, so inbuf is passed
// to RkBgnBun below after being moved. Any termination of the output or
// re-pointing of inbuf at buf would sit in lines not visible here - confirm.
85 /* Divide given string into phrases */
86 nbun
= RkBgnBun(context_id
, // context ID
87 inbuf
, // given string
88 strlen(inbuf
), // length of given string
89 (RK_XFER
<< RK_XFERBITS
) | RK_KFER
); // mode
93 // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
// NOTE(review): the condition guarding this message is not visible.
95 std::cerr
<< "Error - Canna conversion failed, possibly could not connect to server." << std::endl
;
// Placeholder root node; candidates are added beneath it and only its child
// is handed back through *pRoot.
98 SCENode
*pDummyRoot(new SCENode
);
99 // pDummyRoot->pChild = NULL;
101 /* Convert each phrase into Kanji */
// cd is reassigned for the reverse direction (EUC-JP -> UTF-8).
// NOTE(review): no iconv_close() of either descriptor is visible - confirm.
102 cd
= iconv_open("UTF8", "EUC-JP");
// Phrases are visited from the last (nbun-1) to the first (0).
103 for(int i
= nbun
-1; i
>= 0; --i
) {
// Whatever has been built under the dummy root so far is passed to
// ProcessCandidate as the tail for this phrase's candidates.
104 SCENode
*pTail
= pDummyRoot
->GetChild();
109 RkGoTo(context_id
, i
); // Move to a specific phrase
110 int len
= RkGetKanjiList(context_id
, buf
, BUFSIZE
); // Get a list of Kanji candidates
112 // Use UTF-8 for Dasher
113 char *p
= (char *)buf
;
// UTF-8 candidates for this phrase, in the order the list yields them.
115 std::vector
<std::string
> vCandidates
;
117 for(int j
= 0; j
< len
; ++j
) {
119 //std::cout << "Canna:" << j << "[" << inbuf << "] ";
// Reset the output cursor and both byte counters before each conversion.
// NOTE(review): the conversion reads from inbuf while the list cursor being
// advanced is p; an assignment such as inbuf = p is not visible - confirm.
120 outbuf
= (char *)str_utf8
;
121 inbytesleft
= strlen((char *)inbuf
);
122 outbytesleft
= BUFSIZE
;
123 //for( int k=0; k<20; k++ ){
124 // std::cout << (int) inbuf[k] << " ";
126 //std::cout << inbytesleft << " ->";
127 iconv(cd
, &inbuf
, &inbytesleft
, &outbuf
, &outbytesleft
);
// Only candidates with a non-zero strlen are kept.
// NOTE(review): termination of str_utf8 after iconv is not visible here.
130 if(strlen((char *)str_utf8
))
131 vCandidates
.push_back((char *)str_utf8
);
133 //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
// Step p over the current entry and the byte that follows it.
134 p
+= (strlen(p
) + 1);
// Candidates are fed to ProcessCandidate in reverse order of collection.
137 for(std::vector
<std::string
>::reverse_iterator
it(vCandidates
.rbegin()); it
!= vCandidates
.rend(); ++it
) {
138 ProcessCandidate(*it
, pDummyRoot
, pTail
);
144 RkEndBun(context_id
, 0); // Close phrase division
// Hand the tree (minus the dummy root) to the caller.
// NOTE(review): no release of pDummyRoot is visible here - confirm.
150 *pRoot
= pDummyRoot
->GetChild();
// Adds one candidate string to the tree under pRoot, one symbol per level.
//
// For each symbol of strCandidate the children of the current node are
// searched (GetChild / GetNext) for a node whose pszConversion equals the
// symbol. If none is found a new SCENode is created, given a copy of the
// symbol and installed as the current node's child. The walk then descends
// into that child.
//
// Parameters:
//   strCandidate - candidate text; the existing comment below describes the
//                  parsing as UTF-8.
//   pRoot        - node under which the candidate is added.
//   pTail        - may be null; attached as the child of a newly created
//                  node when iIdx >= strCandidate.size().
//
// NOTE(review): the declarations of iIdx and iLength, the body of every
// length branch, the advance of iIdx and all closing braces are not visible
// in this view, so statement nesting below cannot be confirmed.
159 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate
, SCENode
*pRoot
, SCENode
*pTail
) {
161 SCENode
*pCurrentNode(pRoot
);
165 // TODO: Need phrase-based conversion
166 while(iIdx
< strCandidate
.size()) {
170 // TODO: Really dodgy UTF-8 parser - find a library routine to do this
// Each test matches one lead-byte bit pattern: 0xxxxxxx, 110xxxxx, 1110xxxx,
// 11110xxx and 111110xx. NOTE(review): the length chosen in each branch is
// not visible; presumably 1 to 5 bytes respectively - confirm.
171 if((static_cast<int>(strCandidate
[iIdx
]) & 0x80) == 0)
173 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xE0) == 0xC0)
175 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xF0) == 0xE0)
177 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xF8) == 0xF0)
179 else if((static_cast<int>(strCandidate
[iIdx
]) & 0xFC) == 0xF8)
// The bytes of the current symbol.
184 std::string
strSymbol(strCandidate
.substr(iIdx
, iLength
));
// Look for an existing child of the current node carrying this symbol.
188 SCENode
*pCurrentChild(pCurrentNode
->GetChild()); // TODO: Initialise
190 while(pCurrentChild
) {
// NOTE(review): the statement guarded by this match is not visible;
// presumably it leaves the search loop - confirm.
191 if(strSymbol
== pCurrentChild
->pszConversion
)
193 pCurrentChild
= pCurrentChild
->GetNext();
196 if(!pCurrentChild
) { // Need a new child
197 pCurrentChild
= new SCENode
;
// Chain the current node's existing first child after the new node.
198 if(pCurrentNode
->GetChild())
199 pCurrentChild
->SetNext(pCurrentNode
->GetChild());
// NOTE(review): this test relies on iIdx having been advanced past the
// symbol by a line that is not visible here - confirm.
200 if(pTail
&& (iIdx
>= strCandidate
.size()))
201 pCurrentChild
->SetChild(pTail
);
// Give the node its own NUL-terminated copy of the symbol.
203 pCurrentChild
->pszConversion
= new char[strSymbol
.size() + 1];
204 strcpy(pCurrentChild
->pszConversion
, strSymbol
.c_str());
// Install the new node as the current node's child, then Unref it.
// NOTE(review): presumably this drops the creating reference once the
// parent holds the node - confirm against SCENode.
206 pCurrentNode
->SetChild(pCurrentChild
);
207 pCurrentChild
->Unref();
// Descend one level for the next symbol.
210 pCurrentNode
= pCurrentChild
;
// Assigns NodeSize to the nodes reached from pStart through GetNext().
// Each node receives iLeft / iRemaining, where iLeft starts at normalization
// and is reduced by every size handed out.
//
// Parameters:
//   pStart        - first node to size.
//   context       - not referenced in the visible lines.
//   normalization - total to distribute (copied into an int).
//   uniform       - not referenced in the visible lines.
//   iNChildren    - initial value of the divisor iRemaining.
//
// NOTE(review): the loop construct, the declaration of iCheck, any update of
// iRemaining and the end of the function are not visible in this view.
214 void CCannaConversionHelper::AssignSizes(SCENode
*pStart
, Dasher::CLanguageModel::Context context
, long normalization
, int uniform
, int iNChildren
) {
216 SCENode
*pNode(pStart
);
218 int iRemaining
= iNChildren
;
// NOTE(review): long is narrowed to int here.
219 int iLeft
= normalization
;
// Integer share of what is left for this node.
// NOTE(review): no guard against iRemaining == 0 is visible - confirm.
224 pNode
->NodeSize
= iLeft
/ iRemaining
;
225 iLeft
-= pNode
->NodeSize
;
// Running total of the sizes handed out.
227 iCheck
+= pNode
->NodeSize
;
230 pNode
= pNode
->GetNext();