Updated German translation
[dasher.git] / Src / DasherCore / CannaConversionHelper.cpp
blob5a55a086146ae8b6829f7295ba4a20f37d48da81
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include "CannaConversionHelper.h"
7 #include <canna/jrkanji.h>
8 #include <canna/RK.h>
9 #include <iconv.h>
11 #define BUFSIZE 10240
13 #include <iostream> //For testing 23 June 2005
15 using namespace Dasher;
17 CCannaConversionHelper::CCannaConversionHelper(CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet, int Type, int Order)
18 : CConversionManager(pNCManager, pAlphabet) {
20 int ret;
21 char *buf;
22 int dicnum;
24 iType = Type;// 0 = uniform, 1 = 1/(n+1),2 = 1/(n+1)/n
25 iOrder = Order; // 0 = canna, 1 = Unicode , 2 = Shift_JIS
27 icon = iconv_open("SJIS", "UTF8");
29 IsInit = 0;
31 /* Initialize */
32 ret = RkInitialize("");
33 if(ret < 0) {
34 return;
36 buf = (char *)malloc(sizeof(char) * BUFSIZE);
37 if(buf == NULL)
38 return;
40 /* Create Context ID */
41 context_id = RkCreateContext();
43 /* Load Dictionaries */
44 dicnum = RkGetDicList(context_id, buf, BUFSIZE); // Find all useable dictionaries
45 char *p = buf;
46 for(int i = 0; i < dicnum; i++) {
47 ret = RkMountDic(context_id, p, 0); // Mount a dictionary
48 if(ret)
49 std::cout << "Error loading:" << p << std::endl;
51 p += (strlen(p) + 1); // Move to next dictionary name
54 free(buf);
55 IsInit = 1;
57 //std::cout << "Init Canna OK." << endl;
60 CCannaConversionHelper::~CCannaConversionHelper() {
61 RkCloseContext(context_id); // Close working context
62 //std::cout << "Finalizing Canna OK." << endl;
64 /* exit */
65 RkFinalize();
66 iconv_close(icon);
70 bool CCannaConversionHelper::Convert(const std::string &strSource, SCENode ** pRoot) {
72 if(strSource.size() == 0)
73 return false;
75 int nbun;
76 char *pQuery = (char *)strSource.c_str();
78 unsigned char *buf = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
79 unsigned char *str_utf8 = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
81 char *inbuf = (char *)pQuery;
82 char *outbuf = (char *)buf;
83 size_t inbytesleft = strSource.length();
84 size_t outbytesleft = BUFSIZE;
86 // Use EUC for Canna
88 // NOTE: As far as I can tell, this requires NFC rather than NFD
89 // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
91 iconv_t cd = iconv_open("EUC-JP", "UTF8");
92 iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
93 *outbuf = '\0';
94 inbuf = (char *)buf;
95 iconv_close(cd);
97 /* Divide given string into phrases */
98 nbun = RkBgnBun(context_id, // context ID
99 inbuf, // given string
100 strlen(inbuf), // length of given string
101 (RK_XFER << RK_XFERBITS) | RK_KFER); // mode
104 if(nbun == -1) {
105 // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
107 std::cerr << "Error - Canna conversion failed, possibly could not connect to server." << std::endl;
110 SCENode *pDummyRoot(new SCENode);
111 // pDummyRoot->pChild = NULL;
113 /* Convert each phrase into Kanji */
114 cd = iconv_open("UTF8", "EUC-JP");
115 for(int i = nbun-1; i >= 0; --i) {
116 SCENode *pTail = pDummyRoot->GetChild();
118 if(pTail)
119 pTail->Ref();
121 pDummyRoot->SetChild(NULL);
123 RkGoTo(context_id, i); // Move to a specific phrase
124 int len = RkGetKanjiList(context_id, buf, BUFSIZE); // Get a list of Kanji candidates
126 // Use UTF-8 for Dasher
127 char *p = (char *)buf;
129 std::vector<std::string> vCandidates;
131 for(int j = 0; j < len; ++j) {
132 inbuf = p;
133 //std::cout << "Canna:" << j << "[" << inbuf << "] ";
134 outbuf = (char *)str_utf8;
135 inbytesleft = strlen((char *)inbuf);
136 outbytesleft = BUFSIZE;
137 //for( int k=0; k<20; k++ ){
138 // std::cout << (int) inbuf[k] << " ";
140 //std::cout << inbytesleft << " ->";
141 iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
142 *outbuf = '\0';
144 if(strlen((char *)str_utf8))
145 vCandidates.push_back((char *)str_utf8);
147 //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
148 p += (strlen(p) + 1);
151 for(std::vector<std::string>::reverse_iterator it(vCandidates.rbegin()); it != vCandidates.rend(); ++it) {
152 ProcessCandidate(*it, pDummyRoot, pTail);
155 if(pTail)
156 pTail->Unref();
159 RkEndBun(context_id, 0); // Close phrase division
161 iconv_close(cd);
162 free(buf);
163 free(str_utf8);
165 *pRoot = pDummyRoot->GetChild();
167 (*pRoot)->Ref();
168 pDummyRoot->Unref();
170 return true;
174 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate, SCENode *pRoot, SCENode *pTail) {
176 SCENode *pCurrentNode(pRoot);
178 int iIdx(0);
180 // TODO: Need phrase-based conversion
181 while(iIdx < strCandidate.size()) {
183 int iLength;
185 // TODO: Really dodgy UTF-8 parser - find a library routine to do this
186 if((static_cast<int>(strCandidate[iIdx]) & 0x80) == 0)
187 iLength = 1;
188 else if((static_cast<int>(strCandidate[iIdx]) & 0xE0) == 0xC0)
189 iLength = 2;
190 else if((static_cast<int>(strCandidate[iIdx]) & 0xF0) == 0xE0)
191 iLength = 3;
192 else if((static_cast<int>(strCandidate[iIdx]) & 0xF8) == 0xF0)
193 iLength = 4;
194 else if((static_cast<int>(strCandidate[iIdx]) & 0xFC) == 0xF8)
195 iLength = 5;
196 else
197 iLength = 6;
199 std::string strSymbol(strCandidate.substr(iIdx, iLength));
201 iIdx += iLength;
203 SCENode *pCurrentChild(pCurrentNode->GetChild()); // TODO: Initialise
205 while(pCurrentChild) {
206 if(strSymbol == pCurrentChild->pszConversion)
207 break;
208 pCurrentChild = pCurrentChild->GetNext();
211 if(!pCurrentChild) { // Need a new child
212 pCurrentChild = new SCENode;
213 if(pCurrentNode->GetChild())
214 pCurrentChild->SetNext(pCurrentNode->GetChild());
215 if(pTail && (iIdx >= strCandidate.size()))
216 pCurrentChild->SetChild(pTail);
218 pCurrentChild->pszConversion = new char[strSymbol.size() + 1];
219 strcpy(pCurrentChild->pszConversion, strSymbol.c_str());
221 pCurrentNode->SetChild(pCurrentChild);
222 pCurrentChild->Unref();
225 pCurrentNode = pCurrentChild;
231 void CCannaConversionHelper::AssignSizes(SCENode **pStart, Dasher::CLanguageModel::Context context, long normalization, int uniform, int iNChildren) {
233 SCENode *pNode(*pStart);
236 if(this->iType == 0) {
237 int iRemaining = iNChildren;
238 int iLeft = normalization;
239 int iCheck(0);
240 while(pNode) {
241 pNode->NodeSize = iLeft / iRemaining;
242 iLeft -= pNode->NodeSize;
243 iCheck += pNode->NodeSize;
244 --iRemaining;
245 pNode = pNode->GetNext();
249 else if(this->iType == 1) {
250 int iN = 1;
251 double iK = 0;
252 for(int k = 1; k <= iNChildren; k++) iK += 1/(1+(double)k);
253 while(pNode) {
254 pNode->NodeSize = normalization / (iK*(1+iN));
255 pNode = pNode->GetNext();
256 ++iN;
259 else if(this->iType == 2){
260 int iN = 1;
261 double iK = 0;
262 for(int k = 1; k <= iNChildren; k++) iK += 1/((1+(double)k)*(double)k);
263 while(pNode) {
264 pNode->NodeSize = normalization / (iK*(1+iN)*iN);
265 pNode = pNode->GetNext();
266 ++iN;
270 else {
271 DASHER_ASSERT(false);
275 int iCode_a, iCode_b;
276 SCENode *pTmp_a;
277 SCENode *pTmp_b;
279 (*pStart)->Ref();
281 //Change the order of Kanji candidates//
282 if(this->iOrder > 1 ){
283 for(int i=1; i<=iNChildren;i++){
284 pNode = *pStart;
285 if(pNode -> GetNext()) {
286 SCENode *pNext = pNode->GetNext();
287 SCENode *pNext2 = pNext->GetNext();
288 if (this->iOrder == 1){
289 iCode_a = this->iUTF8Decode(pNode->pszConversion);
290 iCode_b = this->iUTF8Decode(pNext->pszConversion);
292 else if (this->iOrder == 2){
293 iCode_a = this->iShiftJISDecode(pNode->pszConversion);
294 iCode_b = this->iShiftJISDecode(pNext->pszConversion);
298 if(iCode_a > iCode_b){
300 if(pNext2) pNext2->Ref();
302 pNext->Ref();
303 pNext->SetNext(*pStart);
304 (*pStart)->SetNext(pNext2);
306 (*pStart)->Unref();
308 if(pNext2) pNext2->Unref();
310 *pStart = pNext;
312 pNode = *pStart;
317 while(pNode){
318 if(pNode->GetNext() && (pNode->GetNext())->GetNext()) {
319 pTmp_a = pNode->GetNext();
320 pTmp_b = pTmp_a->GetNext();
322 if (this->iOrder == 1){
323 iCode_a = this->iUTF8Decode(pTmp_a->pszConversion);
324 iCode_b = this->iUTF8Decode(pTmp_b->pszConversion);
326 else if (this->iOrder == 2){
327 iCode_a = this->iShiftJISDecode(pTmp_a->pszConversion);
328 iCode_b = this->iShiftJISDecode(pTmp_b->pszConversion);
331 if(iCode_a > iCode_b){
332 SCENode *pTmp = pTmp_b->GetNext();
333 if(pTmp) pTmp->Ref();
334 pTmp_a->Ref();
336 pNode->SetNext(pTmp_b);
337 pTmp_b->SetNext(pTmp_a);
338 pTmp_a->SetNext(pTmp);
340 pTmp_a->Unref();
341 if(pTmp) pTmp->Unref();
344 pNode = pNode->GetNext();