Src/DasherCore/CannaConversionHelper.cpp

   1 #ifdef HAVE_CONFIG_H
   2 #include <config.h>
   3 #endif
   4
   5 #include "CannaConversionHelper.h"
   6
   7 #include <canna/jrkanji.h>
   8 #include <canna/RK.h>
   9 #include <iconv.h>
  10
  11 #define BUFSIZE 10240
  12
  13 #include <iostream>             //For testing 23 June 2005
  14
  15 using namespace Dasher;
  16
  17 CCannaConversionHelper::CCannaConversionHelper(CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet, int Type, int Order)
  18 : CConversionManager(pNCManager, pAlphabet) {
  19
  20   int ret;
  21   char *buf;
  22   int dicnum;
  23
  24   iType = Type;// 0 = uniform, 1 = 1/(n+1),2 = 1/(n+1)/n
  25   iOrder = Order; // 0 = canna, 1 = Unicode , 2 = Shift_JIS
  26
  27   icon = iconv_open("SJIS", "UTF8");
  28
  29   IsInit = 0;
  30
  31   /* Initialize */
  32   ret = RkInitialize("");
  33   if(ret < 0) {
  34     return;
  35   }
  36   buf = (char *)malloc(sizeof(char) * BUFSIZE);
  37   if(buf == NULL)
  38     return;
  39
  40   /* Create Context ID */
  41   context_id = RkCreateContext();
  42
  43   /* Load Dictionaries */
  44   dicnum = RkGetDicList(context_id, buf, BUFSIZE);      //      Find all useable dictionaries
  45   char *p = buf;
  46   for(int i = 0; i < dicnum; i++) {
  47     ret = RkMountDic(context_id, p, 0); //      Mount a dictionary
  48     if(ret)
  49       std::cout << "Error loading:" << p << std::endl;
  50
  51     p += (strlen(p) + 1);       //      Move to next dictionary name
  52   }
  53
  54   free(buf);
  55   IsInit = 1;
  56
  57   //std::cout << "Init Canna OK." << endl;
  58 }
  59
  60 CCannaConversionHelper::~CCannaConversionHelper() {
  61   RkCloseContext(context_id);   // Close working context
  62   //std::cout << "Finalizing Canna OK." << endl;
  63
  64   /* exit */
  65   RkFinalize();
  66   iconv_close(icon);
  67
  68 }
  69
  70 bool CCannaConversionHelper::Convert(const std::string &strSource, SCENode ** pRoot) {
  71
  72   if(strSource.size() == 0)
  73     return false;
  74
  75   int nbun;
  76   char *pQuery = (char *)strSource.c_str();
  77
  78   unsigned char *buf = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
  79   unsigned char *str_utf8 = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
  80
  81   char *inbuf = (char *)pQuery;
  82   char *outbuf = (char *)buf;
  83   size_t inbytesleft = strSource.length();
  84   size_t outbytesleft = BUFSIZE;
  85
  86   // Use EUC for Canna
  87   //
  88   // NOTE: As far as I can tell, this requires NFC rather than NFD
  89   // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
  90   //
  91   iconv_t cd = iconv_open("EUC-JP", "UTF8");
  92   iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  93   *outbuf = '\0';
  94   inbuf = (char *)buf;
  95   iconv_close(cd);
  96
  97   /* Divide given string into phrases */
  98   nbun = RkBgnBun(context_id,   // context ID
  99                   inbuf,        // given string
 100                   strlen(inbuf),        // length of given string
 101                   (RK_XFER << RK_XFERBITS) | RK_KFER);  // mode
 102
 103
 104   if(nbun == -1) {
 105     // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
 106
 107     std::cerr << "Error - Canna conversion failed, possibly could not connect to server." << std::endl;
 108   }
 109
 110   SCENode *pDummyRoot(new SCENode);
 111   //  pDummyRoot->pChild = NULL;
 112
 113   /* Convert each phrase into Kanji */
 114   cd = iconv_open("UTF8", "EUC-JP");
 115   for(int i = nbun-1; i >= 0; --i) {
 116     SCENode *pTail = pDummyRoot->GetChild();
 117
 118     if(pTail)
 119       pTail->Ref();
 120
 121     pDummyRoot->SetChild(NULL);
 122
 123     RkGoTo(context_id, i);      // Move to a specific phrase
 124     int len = RkGetKanjiList(context_id, buf, BUFSIZE); // Get a list of Kanji candidates
 125
 126     // Use UTF-8 for Dasher
 127     char *p = (char *)buf;
 128
 129     std::vector<std::string> vCandidates;
 130
 131     for(int j = 0; j < len; ++j) {
 132       inbuf = p;
 133       //std::cout << "Canna:" << j << "[" << inbuf << "] ";
 134       outbuf = (char *)str_utf8;
 135       inbytesleft = strlen((char *)inbuf);
 136       outbytesleft = BUFSIZE;
 137       //for( int k=0; k<20; k++ ){
 138       //      std::cout << (int) inbuf[k] << " ";
 139       //}
 140       //std::cout << inbytesleft << " ->";
 141       iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 142       *outbuf = '\0';
 143
 144       if(strlen((char *)str_utf8))
 145         vCandidates.push_back((char *)str_utf8);
 146
 147       //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
 148       p += (strlen(p) + 1);
 149     }
 150
 151     for(std::vector<std::string>::reverse_iterator it(vCandidates.rbegin()); it != vCandidates.rend(); ++it) {
 152       ProcessCandidate(*it, pDummyRoot, pTail);
 153     }
 154
 155     if(pTail)
 156       pTail->Unref();
 157   }
 158
 159   RkEndBun(context_id, 0);      // Close phrase division
 160
 161   iconv_close(cd);
 162   free(buf);
 163   free(str_utf8);
 164
 165   *pRoot = pDummyRoot->GetChild();
 166
 167   (*pRoot)->Ref();
 168   pDummyRoot->Unref();
 169
 170   return true;
 171 }
 172
 173
 174 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate, SCENode *pRoot, SCENode *pTail) {
 175
 176   SCENode *pCurrentNode(pRoot);
 177
 178   int iIdx(0);
 179
 180   // TODO: Need phrase-based conversion
 181   while(iIdx < strCandidate.size()) {
 182
 183     int iLength;
 184
 185     // TODO: Really dodgy UTF-8 parser - find a library routine to do this
 186     if((static_cast<int>(strCandidate[iIdx]) & 0x80) == 0)
 187       iLength = 1;
 188     else if((static_cast<int>(strCandidate[iIdx]) & 0xE0) == 0xC0)
 189       iLength = 2;
 190     else if((static_cast<int>(strCandidate[iIdx]) & 0xF0) == 0xE0)
 191       iLength = 3;
 192     else if((static_cast<int>(strCandidate[iIdx]) & 0xF8) == 0xF0)
 193       iLength = 4;
 194     else if((static_cast<int>(strCandidate[iIdx]) & 0xFC) == 0xF8)
 195       iLength = 5;
 196     else
 197       iLength = 6;
 198
 199     std::string strSymbol(strCandidate.substr(iIdx, iLength));
 200
 201     iIdx += iLength;
 202
 203     SCENode *pCurrentChild(pCurrentNode->GetChild()); // TODO: Initialise
 204
 205     while(pCurrentChild) {
 206       if(strSymbol == pCurrentChild->pszConversion)
 207         break;
 208       pCurrentChild = pCurrentChild->GetNext();
 209     }
 210
 211     if(!pCurrentChild) { // Need a new child
 212       pCurrentChild = new SCENode;
 213       if(pCurrentNode->GetChild())
 214         pCurrentChild->SetNext(pCurrentNode->GetChild());
 215       if(pTail && (iIdx >= strCandidate.size()))
 216         pCurrentChild->SetChild(pTail);
 217
 218       pCurrentChild->pszConversion = new char[strSymbol.size() + 1];
 219       strcpy(pCurrentChild->pszConversion, strSymbol.c_str());
 220
 221       pCurrentNode->SetChild(pCurrentChild);
 222       pCurrentChild->Unref();
 223     }
 224
 225     pCurrentNode = pCurrentChild;
 226   }
 227 }
 228
 229
 230
 231 void CCannaConversionHelper::AssignSizes(SCENode **pStart, Dasher::CLanguageModel::Context context, long normalization, int uniform, int iNChildren) {
 232
 233   SCENode *pNode(*pStart);
 234
 235
 236   if(this->iType == 0) {
 237     int iRemaining = iNChildren;
 238     int iLeft = normalization;
 239     int iCheck(0);
 240     while(pNode) {
 241       pNode->NodeSize = iLeft / iRemaining;
 242       iLeft -= pNode->NodeSize;
 243       iCheck += pNode->NodeSize;
 244       --iRemaining;
 245       pNode = pNode->GetNext();
 246     }
 247   }
 248
 249   else if(this->iType == 1) {
 250     int iN = 1;
 251     double iK = 0;
 252     for(int k = 1; k <= iNChildren; k++) iK += 1/(1+(double)k);
 253     while(pNode) {
 254       pNode->NodeSize = normalization / (iK*(1+iN));
 255       pNode = pNode->GetNext();
 256       ++iN;
 257     }
 258   }
 259   else if(this->iType == 2){
 260     int iN = 1;
 261     double iK = 0;
 262     for(int k = 1; k <= iNChildren; k++) iK += 1/((1+(double)k)*(double)k);
 263     while(pNode) {
 264       pNode->NodeSize = normalization / (iK*(1+iN)*iN);
 265       pNode = pNode->GetNext();
 266       ++iN;
 267     }
 268   }
 269
 270   else {
 271      DASHER_ASSERT(false);
 272   }
 273
 274
 275   int iCode_a, iCode_b;
 276   SCENode *pTmp_a;
 277   SCENode *pTmp_b;
 278
 279   (*pStart)->Ref();
 280
 281   //Change the order of Kanji candidates//
 282   if(this->iOrder > 1 ){
 283     for(int i=1; i<=iNChildren;i++){
 284       pNode = *pStart;
 285       if(pNode -> GetNext()) {
 286         SCENode *pNext = pNode->GetNext();
 287         SCENode *pNext2 = pNext->GetNext();
 288         if (this->iOrder == 1){
 289           iCode_a = this->iUTF8Decode(pNode->pszConversion);
 290           iCode_b = this->iUTF8Decode(pNext->pszConversion);
 291         }
 292         else if (this->iOrder == 2){
 293           iCode_a = this->iShiftJISDecode(pNode->pszConversion);
 294           iCode_b = this->iShiftJISDecode(pNext->pszConversion);
 295         }
 296
 297
 298         if(iCode_a > iCode_b){
 299
 300           if(pNext2) pNext2->Ref();
 301
 302           pNext->Ref();
 303           pNext->SetNext(*pStart);
 304           (*pStart)->SetNext(pNext2);
 305
 306           (*pStart)->Unref();
 307
 308           if(pNext2) pNext2->Unref();
 309
 310           *pStart = pNext;
 311
 312           pNode = *pStart;
 313
 314         }
 315       }
 316
 317       while(pNode){
 318         if(pNode->GetNext() && (pNode->GetNext())->GetNext()) {
 319           pTmp_a = pNode->GetNext();
 320           pTmp_b = pTmp_a->GetNext();
 321
 322           if (this->iOrder == 1){
 323             iCode_a = this->iUTF8Decode(pTmp_a->pszConversion);
 324             iCode_b = this->iUTF8Decode(pTmp_b->pszConversion);
 325           }
 326           else if (this->iOrder == 2){
 327             iCode_a = this->iShiftJISDecode(pTmp_a->pszConversion);
 328             iCode_b = this->iShiftJISDecode(pTmp_b->pszConversion);
 329           }
 330
 331          if(iCode_a > iCode_b){
 332             SCENode *pTmp = pTmp_b->GetNext();
 333             if(pTmp) pTmp->Ref();
 334             pTmp_a->Ref();
 335
 336             pNode->SetNext(pTmp_b);
 337             pTmp_b->SetNext(pTmp_a);
 338             pTmp_a->SetNext(pTmp);
 339
 340             pTmp_a->Unref();
 341             if(pTmp) pTmp->Unref();
 342           }
 343         }
 344         pNode = pNode->GetNext();
 345       }
 346     }
 347   }
 348
 349
 350
 351 }
 352