Src/DasherCore/CannaConversionHelper.cpp

   1 #include "config.h"
   2
   3 #include "CannaConversionHelper.h"
   4
   5 #include <canna/jrkanji.h>
   6 #include <canna/RK.h>
   7 #include <iconv.h>
   8
   9 #define BUFSIZE 10240
  10
  11 #include <iostream>             //For testing 23 June 2005
  12
  13
  14 CCannaConversionHelper::CCannaConversionHelper() {
  15   int ret;
  16   char *buf;
  17   int dicnum;
  18
  19   IsInit = 0;
  20
  21   /* Initialize */
  22   ret = RkInitialize("");
  23   if(ret < 0) {
  24     return;
  25   }
  26   buf = (char *)malloc(sizeof(char) * BUFSIZE);
  27   if(buf == NULL)
  28     return;
  29
  30   /* Create Context ID */
  31   context_id = RkCreateContext();
  32
  33   /* Load Dictionaries */
  34   dicnum = RkGetDicList(context_id, buf, BUFSIZE);      //      Find all useable dictionaries
  35   char *p = buf;
  36   for(int i = 0; i < dicnum; i++) {
  37     ret = RkMountDic(context_id, p, 0); //      Mount a dictionary
  38     if(ret)
  39       std::cout << "Error loading:" << p << std::endl;
  40
  41     p += (strlen(p) + 1);       //      Move to next dictionary name
  42   }
  43
  44   free(buf);
  45   IsInit = 1;
  46
  47   //std::cout << "Init Canna OK." << endl;
  48 }
  49
  50 CCannaConversionHelper::~CCannaConversionHelper() {
  51   RkCloseContext(context_id);   // Close working context
  52   //std::cout << "Finalizing Canna OK." << endl;
  53
  54   /* exit */
  55   RkFinalize();
  56 }
  57
  58 bool CCannaConversionHelper::Convert(const std::string &strSource, SCENode ** pRoot, int * childCount, int CMid) {
  59
  60   if(strSource.size() == 0)
  61     return false;
  62
  63   int nbun;
  64   char *pQuery = (char *)strSource.c_str();
  65
  66   unsigned char *buf = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
  67   unsigned char *str_utf8 = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
  68
  69   char *inbuf = (char *)pQuery;
  70   char *outbuf = (char *)buf;
  71   size_t inbytesleft = strSource.length();
  72   size_t outbytesleft = BUFSIZE;
  73
  74   // Use EUC for Canna
  75   //
  76   // NOTE: As far as I can tell, this requires NFC rather than NFD
  77   // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
  78   //
  79   iconv_t cd = iconv_open("EUC-JP", "UTF8");
  80   iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  81   *outbuf = '\0';
  82   inbuf = (char *)buf;
  83   iconv_close(cd);
  84
  85   /* Divide given string into phrases */
  86   nbun = RkBgnBun(context_id,   // context ID
  87                   inbuf,        // given string
  88                   strlen(inbuf),        // length of given string
  89                   (RK_XFER << RK_XFERBITS) | RK_KFER);  // mode
  90
  91
  92   if(nbun == -1) {
  93     // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
  94
  95     std::cerr << "Error - Canna conversion failed, possibly could not connect to server." << std::endl;
  96   }
  97
  98   SCENode *pDummyRoot(new SCENode);
  99   //  pDummyRoot->pChild = NULL;
 100
 101   /* Convert each phrase into Kanji */
 102   cd = iconv_open("UTF8", "EUC-JP");
 103   for(int i = nbun-1; i >= 0; --i) {
 104     SCENode *pTail = pDummyRoot->GetChild();
 105
 106     if(pTail)
 107       pTail->Ref();
 108
 109     RkGoTo(context_id, i);      // Move to a specific phrase
 110     int len = RkGetKanjiList(context_id, buf, BUFSIZE); // Get a list of Kanji candidates
 111
 112     // Use UTF-8 for Dasher
 113     char *p = (char *)buf;
 114
 115     std::vector<std::string> vCandidates;
 116
 117     for(int j = 0; j < len; ++j) {
 118       inbuf = p;
 119       //std::cout << "Canna:" << j << "[" << inbuf << "] ";
 120       outbuf = (char *)str_utf8;
 121       inbytesleft = strlen((char *)inbuf);
 122       outbytesleft = BUFSIZE;
 123       //for( int k=0; k<20; k++ ){
 124       //      std::cout << (int) inbuf[k] << " ";
 125       //}
 126       //std::cout << inbytesleft << " ->";
 127       iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
 128       *outbuf = '\0';
 129
 130       if(strlen((char *)str_utf8))
 131         vCandidates.push_back((char *)str_utf8);
 132
 133       //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
 134       p += (strlen(p) + 1);
 135     }
 136
 137     for(std::vector<std::string>::reverse_iterator it(vCandidates.rbegin()); it != vCandidates.rend(); ++it) {
 138       ProcessCandidate(*it, pDummyRoot, pTail);
 139     }
 140
 141     if(pTail)
 142       pTail->Unref();
 143   }
 144   RkEndBun(context_id, 0);      // Close phrase division
 145
 146   iconv_close(cd);
 147   free(buf);
 148   free(str_utf8);
 149
 150   *pRoot = pDummyRoot->GetChild();
 151
 152   (*pRoot)->Ref();
 153   pDummyRoot->Unref();
 154
 155   return true;
 156 }
 157
 158
 159 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate, SCENode *pRoot, SCENode *pTail) {
 160
 161   SCENode *pCurrentNode(pRoot);
 162
 163   int iIdx(0);
 164
 165   // TODO: Need phrase-based conversion
 166   while(iIdx < strCandidate.size()) {
 167
 168     int iLength;
 169
 170     // TODO: Really dodgy UTF-8 parser - find a library routine to do this
 171     if((static_cast<int>(strCandidate[iIdx]) & 0x80) == 0)
 172       iLength = 1;
 173     else if((static_cast<int>(strCandidate[iIdx]) & 0xE0) == 0xC0)
 174       iLength = 2;
 175     else if((static_cast<int>(strCandidate[iIdx]) & 0xF0) == 0xE0)
 176       iLength = 3;
 177     else if((static_cast<int>(strCandidate[iIdx]) & 0xF8) == 0xF0)
 178       iLength = 4;
 179     else if((static_cast<int>(strCandidate[iIdx]) & 0xFC) == 0xF8)
 180       iLength = 5;
 181     else
 182       iLength = 6;
 183
 184     std::string strSymbol(strCandidate.substr(iIdx, iLength));
 185
 186     iIdx += iLength;
 187
 188     SCENode *pCurrentChild(pCurrentNode->GetChild()); // TODO: Initialise
 189
 190     while(pCurrentChild) {
 191       if(strSymbol == pCurrentChild->pszConversion)
 192         break;
 193       pCurrentChild = pCurrentChild->GetNext();
 194     }
 195
 196     if(!pCurrentChild) { // Need a new child
 197       pCurrentChild = new SCENode;
 198       if(pCurrentNode->GetChild())
 199         pCurrentChild->SetNext(pCurrentNode->GetChild());
 200       if(pTail && (iIdx >= strCandidate.size()))
 201         pCurrentChild->SetChild(pTail);
 202
 203       pCurrentChild->pszConversion = new char[strSymbol.size() + 1];
 204       strcpy(pCurrentChild->pszConversion, strSymbol.c_str());
 205
 206       pCurrentNode->SetChild(pCurrentChild);
 207       pCurrentChild->Unref();
 208     }
 209
 210     pCurrentNode = pCurrentChild;
 211   }
 212 }
 213
 214 void CCannaConversionHelper::AssignSizes(SCENode *pStart, Dasher::CLanguageModel::Context context, long normalization, int uniform, int iNChildren) {
 215
 216   SCENode *pNode(pStart);
 217
 218   int iRemaining = iNChildren;
 219   int iLeft = normalization;
 220
 221   int iCheck(0);
 222
 223   while(pNode) {
 224     pNode->NodeSize = iLeft / iRemaining;
 225     iLeft -= pNode->NodeSize;
 226
 227     iCheck += pNode->NodeSize;
 228
 229     --iRemaining;
 230     pNode = pNode->GetNext();
 231   }
 232 }