tagging release
[dasher.git] / Src / DasherCore / CannaConversionHelper.cpp
blob11a1a71463f7e419db6c9cdd612e93e89245fcae
1 #include "config.h"
3 #include "CannaConversionHelper.h"
5 #include <canna/jrkanji.h>
6 #include <canna/RK.h>
7 #include <iconv.h>
9 #define BUFSIZE 10240
11 #include <iostream> //For testing 23 June 2005
14 CCannaConversionHelper::CCannaConversionHelper() {
15 int ret;
16 char *buf;
17 int dicnum;
19 IsInit = 0;
21 /* Initialize */
22 ret = RkInitialize("");
23 if(ret < 0) {
24 return;
26 buf = (char *)malloc(sizeof(char) * BUFSIZE);
27 if(buf == NULL)
28 return;
30 /* Create Context ID */
31 context_id = RkCreateContext();
33 /* Load Dictionaries */
34 dicnum = RkGetDicList(context_id, buf, BUFSIZE); // Find all useable dictionaries
35 char *p = buf;
36 for(int i = 0; i < dicnum; i++) {
37 ret = RkMountDic(context_id, p, 0); // Mount a dictionary
38 if(ret)
39 std::cout << "Error loading:" << p << std::endl;
41 p += (strlen(p) + 1); // Move to next dictionary name
44 free(buf);
45 IsInit = 1;
47 //std::cout << "Init Canna OK." << endl;
50 CCannaConversionHelper::~CCannaConversionHelper() {
51 RkCloseContext(context_id); // Close working context
52 //std::cout << "Finalizing Canna OK." << endl;
54 /* exit */
55 RkFinalize();
58 bool CCannaConversionHelper::Convert(const std::string &strSource, SCENode ** pRoot, int * childCount, int CMid) {
60 if(strSource.size() == 0)
61 return false;
63 int nbun;
64 char *pQuery = (char *)strSource.c_str();
66 unsigned char *buf = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
67 unsigned char *str_utf8 = (unsigned char *)malloc(sizeof(unsigned char) * BUFSIZE);
69 char *inbuf = (char *)pQuery;
70 char *outbuf = (char *)buf;
71 size_t inbytesleft = strSource.length();
72 size_t outbytesleft = BUFSIZE;
74 // Use EUC for Canna
76 // NOTE: As far as I can tell, this requires NFC rather than NFD
77 // normalisation. http://www.cl.cam.ac.uk/~mgk25/unicode.html
79 iconv_t cd = iconv_open("EUC-JP", "UTF8");
80 iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
81 *outbuf = '\0';
82 inbuf = (char *)buf;
83 iconv_close(cd);
85 /* Divide given string into phrases */
86 nbun = RkBgnBun(context_id, // context ID
87 inbuf, // given string
88 strlen(inbuf), // length of given string
89 (RK_XFER << RK_XFERBITS) | RK_KFER); // mode
92 if(nbun == -1) {
93 // Crude error detection - I don't know enough Japanese to figure out how to do this properly :-(
95 std::cerr << "Error - Canna conversion failed, possibly could not connect to server." << std::endl;
98 SCENode *pDummyRoot(new SCENode);
99 // pDummyRoot->pChild = NULL;
101 /* Convert each phrase into Kanji */
102 cd = iconv_open("UTF8", "EUC-JP");
103 for(int i = nbun-1; i >= 0; --i) {
104 SCENode *pTail = pDummyRoot->GetChild();
106 if(pTail)
107 pTail->Ref();
109 RkGoTo(context_id, i); // Move to a specific phrase
110 int len = RkGetKanjiList(context_id, buf, BUFSIZE); // Get a list of Kanji candidates
112 // Use UTF-8 for Dasher
113 char *p = (char *)buf;
115 std::vector<std::string> vCandidates;
117 for(int j = 0; j < len; ++j) {
118 inbuf = p;
119 //std::cout << "Canna:" << j << "[" << inbuf << "] ";
120 outbuf = (char *)str_utf8;
121 inbytesleft = strlen((char *)inbuf);
122 outbytesleft = BUFSIZE;
123 //for( int k=0; k<20; k++ ){
124 // std::cout << (int) inbuf[k] << " ";
126 //std::cout << inbytesleft << " ->";
127 iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
128 *outbuf = '\0';
130 if(strlen((char *)str_utf8))
131 vCandidates.push_back((char *)str_utf8);
133 //std::cout << "[" << str_utf8 << "] " << outbytesleft << std::endl;
134 p += (strlen(p) + 1);
137 for(std::vector<std::string>::reverse_iterator it(vCandidates.rbegin()); it != vCandidates.rend(); ++it) {
138 ProcessCandidate(*it, pDummyRoot, pTail);
141 if(pTail)
142 pTail->Unref();
144 RkEndBun(context_id, 0); // Close phrase division
146 iconv_close(cd);
147 free(buf);
148 free(str_utf8);
150 *pRoot = pDummyRoot->GetChild();
152 (*pRoot)->Ref();
153 pDummyRoot->Unref();
155 return true;
159 void CCannaConversionHelper::ProcessCandidate(std::string strCandidate, SCENode *pRoot, SCENode *pTail) {
161 SCENode *pCurrentNode(pRoot);
163 int iIdx(0);
165 // TODO: Need phrase-based conversion
166 while(iIdx < strCandidate.size()) {
168 int iLength;
170 // TODO: Really dodgy UTF-8 parser - find a library routine to do this
171 if((static_cast<int>(strCandidate[iIdx]) & 0x80) == 0)
172 iLength = 1;
173 else if((static_cast<int>(strCandidate[iIdx]) & 0xE0) == 0xC0)
174 iLength = 2;
175 else if((static_cast<int>(strCandidate[iIdx]) & 0xF0) == 0xE0)
176 iLength = 3;
177 else if((static_cast<int>(strCandidate[iIdx]) & 0xF8) == 0xF0)
178 iLength = 4;
179 else if((static_cast<int>(strCandidate[iIdx]) & 0xFC) == 0xF8)
180 iLength = 5;
181 else
182 iLength = 6;
184 std::string strSymbol(strCandidate.substr(iIdx, iLength));
186 iIdx += iLength;
188 SCENode *pCurrentChild(pCurrentNode->GetChild()); // TODO: Initialise
190 while(pCurrentChild) {
191 if(strSymbol == pCurrentChild->pszConversion)
192 break;
193 pCurrentChild = pCurrentChild->GetNext();
196 if(!pCurrentChild) { // Need a new child
197 pCurrentChild = new SCENode;
198 if(pCurrentNode->GetChild())
199 pCurrentChild->SetNext(pCurrentNode->GetChild());
200 if(pTail && (iIdx >= strCandidate.size()))
201 pCurrentChild->SetChild(pTail);
203 pCurrentChild->pszConversion = new char[strSymbol.size() + 1];
204 strcpy(pCurrentChild->pszConversion, strSymbol.c_str());
206 pCurrentNode->SetChild(pCurrentChild);
207 pCurrentChild->Unref();
210 pCurrentNode = pCurrentChild;
214 void CCannaConversionHelper::AssignSizes(SCENode *pStart, Dasher::CLanguageModel::Context context, long normalization, int uniform, int iNChildren) {
216 SCENode *pNode(pStart);
218 int iRemaining = iNChildren;
219 int iLeft = normalization;
221 int iCheck(0);
223 while(pNode) {
224 pNode->NodeSize = iLeft / iRemaining;
225 iLeft -= pNode->NodeSize;
227 iCheck += pNode->NodeSize;
229 --iRemaining;
230 pNode = pNode->GetNext();