Various changes to preferences object, file loading, and error logging.
[jben.git] / kdict.cpp
blob28f1b7168673c0aede61052816bffda7dd72d47e
1 /*
2 Project: J-Ben
3 Author: Paul Goins
4 Website: http://www.vultaire.net/software/jben/
5 License: GNU General Public License (GPL) version 2
6 (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
8 File: kdict.cpp
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>
24 #include "kdict.h"
25 #include "preferences.h"
26 #include "encoding_convert.h"
27 #include "string_utils.h"
28 #include "file_utils.h"
29 #include "jutils.h"
30 #include "errorlog.h"
31 #include <libxml/xmlreader.h>
32 #include <iomanip>
33 #include <fstream>
34 #include <sstream>
35 #include <list>
36 using namespace std;
38 #ifdef __WXMSW__
39 # define FALLBACK_DICTDIR "dicts\\"
40 #else
41 # define FALLBACK_DICTDIR "dicts/"
42 #endif
44 KDict* KDict::kdictSingleton = NULL;
46 KInfo::KInfo() {
47 radical = radicalNelson = (unsigned char) 0;
48 grade = strokeCount = freq = 0;
51 const KDict* KDict::Get() {
52 if(!kdictSingleton)
53 kdictSingleton = new KDict;
54 return kdictSingleton;
57 KDict::KDict() {
58 Preferences* p = Preferences::Get();
59 int result;
60 /* Load KANJIDIC2, if present. */
61 result = LoadKanjidic2(p->GetSetting("kdict_kanjidic2").c_str());
62 if(result!=KD_SUCCESS)
63 result = LoadKanjidic2(FALLBACK_DICTDIR "kanjidic2.xml");
65 /* If KANJIDIC2 is not present, load KANJIDIC and/or KANJD212 */
66 if(result!=KD_SUCCESS) {
67 result = LoadKanjidic(p->GetSetting("kdict_kanjidic").c_str());
68 if(result!=KD_SUCCESS) LoadKanjidic(FALLBACK_DICTDIR "kanjidic");
69 result =
70 LoadKanjidic(p->GetSetting("kdict_kanjd212").c_str(), "jis212");
71 if(result!=KD_SUCCESS)
72 LoadKanjidic(FALLBACK_DICTDIR "kanjd212", "jis212");
75 /* Load supplemental dictionary files */
76 result = LoadKradfile(p->GetSetting("kdict_kradfile").c_str());
77 if(result!=KD_SUCCESS) LoadKradfile(FALLBACK_DICTDIR "kradfile");
78 result = LoadRadkfile(p->GetSetting("kdict_radkfile").c_str());
79 if(result!=KD_SUCCESS) LoadRadkfile(FALLBACK_DICTDIR "radkfile");
82 void KDict::Destroy() {
83 if(kdictSingleton) {
84 delete kdictSingleton;
85 kdictSingleton = NULL;
89 int KDict::LoadKanjidic(const char* filename, const char* jisStd) {
90 char* rawData = NULL;
91 unsigned int size;
92 int returnCode=KD_FAILURE;
94 ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
95 if(ifile) {
96 size = ifile.tellg();
97 ifile.seekg(0);
98 rawData = new char[size+1];
99 rawData[size] = '\0';
100 ifile.read(rawData, size);
101 if(strlen(rawData)!=size) {
102 ostringstream oss;
103 oss << ERR_PREF
104 << "kanjidic file size: "
105 << strlen(rawData)
106 << ", read-in string: "
107 << size;
108 el.Push(EL_Warning, oss.str());
111 /* Create the kanjidic object with our string data. */
112 this->KanjidicParser(rawData, jisStd);
114 returnCode = KD_SUCCESS;
115 el.Push(EL_Silent, string("Kanji dictionary file \"")
116 .append(filename).append("\" loaded successfully."));
118 else
119 returnCode = KD_FAILURE;
121 if(rawData) delete[] rawData;
122 return returnCode;
125 int KDict::LoadKanjidic2(const char* filename) {
126 int returnCode = KD_FAILURE;
127 xmlTextReaderPtr reader;
128 xmlChar* ptr;
129 int ret;
131 /* Vars for navigating through the data */
132 string element, d1element;
133 map<string, string> attributes;
134 map<string, string>::iterator mssi;
135 int nodeType;
136 bool isAttribute=false;
137 /* Var for storing values of the entries */
138 string sValue;
139 /* GP vars */
140 string temp;
142 reader = xmlNewTextReaderFilename(filename);
143 KInfo* k=NULL;
144 if(reader) {
145 ret = xmlTextReaderRead(reader);
146 while(ret==1) {
147 /* Act based on node type */
148 nodeType = xmlTextReaderNodeType(reader);
149 switch(nodeType) {
150 case XML_READER_TYPE_ELEMENT:
151 element = (char*)xmlTextReaderName(reader);
152 if(xmlTextReaderDepth(reader)==1) d1element=element;
153 if(element=="character") {
154 /* Opening of character entry - create new data object */
155 if(k) delete k;
156 k = new KInfo;
158 attributes.clear();
159 break;
160 case XML_READER_TYPE_END_ELEMENT:
161 element = (char*)xmlTextReaderName(reader);
162 if(element=="character") {
163 wchar_t wc = utfconv_mw(k->literal)[0];
164 /* End of character entry: append to data list */
165 if(!kdictData.assign(wc, *k)) {
166 ostringstream oss;
167 oss << ERR_PREF
168 << "Error assigning kanjidic2 entry to hash table!";
169 el.Push(EL_Error, oss.str());
171 delete k;
172 k = NULL;
174 attributes.clear();
175 break;
176 case XML_READER_TYPE_ATTRIBUTE:
177 temp = (char*)xmlTextReaderName(reader);
178 ptr = xmlTextReaderValue(reader);
179 attributes[temp] = (char*)ptr;
180 xmlFree(ptr);
181 break;
182 case XML_READER_TYPE_TEXT:
183 ptr = xmlTextReaderValue(reader);
184 sValue = (char*)ptr;
185 xmlFree(ptr);
187 if(d1element=="header") {
188 if(element=="file_version") {
189 if(sValue!="4") {
190 ostringstream oss;
191 oss << ERR_PREF
192 << "Warning: the KANJIDIC2 reader only"
193 " supports KANJIDIC2 version 4!";
194 el.Push(EL_Warning, oss.str());
198 if(d1element=="character") {
199 if(!k) {
200 ostringstream oss;
201 oss << ERR_PREF << "k is NULL!";
202 el.Push(EL_Error, oss.str());
203 } else if(element=="literal") {
204 k->literal = sValue;
205 } else if(element=="cp_value") {
206 k->codepoint[attributes["cp_type"]] = sValue;
207 } else if(element=="rad_value") {
208 temp = attributes["rad_type"];
209 if(temp == "classical")
210 k->radical
211 = (unsigned char)atoi(sValue.c_str());
212 else if(temp == "nelson_c")
213 k->radicalNelson
214 = (unsigned char)atoi(sValue.c_str());
215 else {
216 ostringstream oss;
217 oss << ERR_PREF
218 << "Unhandled radical: "
219 << "type=" << temp
220 << ", value=[" << sValue
221 << "]!";
222 el.Push(EL_Error, oss.str());
224 } else if(element=="grade") {
225 k->grade = (unsigned char)atoi(sValue.c_str());
226 } else if(element=="stroke_count") {
227 k->strokeCount = (unsigned char)atoi(sValue.c_str());
228 } else if(element=="variant") {
229 k->variant[attributes["var_type"]] = sValue;
230 } else if(element=="freq") {
231 k->freq = atoi(sValue.c_str());
232 } else if(element=="rad_name") {
233 k->radicalName = sValue;
234 } else if(element=="dic_ref") {
235 k->dictCode[attributes["dr_type"]] = sValue;
236 if(attributes["dr_type"]=="moro"
237 && attributes["m_vol"].length()>0) {
238 temp = "V";
239 temp.append(attributes["m_vol"]);
240 temp.append(1, 'P');
241 temp.append(attributes["m_page"]);
242 k->dictCode["moro"].append(temp);
244 } else if(element=="q_code") {
245 if(attributes["qc_type"]=="skip"
246 && attributes["skip_misclass"].length()>0) {
247 k->skipMisclass.push_back(
248 pair<string,string>(
249 attributes["skip_misclass"],
250 sValue));
251 } else {
252 k->queryCode[attributes["qc_type"]] = sValue;
254 } else if(element=="reading") {
255 temp = attributes["r_type"];
256 if(temp=="pinyin") {
257 k->pinyin.push_back(sValue);
258 } else if(temp=="korean_r") {
259 k->korean_r.push_back(sValue);
260 } else if(temp=="korean_h") {
261 k->korean_h.push_back(sValue);
262 } else if(temp=="ja_on") {
263 /* Need to handle r_status and on_type! */
264 /* Need to convert xx.x to xx(x) notation. */
265 k->onyomi.push_back(sValue);
266 } else if(temp=="ja_kun") {
267 /* Need to handle r_status! */
268 /* Need to convert xx.x to xx(x) notation. */
269 k->kunyomi.push_back(sValue);
270 } else {
271 ostringstream oss;
272 oss << ERR_PREF << "Invalid r_type: " << temp;
273 el.Push(EL_Error, oss.str());
275 /* This section is "to-do" */
276 } else if(element=="meaning") {
277 temp = attributes["m_lang"];
278 if(temp.length()==0) temp = "en";
279 k->meaning[temp].push_back(sValue);;
280 } else if(element=="nanori") {
281 k->nanori.push_back(sValue);
282 } else {
283 ostringstream oss;
284 oss << ERR_PREF << "UNHANDLED element: " << element;
285 el.Push(EL_Error, oss.str());
288 /* default parsing */
289 else {
290 /*cout << "DEBUG: Depth 1 element is " << d1element
291 << ", element is " << element
292 << ", value is " << sValue << endl;*/
294 break;
295 default:
296 /* do nothing */
297 break;
300 /* If element has attributes, go to the next attribute if present.
301 Otherwise, go to the next element. */
302 if(!isAttribute) ret = xmlTextReaderHasAttributes(reader);
303 if(isAttribute || ret==1) {
304 ret = xmlTextReaderMoveToNextAttribute(reader);
306 /* ret==-1 is an error */
307 if(ret==-1) {
308 ostringstream oss;
309 oss << ERR_PREF
310 << "xmlTextReaderMoveToNextAttribute returned an error!";
311 el.Push(EL_Error, oss.str());
313 /* If ret==1, an attribute was loaded.
314 If not, go to the next element. */
315 if(ret==1) {
316 isAttribute=true;
317 } else {
318 isAttribute=false;
319 ret = xmlTextReaderRead(reader);
322 xmlFreeTextReader(reader);
323 if(ret!=0) {
324 ostringstream oss;
325 oss << ERR_PREF
326 << "Parsing error occurred in " << filename << ".";
327 el.Push(EL_Error, oss.str());
330 returnCode = KD_SUCCESS;
331 el.Push(EL_Silent, string("Kanji dictionary file \"")
332 .append(filename).append("\" loaded successfully."));
333 } else return returnCode;
335 if(k) {
336 ostringstream oss;
337 oss << ERR_PREF << ": k is not NULL! This shouldn't happen!";
338 el.Push(EL_Error, oss.str());
339 delete k;
340 k = NULL;
343 return returnCode;
346 int KDict::LoadKradfile(const char* filename) {
347 int returnCode = KD_FAILURE;
348 stringbuf sb;
349 ifstream f(filename, ios::in|ios::binary);
350 if(f.is_open()) {
351 f >> &sb;
352 f.close();
354 list<wstring> data =
355 StrTokenize<wchar_t>(utfconv_mw(sb.str()), L"\n");
356 while(data.size()>0) {
357 wstring token = data.front();
358 data.pop_front();
359 if(token.length()>0 && token[0]!=L'#') {
360 /* KRADFILE-specific stuff here */
361 /* Get rid of the spaces in the string */
362 token = TextReplace<wchar_t>(token, L" ", L"");
363 /* Now we can easily pull in the data */
364 if(!kradData.assign(token[0], token.substr(2))) {
365 ostringstream oss;
366 oss << ERR_PREF << "KRADFILE: Error assigning ("
367 << utfconv_wm(token.substr(0,1)) << ", "
368 << utfconv_wm(token.substr(2)) << ") to hash table!";
369 el.Push(EL_Error, oss.str());
374 returnCode = KD_SUCCESS;
375 el.Push(EL_Silent, string("Kanji dictionary file \"")
376 .append(filename).append("\" loaded successfully."));
378 return returnCode;
381 int KDict::LoadRadkfile(const char* filename) {
382 int returnCode = KD_FAILURE;
383 stringbuf sb;
384 ifstream f(filename, ios::in|ios::binary);
385 if(f.is_open()) {
386 f >> &sb;
387 f.close();
389 /* RADKFILE entries all start with $.
390 Split on $, and discard the first entry since it is the explanation
391 preceding the first entry. */
392 list<wstring> data =
393 StrTokenize<wchar_t>(utfconv_mw(sb.str()), L"$");
394 data.pop_front();
396 while(data.size()>0) {
397 wstring entry = data.front();
398 data.pop_front();
399 if(entry.length()>0 && entry[0]!=L'#') {
400 /* RADKFILE-specific stuff here */
401 list<wstring> entryData =
402 StrTokenize<wchar_t>(entry, L"\n", false, 2);
403 if(entryData.size()!=2) {
404 ostringstream oss;
405 oss << ERR_PREF
406 << "Error: entryData.size() == " << entryData.size()
407 << " for entry \"" << utfconv_wm(entry) << "!!";
408 el.Push(EL_Error, oss.str());
409 } else {
410 wchar_t key;
411 int strokeCount;
412 wstring value;
413 /* entryData.front() contains our key.
414 It's a space delimited string,
415 first token is our kanji, second is the stroke count.
416 A third token may be present, but is irrelevant. */
417 list<wstring> keyData =
418 StrTokenize<wchar_t>(entryData.front(), L" ");
419 wistringstream wiss;
420 wiss.str(keyData.front());
421 wiss >> key;
422 keyData.pop_front();
423 wiss.str(keyData.front());
424 wiss >> strokeCount;
426 /* entryData.back() contains the characters our key
427 maps to. */
428 /* Get rid of the spaces in the string */
429 value = entryData.back();
430 value = TextReplace<wchar_t>(value, L"\n", L"");
431 value = TextReplace<wchar_t>(value, L" ", L"");
433 if(!radkData.assign(key, value)) {
434 ostringstream oss;
435 oss << ERR_PREF << "RADKFILE: Error assigning ("
436 << utfconv_wm(wstring().append(1,key)) << ", "
437 << utfconv_wm(value) << ") to hash table!";
438 el.Push(EL_Error, oss.str());
440 if(!radkDataStrokes.assign(key, strokeCount)) {
441 ostringstream oss;
442 oss << ERR_PREF << "RADKFILE: Error assigning ("
443 << utfconv_wm(wstring().append(1,key))
444 << ", " << strokeCount << ") to hash table!";
445 el.Push(EL_Error, oss.str());
451 returnCode = KD_SUCCESS;
452 el.Push(EL_Silent, string("Kanji dictionary file \"")
453 .append(filename).append("\" loaded successfully."));
455 return returnCode;
/* Converts a JIS code written as a hexadecimal string (e.g. "3021") into
 * ku-ten notation ("16-1").
 *
 * The high and low bytes of the parsed value each have 0x20 subtracted and
 * are written in decimal, joined by '-'.  The numbers are not zero-padded.
 * If jisHex is not a valid hexadecimal number the code is taken as 0, which
 * produces "-32--32".
 *
 * Parsing and formatting use separate streams on purpose: the `hex`
 * manipulator is sticky, so writing through the parsing stream would emit
 * hexadecimal digits, and a read/write stringstream writes over its initial
 * contents, leaving stray input characters behind when the output is
 * shorter than the input.
 */
std::string JisHexToKuten(const std::string& jisHex) {
    int code = 0;
    std::istringstream parser(jisHex);
    parser >> std::hex >> code;

    std::ostringstream kuten;
    kuten << (((code & 0xFF00) >> 8) - 0x20)
          << '-' << ((code & 0xFF) - 0x20);
    return kuten.str();
}
468 /* This function converts from KANJIDIC-style entries to internally used
469 KInfo objects (which are structurally based off the newer KANJIDIC2). */
470 void KDict::KanjidicToKInfo(const string& kanjidicEntry,
471 KInfo& k, const char* jisStd) {
472 list<string> tl = StrTokenize<char>(kanjidicEntry, " ");
473 if(tl.size()<2) return; /* KANJIDIC entries must AT LEAST have the char
474 and the JIS hex code. */
475 int tmode = 0;
476 string sTemp;
477 wstring wsTemp;
478 wchar_t cKanaTest;
480 /* First 2 fields are always the same: process them here */
481 k.literal = tl.front(); tl.pop_front();
482 /* JIS code needs to be converted to ku-ten
483 format to coincide with KANJIDIC2. */
484 k.codepoint[jisStd] = JisHexToKuten(tl.front()); tl.pop_front();
486 /* Now, just loop through the remaining entries in the list. */
487 string* ps;
488 while(tl.size()>0) {
489 ps = &(tl.front());
490 switch ((*ps)[0]) {
491 case 'T': /* Change "t mode" */
492 tmode = atoi(ps->substr(1).c_str());
493 break;
494 case 'B': /* Nelson-reclassified radical */
495 k.radicalNelson = (unsigned char)atoi(ps->substr(1).c_str());
496 break;
497 case 'C': /* Classical radical (KangXi Zidian) */
498 k.radical = (unsigned char)atoi(ps->substr(1).c_str());
499 break;
500 case 'F': /* Frequency */
501 k.freq = atoi(ps->substr(1).c_str());
502 break;
503 case 'G': /* Grade level */
504 k.grade = atoi(ps->substr(1).c_str());
505 break;
506 case 'S': /* Stroke count */
507 if(k.strokeCount==0)
508 k.strokeCount = atoi(ps->substr(1).c_str());
509 else
510 k.misstrokes.push_back(atoi(ps->substr(1).c_str()));
511 break;
512 case 'U': /* Unicode value */
513 k.codepoint["ucs"] = ps->substr(1);
514 break;
515 /* Dictionary codes for most of the following */
516 case 'H':
517 /* New Japanese-English Character Dictionary (Halpern) */
518 k.dictCode["halpern_njecd"] = ps->substr(1);
519 break;
520 case 'N':
521 /* Modern Reader's Japanese-English Character Dictionary (Nelson) */
522 k.dictCode["nelson_c"] = ps->substr(1);
523 break;
524 case 'V':
525 /* The New Nelson's Japanese-English Character Dictionary */
526 k.dictCode["nelson_n"] = ps->substr(1);
527 break;
528 case 'P':
529 /* SKIP codes. */
530 /* Thanks to changes in permissible SKIP code usage (change to
531 Creative Commons licensing in January 2008), we can now use
532 this without problems. */
533 k.queryCode["skip"] = ps->substr(1);
534 break;
535 case 'I': /* Spahn/Hadamitzky dictionaries */
536 if((*ps)[1]=='N') {
537 /* Kanji & Kana (Spahn, Hadamitzky) */
538 k.dictCode["sh_kk"] = ps->substr(2);
539 } else {
540 /* Query Code: Kanji Dictionary (Spahn, Hadamitzky) */
541 k.queryCode["sh_desc"] = ps->substr(1);
543 break;
544 case 'Q':
545 /* Four Corner code */
546 k.queryCode["four_corner"] = ps->substr(1);
547 break;
548 case 'M':
549 if((*ps)[1]=='N') {
550 /* Morohashi Daikanwajiten Index */
551 k.dictCode["moro"].insert(0, ps->substr(2));
552 } else if((*ps)[1]=='P') {
553 /* Morohashi Daikanwajiten Volume/Page */
554 k.dictCode["moro"]
555 .append(1, '/')
556 .append(ps->substr(2));
558 break;
559 case 'E':
560 /* A Guide to Remembering Japanese Characters (Henshall) */
561 k.dictCode["henshall"] = ps->substr(1);
562 break;
563 case 'K':
564 /* Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage") */
565 k.dictCode["gakken"] = ps->substr(1);
566 break;
567 case 'L':
568 /* Remembering the Kanji (Heisig) */
569 k.dictCode["heisig"] = ps->substr(1);
570 break;
571 case 'O':
572 /* Japanese Names (O'Neill) */
573 k.dictCode["oneill_names"] = ps->substr(1);
574 break;
575 case 'D':
576 switch((*ps)[1]) {
577 case 'B':
578 /* Japanese for Busy People (AJLT) */
579 k.dictCode["busy_people"] = ps->substr(1);
580 break;
581 case 'C':
582 /* The Kanji Way to Japanese Language Power (Crowley) */
583 k.dictCode["crowley"] = ps->substr(1);
584 break;
585 case 'F':
586 /* Japanese Kanji Flashcards (White Rabbit Press) */
587 k.dictCode["jf_cards"] = ps->substr(1);
588 break;
589 case 'G':
590 /* Kodansha Compact Kanji Guide */
591 k.dictCode["kodansha_compact"] = ps->substr(1);
592 break;
593 case 'H':
594 /* A Guide To Reading and Writing Japanese (Henshall) */
595 k.dictCode["henshall3"] = ps->substr(1);
596 break;
597 case 'J':
598 /* Kanji in Context (Nishiguchi and Kono) */
599 k.dictCode["kanji_in_context"] = ps->substr(1);
600 break;
601 case 'K':
602 /* Kodansha Kanji Learner's Dictionary (Halpern) */
603 k.dictCode["halpern_kkld"] = ps->substr(1);
604 break;
605 case 'O':
606 /* Essential Kanji (O'Neill) */
607 k.dictCode["oneill_kk"] = ps->substr(1);
608 break;
609 case 'R':
610 /* Query Code: 2001 Kanji (De Roo) */
611 k.queryCode["deroo"] = ps->substr(1);
612 break;
613 case 'S':
614 /* A Guide to Reading and Writing Japanese (Sakade) */
615 k.dictCode["sakade"] = ps->substr(1);
616 break;
617 case 'T':
618 /* Tuttle Kanji Cards (Kask) */
619 k.dictCode["tutt_cards"] = ps->substr(1);
620 break;
621 default:
623 ostringstream oss;
624 oss << ERR_PREF << "Unhandled: " << *ps;
625 el.Push(EL_Error, oss.str());
627 break;
629 break;
630 /* Crossreferences and miscodes */
631 case 'X':
632 switch((*ps)[1]) {
633 case 'D':
634 /* De Roo code */
635 k.variant["deroo"]=ps->substr(2);
636 break;
637 case 'H':
638 /* NJECD code */
639 k.variant["njecd"]=ps->substr(2);
640 break;
641 case 'I':
642 /* S_H code */
643 k.variant["s_h"]=ps->substr(2);
644 break;
645 case 'J':
646 /* XJ# = JIS hex code: 0=jis208, 1=jis212, 2=jis213 */
647 switch((*ps)[2]) {
648 case '0':
649 k.variant["jis208"]=JisHexToKuten(ps->substr(3));
650 break;
651 case '1':
652 k.variant["jis212"]=JisHexToKuten(ps->substr(3));
653 break;
654 case '2':
655 k.variant["jis213"]=JisHexToKuten(ps->substr(3));
656 break;
658 break;
659 case 'N':
660 /* nelson_c code */
661 k.variant["nelson_c"]=ps->substr(2);
662 break;
663 case 'O':
664 /* oneill code */
665 k.variant["oneill"]=ps->substr(2);
666 break;
667 default:
669 ostringstream oss;
670 oss << ERR_PREF << "Unknown entry \"" << *ps << "\" found!";
671 el.Push(EL_Error, oss.str());
674 break;
675 case 'Z':
676 sTemp = ps->substr(0,3);
677 if(sTemp == "ZBP")
678 k.skipMisclass.push_back(
679 pair<string,string>("stroke_and_posn", ps->substr(3)));
680 else if(sTemp == "ZPP") {
681 k.skipMisclass.push_back(
682 pair<string,string>("posn", ps->substr(3)));
683 } else if(sTemp == "ZRP") {
684 k.skipMisclass.push_back(
685 pair<string,string>("stroke_diff", ps->substr(3)));
686 } else if(sTemp == "ZSP") {
687 k.skipMisclass.push_back(
688 pair<string,string>("stroke_count", ps->substr(3)));
689 } else {
690 ostringstream oss;
691 oss << ERR_PREF << "Unknown entry \"" << *ps << "\" found!";
692 el.Push(EL_Error, oss.str());
694 break;
695 /* Korean/Pinyin (Chinese) romanization */
696 case 'W':
697 k.korean_r.push_back(ps->substr(1));
698 break;
699 case 'Y':
700 k.pinyin.push_back(ps->substr(1));
701 break;
702 case '{':
703 /* MEANINGS */
704 sTemp = *ps;
705 /* Make sure we grab the whole meaning entry - pop more tokens and
706 append if necessary. */
707 while(*(sTemp.rbegin()) != '}') {
708 tl.pop_front();
709 if(tl.size()==0) break;
710 sTemp.append(1, ' ');
711 sTemp.append(tl.front());
713 if(*(sTemp.rbegin()) != '}') {
714 /* Shouldn't happen, but I want to be safe. */
715 ostringstream oss;
716 oss << ERR_PREF << "Unable to find ending '}' character!\n"
717 << "Entry responsible: [" << kanjidicEntry << "]";
718 el.Push(EL_Error, oss.str());
719 /* Strip only the starting {, since } is not present. */
720 sTemp = sTemp.substr(1, sTemp.length()-1);
721 } else {
722 /* Strip {} from around the string. */
723 sTemp = sTemp.substr(1, sTemp.length()-2);
725 k.meaning["en"].push_back(sTemp);
726 break;
727 default:
728 switch(tmode) {
729 case 0:
730 /* Check for readings */
731 /* The first character may be 〜, but if so, it -will- be
732 followed by a kana character. */
733 wsTemp = utfconv_mw(*ps);
734 if(wsTemp[0]==L'〜')
735 cKanaTest = wsTemp[1];
736 else cKanaTest = wsTemp[0];
738 if(IsHiragana(cKanaTest)) {
739 k.kunyomi.push_back(*ps);
740 } else if(IsKatakana(cKanaTest)) {
741 k.onyomi.push_back(*ps);
742 } else {
743 ostringstream oss;
744 oss << ERR_PREF
745 << "UNHANDLED entry \"" << *ps << "\" encountered!";
746 el.Push(EL_Error, oss.str());
749 break;
750 case 1:
751 k.nanori.push_back(*ps);
752 break;
753 case 2:
754 k.radicalName = *ps;
755 break;
756 default:
758 ostringstream oss;
759 oss << ERR_PREF
760 << "Unknown tmode value (" << tmode << ") encountered!";
761 el.Push(EL_Error, oss.str());
765 break;
767 tl.pop_front();
771 /* This could be sped up: copy the first UTF-8 character into a string, then
772 run a conversion on that. Trivial though. */
773 void KDict::KanjidicParser(char* kanjidicRawData, const char* jisStd) {
774 char* token = strtok(kanjidicRawData, "\n");
775 wstring wToken;
776 while(token) {
777 if( (strlen(token)>0) && (token[0]!='#') ) {
778 wToken = utfconv_mw(token);
779 /* Convert token to proper format */
780 wToken = ConvertKanjidicEntry(wToken);
781 /* Create new KInfo object.
782 If one already exists for this character, copy over the
783 information. */
784 KInfo k;
785 BoostHM<wchar_t, KInfo>::iterator it = kdictData.find(wToken[0]);
786 if(it!=kdictData.end()) k = it->second;
787 /* Fill the KInfo structure */
788 KanjidicToKInfo(utfconv_wm(wToken), k, jisStd);
790 /* Add to hash table */
791 if(!kdictData.assign(wToken[0], k)) {
792 ostringstream oss;
793 string temp = utfconv_wm(wToken);
794 oss << ERR_PREF << "Error assigning (" << temp[0]<< ", "
795 << temp << ") to hash table!";
796 el.Push(EL_Error, oss.str());
799 token = strtok(NULL, "\n");
803 KDict::~KDict() {
804 /* Currently: nothing here. */
808 * Performs transformations on a KANJIDIC string for our internal usage.
809 * Currently, this includes the following:
810 * - Changing あ.いう notation to あ(いう), a la JWPce/JFC.
811 * - Changing -あい notation to 〜あい, also a la JWPce/JFC.
813 wstring KDict::ConvertKanjidicEntry(const wstring& s) {
814 size_t index, lastIndex;
815 wstring temp = s;
817 /* First conversion: あ.いう to あ(いう) */
818 index = temp.find(L'.', 0);
819 while(index!=wstring::npos) {
820 /* Proceed if the character preceding the "." is hiragana/katakana. */
821 if(IsFurigana(temp[index-1])) {
822 temp[index] = L'(';
823 index = temp.find(L' ', index+1);
824 if(index==wstring::npos) {
825 temp.append(1, L')');
826 break;
827 } else
828 temp.insert(index, 1, L')');
830 lastIndex = index;
831 index = temp.find(L'.', lastIndex+1);
834 /* Second conversion: - to 〜, when a neighboring
835 character is hiragana/katakana */
836 index = temp.find(L'-', 0);
837 while(index!=wstring::npos) {
838 /* Proceed if the character before or after
839 the "-" is hiragana/katakana. */
840 if(IsFurigana(temp[index-1]) || IsFurigana(temp[index+1]))
841 temp[index]=L'〜';
843 lastIndex = index;
844 index = temp.find(L'-', lastIndex+1);
847 /* Return the converted string */
848 return temp;
851 wstring KDict::KInfoToHtml(const KInfo& kInfo) {
852 Preferences* prefs = Preferences::Get();
853 return KInfoToHtml(kInfo,
854 prefs->kanjidicOptions,
855 prefs->kanjidicDictionaries);
858 wstring KDict::KInfoToHtml(const KInfo& kInfo,
859 long options, long dictionaries) {
860 /* return wstring(L"<p>")
861 .append(s[0])
862 .append(L"</p>");*/
863 #warning KInfoToHtml currently is unimplemented!
864 #if 0
865 wostringstream result;
866 wostringstream header;
867 wstring onyomi, kunyomi, nanori, radicalReading, english;
868 wstring dictionaryInfo;
869 wstring lowRelevance;
870 wstring unhandled;
871 long grade = -1, frequency = -1, tmode = 0;
872 wstring strokes;
873 wstring koreanRomanization, pinyinRomanization, crossReferences, miscodes;
874 wstring sTemp, token;
875 list<wstring> t = StrTokenize<wchar_t>(kanjidicStr, L" ");
876 wchar_t c, c2;
878 /* Special processing for the first 2 entries of the line. */
879 if(t.size()>1) {
880 /* header = "<h1><font size=\"-6\">" + args[0] + "</font></h1>"; */
881 /*header.append(L"<p style=\"font-size:32pt\">") */
882 header << L"<p><font size=\"7\">" << t.front() << L"</font></p>";
883 t.pop_front();
884 lowRelevance.append(L"<li>JIS code: 0x")
885 .append(t.front())
886 .append(L"</li>");
887 t.pop_front();
890 /* NEW! Temporary code for loading in SODs and SODAs from KanjiCafe! */
891 if(options & (KDO_SOD_STATIC | KDO_SOD_ANIM) != 0) {
892 string utfStr;
893 /* Get a UTF8-encoded string for the kanji. */
894 utfStr = utfconv_wm(kanjidicStr.substr(0,1));
896 /* Convert to a low-to-high-byte hex string. */
897 ostringstream ss;
898 for(unsigned int i=0;i<utfStr.length();i++) {
899 ss << hex << setw(2) << setfill('0')
900 << (unsigned int)((unsigned char)utfStr[i]);
903 wstringstream sod;
904 /* Load static SOD, if present */
905 if((options & KDO_SOD_STATIC) != 0) {
906 Preferences* p = Preferences::Get();
907 ostringstream fn;
908 string sodDir = p->GetSetting("sod_dir");
909 if(sodDir.length()==0) sodDir = "sods";
910 fn << sodDir << DSCHAR
911 << "sod-utf8-hex" << DSCHAR
912 << ss.str() << ".png";
914 #ifdef DEBUG
915 printf("DEBUG: Checking for existance of file \"%s\"...\n",
916 fn.str().c_str());
917 #endif
918 ifstream f(fn.str().c_str());
919 if(f.is_open()) {
920 f.close();
921 if(sod.str().length()>0) sod << L"<br />";
922 sod << L"<img src=\"" << utfconv_mw(fn.str()) << L"\" />";
925 /* Load animated SOD, if present */
926 if((options & KDO_SOD_ANIM) != 0) {
927 ostringstream fn;
928 fn << "sods" << DSCHAR
929 << "soda-utf8-hex" << DSCHAR
930 << ss.str() << ".gif";
931 #ifdef DEBUG
932 printf("DEBUG: Checking for existance of file \"%s\"...\n",
933 fn.str().c_str());
934 #endif
935 ifstream f(fn.str().c_str());
936 if(f.is_open()) {
937 f.close();
938 if(sod.str().length()>0) sod << L"<br />";
939 sod << L"<img src=\"" << utfconv_mw(fn.str()) << L"\" />";
942 /* Append the chart(s) in a paragraph object. */
943 if(sod.str().length()>0) {
944 header << L"<p>" << sod.str() <<
945 L"<br /><font size=\"1\">(Kanji stroke order graphics "
946 L"used under license from KanjiCafe.com.)</font></p>";
950 while(t.size()>0) {
951 token = t.front();
952 t.pop_front();
953 sTemp = token;
954 c = sTemp[0];
955 /* If a preceding character is detected, strip it */
956 if(c == L'(' || c == L'〜') {
957 sTemp = sTemp.substr(1);
958 c = sTemp[0];
960 if(tmode==0) {
961 if(IsKatakana(c)) {
962 /* Onyomi reading detected */
963 /*if(onyomi.length()>0) onyomi.append(L" "); */
964 if(onyomi.length()>0) onyomi.append(L"&nbsp; ");
965 onyomi.append(token); /* Copy the original string, including ()'s and 〜's */
966 continue;
968 else if(IsHiragana(c)) {
969 /* Kunyomi reading detected */
970 if(kunyomi.length()>0) kunyomi.append(L"&nbsp; ");
971 kunyomi.append(token); /* Copy the original string, including ()'s and 〜's */
972 continue;
974 } else if(tmode==1) {
975 if(IsFurigana(c)) {
976 /* Nanori reading detected */
977 if(nanori.length()>0) nanori.append(L"&nbsp; ");
978 nanori.append(token); /* Copy the original string, including ()'s and 〜's */
979 continue;
981 } else if(tmode==2) {
982 if(IsFurigana(c)) {
983 /* Special radical reading detected */
984 if(radicalReading.length()>0) radicalReading.append(L"&nbsp; ");
985 radicalReading.append(token);
986 continue;
989 if(c == L'{') {
990 /* English meaning detected
991 Special handling is needed to take care of spaces, though.
992 We'll "cheat" and mess with our iterator a bit if a space is detected. */
993 while(t.size()>0 && sTemp[sTemp.length()-1] != L'}') {
994 sTemp.append(L" ").append(t.front());
995 t.pop_front();
997 if(english.length()>0) english.append(L", ");
998 english.append(sTemp.substr(1,sTemp.length()-2)); /* Strip the {} */
1000 else {
1001 switch(c) {
1002 case L'T': /* Change "t mode" */
1003 /*wstring(sTemp.substr(1)).ToLong(&tmode);*/
1004 wistringstream(sTemp.substr(1)) >> tmode;
1005 #ifdef DEBUG
1006 if(tmode>2) printf("WARNING: T-mode set to %d.\nT-modes above 2 are not currently documented!", (int)tmode);
1007 #endif
1008 break;
1009 case L'B': /* Bushu radical */
1010 lowRelevance.append(L"<li>Bushu radical: ").append(sTemp.substr(1)).append(L"</li>");
1011 break;
1012 case L'C': /* Classical radical */
1013 lowRelevance.append(L"<li>Classical radical: ").append(sTemp.substr(1)).append(L"</li>");
1014 break;
1015 case L'F': /* Frequency */
1016 /*wstring(sTemp.substr(1)).ToLong(&frequency);*/
1017 wistringstream(sTemp.substr(1)) >> frequency;
1018 break;
1019 case L'G': /* Grade level */
1020 /*wstring(sTemp.substr(1)).ToLong(&grade);*/
1021 wistringstream(sTemp.substr(1)) >> grade;
1022 break;
1023 case L'S': /* Stroke count */
1024 if(strokes.length()==0) {
1025 strokes = sTemp.substr(1);
1026 } else if(!strokes.find(L' ')!=wstring::npos) {
1027 strokes.append(L" (Miscounts: ")
1028 .append(sTemp.substr(1))
1029 .append(L")");
1030 } else {
1031 strokes = strokes.substr(0, strokes.length()-1)
1032 .append(L", ")
1033 .append(sTemp.substr(1))
1034 .append(L")");
1036 break;
1037 case L'U': /* Unicode value */
1038 lowRelevance.append(L"<li>Unicode: 0x").append(sTemp.substr(1)).append(L"</li>");
1039 break;
1040 /* From here, it's all dictionary codes */
1041 case L'H':
1042 if((dictionaries & KDD_NJECD)!=0)
1043 dictionaryInfo.append(L"<li>New Japanese-English Character Dictionary (Halpern): ")
1044 .append(sTemp.substr(1)).append(L"</li>");
1045 break;
1046 case L'N':
1047 if((dictionaries & KDD_MRJECD)!=0)
1048 dictionaryInfo.append(L"<li>Modern Reader's Japanese-English Character Dictionary (Nelson): ")
1049 .append(sTemp.substr(1)).append(L"</li>");
1050 break;
1051 case L'V':
1052 if((dictionaries & KDD_NNJECD)!=0)
1053 dictionaryInfo.append(L"<li>The New Nelson's Japanese-English Character Dictionary: ")
1054 .append(sTemp.substr(1)).append(L"</li>");
1055 break;
1056 case L'P':
1057 /* SKIP codes. */
1058 /* Thanks to changes in permissible SKIP code usage (change to
1059 Creative Commons licensing in January 2008), we can now use
1060 this without problems. */
1061 if((dictionaries & KDD_SKIP)!=0)
1062 dictionaryInfo.append(L"<li>SKIP code: ")
1063 .append(sTemp.substr(1)).append(L"</li>");
1064 break;
1065 case L'I': /* Spahn/Hadamitzky dictionaries */
1066 if(sTemp[1]==L'N') {
1067 if((dictionaries & KDD_KK)!=0) {
1068 dictionaryInfo.append(L"<li>Kanji & Kana (Spahn, Hadamitzky): ")
1069 .append(sTemp.substr(2)).append(L"</li>");
1071 } else {
1072 if((dictionaries & KDD_KD)!=0) {
1073 dictionaryInfo.append(L"<li>Kanji Dictionary (Spahn, Hadamitzky): ")
1074 .append(sTemp.substr(1)).append(L"</li>");
1077 break;
1078 case L'Q':
1079 if((dictionaries & KDD_FC)!=0) {
1080 dictionaryInfo.append(L"<li>Four Corner code: ")
1081 .append(sTemp.substr(1)).append(L"</li>");
1083 break;
1084 case L'M':
1085 c2 = sTemp[1];
1086 if(c2==L'N') {
1087 if((dictionaries & KDD_MOROI)!=0) {
1088 dictionaryInfo.append(L"<li>Morohashi Daikanwajiten Index: ")
1089 .append(sTemp.substr(2)).append(L"</li>");
1091 } else if(c2==L'P') {
1092 if((dictionaries & KDD_MOROVP)!=0) {
1093 dictionaryInfo.append(L"<li>Morohashi Daikanwajiten Volume/Page: ")
1094 .append(sTemp.substr(2)).append(L"</li>");
1097 break;
1098 case L'E':
1099 if((dictionaries & KDD_GRJC)!=0) {
1100 dictionaryInfo.append(L"<li>A Guide to Remembering Japanese Characters (Henshal): ")
1101 .append(sTemp.substr(1)).append(L"</li>");
1103 break;
1104 case L'K':
1105 if((dictionaries & KDD_GKD)!=0) {
1106 dictionaryInfo.append(L"<li>Gakken Kanji Dictionary (\"A New Dictionary of Kanji Usage\"): ")
1107 .append(sTemp.substr(1)).append(L"</li>");
1109 break;
1110 case L'L':
1111 if((dictionaries & KDD_RTK)!=0) {
1112 dictionaryInfo.append(L"<li>Remembering the Kanji (Heisig): ")
1113 .append(sTemp.substr(1)).append(L"</li>");
1115 break;
1116 case L'O':
1117 if((dictionaries & KDD_JN)!=0) {
1118 dictionaryInfo.append(L"<li>Japanese Names (O'Neill): ")
1119 .append(sTemp.substr(1)).append(L"</li>");
1121 break;
1122 case L'D':
1123 c2 = sTemp[1];
1124 switch(c2) {
1125 case L'B':
1126 if((dictionaries & KDD_JBP)!=0) {
1127 dictionaryInfo.append(L"<li>Japanese for Busy People (AJLT): ")
1128 .append(sTemp.substr(2)).append(L"</li>");
1130 break;
1131 case L'C':
1132 if((dictionaries & KDD_KWJLP)!=0) {
1133 dictionaryInfo.append(L"<li>The Kanji Way to Japanese Language Power (Crowley): ")
1134 .append(sTemp.substr(2)).append(L"</li>");
1136 break;
1137 case L'F':
1138 if((dictionaries & KDD_JKF)!=0) {
1139 dictionaryInfo.append(L"<li>Japanese Kanji Flashcards (White Rabbit Press): ")
1140 .append(sTemp.substr(2)).append(L"</li>");
1142 break;
1143 case L'G':
1144 if((dictionaries & KDD_KCKG)!=0) {
1145 dictionaryInfo.append(L"<li>Kodansha Compact Kanji Guide: ")
1146 .append(sTemp.substr(2)).append(L"</li>");
1148 break;
1149 case L'H':
1150 if((dictionaries & KDD_GTRWJH)!=0) {
1151 dictionaryInfo.append(L"<li>A Guide To Reading and Writing Japanese (Hensall): ")
1152 .append(sTemp.substr(2)).append(L"</li>");
1154 break;
1155 case L'J':
1156 if((dictionaries & KDD_KIC)!=0) {
1157 dictionaryInfo.append(L"<li>Kanji in Context (Nishiguchi and Kono): ")
1158 .append(sTemp.substr(2)).append(L"</li>");
1160 break;
1161 case L'K':
1162 if((dictionaries & KDD_KLD)!=0) {
1163 dictionaryInfo.append(L"<li>Kanji Learner's Dictionary (Halpern): ")
1164 .append(sTemp.substr(2)).append(L"</li>");
1166 break;
1167 case L'O':
1168 if((dictionaries & KDD_EK)!=0) {
1169 dictionaryInfo.append(L"<li>Essential Kanji (O'Neill): ")
1170 .append(sTemp.substr(2)).append(L"</li>");
1172 break;
1173 case L'R':
1174 if((dictionaries & KDD_DR)!=0) {
1175 dictionaryInfo.append(L"<li>2001 Kanji (De Roo): ")
1176 .append(sTemp.substr(2)).append(L"</li>");
1178 break;
1179 case L'S':
1180 if((dictionaries & KDD_GTRWJS)!=0) {
1181 dictionaryInfo.append(L"<li>A Guide to Reading and Writing Japanese (Sakade): ")
1182 .append(sTemp.substr(2)).append(L"</li>");
1184 break;
1185 case L'T':
1186 if((dictionaries & KDD_TKC)!=0) {
1187 dictionaryInfo.append(L"<li>Tuttle Kanji Cards (Kask): ")
1188 .append(sTemp.substr(2)).append(L"</li>");
1190 break;
1191 default:
1192 if(unhandled.length()>0) unhandled.append(L" ");
1193 unhandled.append(sTemp);
1194 break;
1196 break;
1197 /* Crossreferences and miscodes */
1198 case L'X':
1199 if(crossReferences.length()>0) crossReferences.append(L", ");
1200 crossReferences.append(sTemp.substr(1));
1201 break;
1202 case L'Z':
1203 if(miscodes.length()>0) miscodes.append(L", ");
1204 miscodes.append(sTemp.substr(1));
1205 break;
1206 /* Korean/Pinyin (Chinese) romanization */
1207 case L'W':
1208 if(koreanRomanization.length()>0) koreanRomanization.append(L", ");
1209 koreanRomanization.append(sTemp.substr(1));
1210 break;
1211 case L'Y':
1212 if(pinyinRomanization.length()>0) pinyinRomanization.append(L", ");
1213 pinyinRomanization.append(sTemp.substr(1));
1214 break;
1215 default:
1216 if(unhandled.length()>0) unhandled.append(L" ");
1217 unhandled.append(sTemp);
1218 break;
1221 } /* while(t.HasMoreTokens()) */
1223 if(header.str().length() > 0) result << header.str();
1224 #ifdef DEBUG
1225 printf("DEBUG: header=[%ls]\n", header.str().c_str());
1226 #endif
1227 result << L"<ul>";
1228 if((options & KDO_READINGS) != 0) {
1229 if(onyomi.length() > 0)
1230 result << L"<li>Onyomi Readings: " << onyomi << L"</li>";
1231 if(kunyomi.length() > 0)
1232 result << L"<li>Kunyomi Readings: " << kunyomi << L"</li>";
1233 if(nanori.length() > 0)
1234 result << L"<li>Nanori Readings: " << nanori << L"</li>";
1235 if(radicalReading.length() > 0)
1236 result << L"<li>Special Radical Reading: " << radicalReading <<
1237 L"</li>";
1239 if((options & KDO_MEANINGS) != 0) {
1240 if(english.length() > 0)
1241 result << L"<li>English Meanings: " << english << L"</li>";
1243 if((options & KDO_HIGHIMPORTANCE) != 0) {
1244 if(strokes.length() > 0)
1245 result << L"<li>Stroke count: " << strokes << L"</li>";
1246 else
1247 result << L"<li>Stroke count: not specified in KANJIDIC</li>";
1248 result << L"<li>Grade Level: ";
1249 if(grade<=6 && grade >= 1) { /* Jouyou (Grade #) */
1250 result << L"Jouyou (Grade " << grade << L")";
1251 } else if(grade==8) { /* Jouyou (General usage) */
1252 result << L"Jouyou (General usage)";
1253 } else if(grade==9) { /* Jinmeiyou (Characters for names) */
1254 result << L"Jinmeiyou (Characters for names)";
1255 } else if(grade==-1) { /* No flag specified in kanjidic string */
1256 result << L"Unspecified";
1257 } else {
1258 result << L"Unhandled grade level (Grade " << grade << L")";
1260 result << L"</li>";
1261 if(frequency!=-1)
1262 result << L"<li>Frequency Ranking: " << frequency << L"</li>";
1263 else result << L"<li>Frequency Ranking: Unspecified</li>";
1265 if((options & KDO_DICTIONARIES) != 0) {
1266 if(dictionaryInfo.length()>0)
1267 result << L"<li>Dictionary Codes:<ul>" << dictionaryInfo
1268 << L"</ul></li>";
1270 if((options & KDO_VOCABCROSSREF) != 0) {
1271 vector<wstring> *vList = &(jben->vocabList->GetVocabList());
1272 wchar_t thisKanji = kanjidicStr[0];
1273 vector<wstring> crossRefList;
1274 vector<wstring>::iterator vIt;
1275 for(vIt=vList->begin(); vIt!=vList->end(); vIt++) {
1276 if(vIt->find(thisKanji)!=wstring::npos) {
1277 crossRefList.push_back(*vIt);
1280 if(crossRefList.size()>0) {
1281 result << L"<li>This kanji is used by words in your study list:<br><font size=\"7\">";
1282 vIt = crossRefList.begin();
1283 result << *vIt;
1284 for(++vIt; vIt!=crossRefList.end(); vIt++) {
1285 result << L"&nbsp; " << *vIt;
1287 result << L"</font></li>";
1290 if((options & KDO_LOWIMPORTANCE) != 0) {
1291 if(koreanRomanization.length()>0) lowRelevance.append(L"<li>Korean romanization: ").append(koreanRomanization).append(L"</li>");
1292 if(pinyinRomanization.length()>0) lowRelevance.append(L"<li>Pinyin romanization: ").append(pinyinRomanization).append(L"</li>");
1293 if(crossReferences.length()>0) lowRelevance.append(L"<li>Cross reference codes: ").append(crossReferences).append(L"</li>");
1294 if(miscodes.length()>0) lowRelevance.append(L"<li>Miscodes: ").append(miscodes).append(L"</li>");
1295 if(lowRelevance.length()>0)
1296 result << L"<li>Extra Information:<ul>" << lowRelevance
1297 << L"</ul></li>";
1299 if((options & KDO_UNHANDLED) != 0) {
1300 if(unhandled.length()>0)
1301 result << L"<li>Unhandled: " << unhandled << L"</li>";
1303 result << L"</ul>";
1305 return result.str();
1306 #endif
1307 return wstring();
1310 const BoostHM<wchar_t,KInfo>* KDict::GetHashTable() const {
1311 return &kdictData;
1314 bool KDict::MainDataLoaded() const {
1315 if(kdictData.size()>0) return true;
1316 return false;
1319 const KInfo* KDict::GetEntry(const wchar_t key) const {
1320 BoostHM<wchar_t, KInfo>::const_iterator kci = kdictData.find(key);
1321 if(kci != kdictData.end())
1322 return &(kci->second);
1323 return NULL;;