4 Website: http://www.vultaire.net/software/jben/
5 License: GNU General Public License (GPL) version 2
6 (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>
25 #include "preferences.h"
26 #include "encoding_convert.h"
27 #include "string_utils.h"
28 #include "file_utils.h"
31 #include <libxml/xmlreader.h>
39 # define FALLBACK_DICTDIR "dicts\\"
41 # define FALLBACK_DICTDIR "dicts/"
/* Definition of the class-wide KDict singleton pointer.  It starts out NULL;
   KDict::Get() stores the allocated instance here and KDict::Destroy()
   resets it to NULL after deleting the instance. */
44 KDict
* KDict::kdictSingleton
= NULL
;
/* Zero the numeric fields: both radical numbers first (as unsigned char),
   then grade, stroke count and frequency.
   NOTE(review): the enclosing function header is not visible in this
   listing; presumably this is the body of the KInfo constructor -- confirm. */
47 radical
= radicalNelson
= (unsigned char) 0;
48 grade
= strokeCount
= freq
= 0;
51 const KDict
* KDict::Get() {
53 kdictSingleton
= new KDict
;
54 return kdictSingleton
;
58 Preferences
* p
= Preferences::Get();
60 /* Load KANJIDIC2, if present. */
61 result
= LoadKanjidic2(p
->GetSetting("kdict_kanjidic2").c_str());
62 if(result
!=KD_SUCCESS
)
63 result
= LoadKanjidic2(FALLBACK_DICTDIR
"kanjidic2.xml");
65 /* If KANJIDIC2 is not present, load KANJIDIC and/or KANJD212 */
66 if(result
!=KD_SUCCESS
) {
67 result
= LoadKanjidic(p
->GetSetting("kdict_kanjidic").c_str());
68 if(result
!=KD_SUCCESS
) LoadKanjidic(FALLBACK_DICTDIR
"kanjidic");
70 LoadKanjidic(p
->GetSetting("kdict_kanjd212").c_str(), "jis212");
71 if(result
!=KD_SUCCESS
)
72 LoadKanjidic(FALLBACK_DICTDIR
"kanjd212", "jis212");
75 /* Load supplemental dictionary files */
76 result
= LoadKradfile(p
->GetSetting("kdict_kradfile").c_str());
77 if(result
!=KD_SUCCESS
) LoadKradfile(FALLBACK_DICTDIR
"kradfile");
78 result
= LoadRadkfile(p
->GetSetting("kdict_radkfile").c_str());
79 if(result
!=KD_SUCCESS
) LoadRadkfile(FALLBACK_DICTDIR
"radkfile");
82 void KDict::Destroy() {
84 delete kdictSingleton
;
85 kdictSingleton
= NULL
;
/* Loads a KANJIDIC-format file: the file is opened, its contents are read
   into a heap buffer (rawData) and that buffer is handed to KanjidicParser()
   together with jisStd.  A success message is pushed to the log (el) after
   parsing.
     filename - path of the dictionary file to open.
     jisStd   - passed through unchanged to KanjidicParser().
   returnCode starts as KD_FAILURE and is set to KD_SUCCESS right after
   parsing; presumably it is the value returned.
   NOTE(review): the declarations of rawData, size and oss, the branch
   structure around the KD_FAILURE assignment and the return statement are
   not visible in this listing -- confirm against the full file. */
89 int KDict::LoadKanjidic(const char* filename
, const char* jisStd
) {
92 int returnCode
=KD_FAILURE
;
/* NOTE(review): the stream is opened with ios::ate only (no ios::binary).
   On platforms that translate line endings the number of characters read
   can then differ from the reported size -- presumably the situation the
   warning below reports; confirm. */
94 ifstream
ifile(filename
, ios::ate
); /* "at end" to get our file size */
/* One extra char beyond the file size is allocated. */
98 rawData
= new char[size
+1];
100 ifile
.read(rawData
, size
);
/* Compare the C-string length of what was read with the expected size and
   log a warning when they differ. */
101 if(strlen(rawData
)!=size
) {
104 << "kanjidic file size: "
106 << ", read-in string: "
108 el
.Push(EL_Warning
, oss
.str());
111 /* Create the kanjidic object with our string data. */
112 this->KanjidicParser(rawData
, jisStd
);
114 returnCode
= KD_SUCCESS
;
115 el
.Push(EL_Silent
, string("Kanji dictionary file \"")
116 .append(filename
).append("\" loaded successfully."));
119 returnCode
= KD_FAILURE
;
/* Release the buffer allocated with new[] above. */
121 if(rawData
) delete[] rawData
;
125 int KDict::LoadKanjidic2(const char* filename
) {
126 int returnCode
= KD_FAILURE
;
127 xmlTextReaderPtr reader
;
131 /* Vars for navigating through the data */
132 string element
, d1element
;
133 map
<string
, string
> attributes
;
134 map
<string
, string
>::iterator mssi
;
136 bool isAttribute
=false;
137 /* Var for storing values of the entries */
142 reader
= xmlNewTextReaderFilename(filename
);
145 ret
= xmlTextReaderRead(reader
);
147 /* Act based on node type */
148 nodeType
= xmlTextReaderNodeType(reader
);
150 case XML_READER_TYPE_ELEMENT
:
151 element
= (char*)xmlTextReaderName(reader
);
152 if(xmlTextReaderDepth(reader
)==1) d1element
=element
;
153 if(element
=="character") {
154 /* Opening of character entry - create new data object */
160 case XML_READER_TYPE_END_ELEMENT
:
161 element
= (char*)xmlTextReaderName(reader
);
162 if(element
=="character") {
163 wchar_t wc
= utfconv_mw(k
->literal
)[0];
164 /* End of character entry: append to data list */
165 if(!kdictData
.assign(wc
, *k
)) {
168 << "Error assigning kanjidic2 entry to hash table!";
169 el
.Push(EL_Error
, oss
.str());
176 case XML_READER_TYPE_ATTRIBUTE
:
177 temp
= (char*)xmlTextReaderName(reader
);
178 ptr
= xmlTextReaderValue(reader
);
179 attributes
[temp
] = (char*)ptr
;
182 case XML_READER_TYPE_TEXT
:
183 ptr
= xmlTextReaderValue(reader
);
187 if(d1element
=="header") {
188 if(element
=="file_version") {
192 << "Warning: the KANJIDIC2 reader only"
193 " supports KANJIDIC2 version 4!";
194 el
.Push(EL_Warning
, oss
.str());
198 if(d1element
=="character") {
201 oss
<< ERR_PREF
<< "k is NULL!";
202 el
.Push(EL_Error
, oss
.str());
203 } else if(element
=="literal") {
205 } else if(element
=="cp_value") {
206 k
->codepoint
[attributes
["cp_type"]] = sValue
;
207 } else if(element
=="rad_value") {
208 temp
= attributes
["rad_type"];
209 if(temp
== "classical")
211 = (unsigned char)atoi(sValue
.c_str());
212 else if(temp
== "nelson_c")
214 = (unsigned char)atoi(sValue
.c_str());
218 << "Unhandled radical: "
220 << ", value=[" << sValue
222 el
.Push(EL_Error
, oss
.str());
224 } else if(element
=="grade") {
225 k
->grade
= (unsigned char)atoi(sValue
.c_str());
226 } else if(element
=="stroke_count") {
227 k
->strokeCount
= (unsigned char)atoi(sValue
.c_str());
228 } else if(element
=="variant") {
229 k
->variant
[attributes
["var_type"]] = sValue
;
230 } else if(element
=="freq") {
231 k
->freq
= atoi(sValue
.c_str());
232 } else if(element
=="rad_name") {
233 k
->radicalName
= sValue
;
234 } else if(element
=="dic_ref") {
235 k
->dictCode
[attributes
["dr_type"]] = sValue
;
236 if(attributes
["dr_type"]=="moro"
237 && attributes
["m_vol"].length()>0) {
239 temp
.append(attributes
["m_vol"]);
241 temp
.append(attributes
["m_page"]);
242 k
->dictCode
["moro"].append(temp
);
244 } else if(element
=="q_code") {
245 if(attributes
["qc_type"]=="skip"
246 && attributes
["skip_misclass"].length()>0) {
247 k
->skipMisclass
.push_back(
249 attributes
["skip_misclass"],
252 k
->queryCode
[attributes
["qc_type"]] = sValue
;
254 } else if(element
=="reading") {
255 temp
= attributes
["r_type"];
257 k
->pinyin
.push_back(sValue
);
258 } else if(temp
=="korean_r") {
259 k
->korean_r
.push_back(sValue
);
260 } else if(temp
=="korean_h") {
261 k
->korean_h
.push_back(sValue
);
262 } else if(temp
=="ja_on") {
263 /* Need to handle r_status and on_type! */
264 /* Need to convert xx.x to xx(x) notation. */
265 k
->onyomi
.push_back(sValue
);
266 } else if(temp
=="ja_kun") {
267 /* Need to handle r_status! */
268 /* Need to convert xx.x to xx(x) notation. */
269 k
->kunyomi
.push_back(sValue
);
272 oss
<< ERR_PREF
<< "Invalid r_type: " << temp
;
273 el
.Push(EL_Error
, oss
.str());
275 /* This section is "to-do" */
276 } else if(element
=="meaning") {
277 temp
= attributes
["m_lang"];
278 if(temp
.length()==0) temp
= "en";
279 k
->meaning
[temp
].push_back(sValue
);;
280 } else if(element
=="nanori") {
281 k
->nanori
.push_back(sValue
);
284 oss
<< ERR_PREF
<< "UNHANDLED element: " << element
;
285 el
.Push(EL_Error
, oss
.str());
288 /* default parsing */
290 /*cout << "DEBUG: Depth 1 element is " << d1element
291 << ", element is " << element
292 << ", value is " << sValue << endl;*/
300 /* If element has attributes, go to the next attribute if present.
301 Otherwise, go to the next element. */
302 if(!isAttribute
) ret
= xmlTextReaderHasAttributes(reader
);
303 if(isAttribute
|| ret
==1) {
304 ret
= xmlTextReaderMoveToNextAttribute(reader
);
306 /* ret==-1 is an error */
310 << "xmlTextReaderMoveToNextAttribute returned an error!";
311 el
.Push(EL_Error
, oss
.str());
313 /* If ret==1, an attribute was loaded.
314 If not, go to the next element. */
319 ret
= xmlTextReaderRead(reader
);
322 xmlFreeTextReader(reader
);
326 << "Parsing error occurred in " << filename
<< ".";
327 el
.Push(EL_Error
, oss
.str());
330 returnCode
= KD_SUCCESS
;
331 el
.Push(EL_Silent
, string("Kanji dictionary file \"")
332 .append(filename
).append("\" loaded successfully."));
333 } else return returnCode
;
337 oss
<< ERR_PREF
<< ": k is not NULL! This shouldn't happen!";
338 el
.Push(EL_Error
, oss
.str());
346 int KDict::LoadKradfile(const char* filename
) {
347 int returnCode
= KD_FAILURE
;
349 ifstream
f(filename
, ios::in
|ios::binary
);
355 StrTokenize
<wchar_t>(utfconv_mw(sb
.str()), L
"\n");
356 while(data
.size()>0) {
357 wstring token
= data
.front();
359 if(token
.length()>0 && token
[0]!=L
'#') {
360 /* KRADFILE-specific stuff here */
361 /* Get rid of the spaces in the string */
362 token
= TextReplace
<wchar_t>(token
, L
" ", L
"");
363 /* Now we can easily pull in the data */
364 if(!kradData
.assign(token
[0], token
.substr(2))) {
366 oss
<< ERR_PREF
<< "KRADFILE: Error assigning ("
367 << utfconv_wm(token
.substr(0,1)) << ", "
368 << utfconv_wm(token
.substr(2)) << ") to hash table!";
369 el
.Push(EL_Error
, oss
.str());
374 returnCode
= KD_SUCCESS
;
375 el
.Push(EL_Silent
, string("Kanji dictionary file \"")
376 .append(filename
).append("\" loaded successfully."));
/* Loads a RADKFILE.  The file contents (held in sb) are converted with
   utfconv_mw() and split on '$'; each resulting entry that is non-empty and
   does not start with '#' is split into two parts at the first '\n'.  The
   first part is tokenized on spaces to obtain the key and the stroke count;
   the second part, with newlines and spaces removed, is the value.  The
   function stores key -> value in radkData and key -> strokeCount in
   radkDataStrokes, logging an error through el when an assignment fails.
     filename - path of the RADKFILE to open.
   returnCode starts as KD_FAILURE and is set to KD_SUCCESS after the loop;
   presumably it is the value returned.
   NOTE(review): the declarations of sb, data, wiss, key, strokeCount, value
   and oss, the code that fills sb and advances data, and the return
   statement are not visible in this listing -- confirm against the full
   file. */
381 int KDict::LoadRadkfile(const char* filename
) {
382 int returnCode
= KD_FAILURE
;
384 ifstream
f(filename
, ios::in
|ios::binary
);
389 /* RADKFILE entries all start with $.
390 Split on $, and discard the first entry since it is the explanation
391 preceding the first entry. */
393 StrTokenize
<wchar_t>(utfconv_mw(sb
.str()), L
"$");
396 while(data
.size()>0) {
397 wstring entry
= data
.front();
399 if(entry
.length()>0 && entry
[0]!=L
'#') {
400 /* RADKFILE-specific stuff here */
401 list
<wstring
> entryData
=
402 StrTokenize
<wchar_t>(entry
, L
"\n", false, 2);
/* An entry must split into exactly a key line and a character block.
   NOTE(review): the message below opens a quote before the entry text
   (" for entry \"") but the trailing literal "!!" never closes it; it
   presumably should read "\"!!". */
403 if(entryData
.size()!=2) {
406 << "Error: entryData.size() == " << entryData
.size()
407 << " for entry \"" << utfconv_wm(entry
) << "!!";
408 el
.Push(EL_Error
, oss
.str());
413 /* entryData.front() contains our key.
414 It's a space delimited string,
415 first token is our kanji, second is the stroke count.
416 A third token may be present, but is irrelevant. */
417 list
<wstring
> keyData
=
418 StrTokenize
<wchar_t>(entryData
.front(), L
" ");
/* NOTE(review): wiss is loaded from keyData.front() twice; the lines between
   the two calls are not visible here.  Presumably the first token is
   extracted and popped in between -- confirm, and confirm that keyData is
   known to hold at least two tokens before front() is used. */
420 wiss
.str(keyData
.front());
423 wiss
.str(keyData
.front());
426 /* entryData.back() contains the characters our key
428 /* Get rid of the spaces in the string */
429 value
= entryData
.back();
430 value
= TextReplace
<wchar_t>(value
, L
"\n", L
"");
431 value
= TextReplace
<wchar_t>(value
, L
" ", L
"");
/* Store the character list for this key. */
433 if(!radkData
.assign(key
, value
)) {
435 oss
<< ERR_PREF
<< "RADKFILE: Error assigning ("
436 << utfconv_wm(wstring().append(1,key
)) << ", "
437 << utfconv_wm(value
) << ") to hash table!";
438 el
.Push(EL_Error
, oss
.str());
/* Store the stroke count for this key. */
440 if(!radkDataStrokes
.assign(key
, strokeCount
)) {
442 oss
<< ERR_PREF
<< "RADKFILE: Error assigning ("
443 << utfconv_wm(wstring().append(1,key
))
444 << ", " << strokeCount
<< ") to hash table!";
445 el
.Push(EL_Error
, oss
.str());
451 returnCode
= KD_SUCCESS
;
452 el
.Push(EL_Silent
, string("Kanji dictionary file \"")
453 .append(filename
).append("\" loaded successfully."));
/* Formats a JIS code in ku-ten style: the high byte of i minus 0x20, a '-',
   then the low byte of i minus 0x20, all written into the stringstream ss
   that was constructed from jisHex.
     jisHex - the JIS code as text; presumably hexadecimal digits, as the
              name suggests.
   NOTE(review): the declaration of i, the code that parses jisHex into i and
   the return statement are not visible in this listing -- confirm how ss is
   reset between reading and writing and in which base the numbers are
   printed. */
458 string
JisHexToKuten(const string
& jisHex
) {
460 stringstream
ss(jisHex
);
463 ss
<< (((i
& 0xFF00) >> 8) - 0x20)
464 << '-' << ((i
& 0xFF) - 0x20);
468 /* This function converts from KANJIDIC-style entries to internally used
469 KInfo objects (which are structurally based off the newer KANJIDIC2). */
470 void KDict::KanjidicToKInfo(const string
& kanjidicEntry
,
471 KInfo
& k
, const char* jisStd
) {
472 list
<string
> tl
= StrTokenize
<char>(kanjidicEntry
, " ");
473 if(tl
.size()<2) return; /* KANJIDIC entries must AT LEAST have the char
474 and the JIS hex code. */
480 /* First 2 fields are always the same: process them here */
481 k
.literal
= tl
.front(); tl
.pop_front();
482 /* JIS code needs to be converted to ku-ten
483 format to coincide with KANJIDIC2. */
484 k
.codepoint
[jisStd
] = JisHexToKuten(tl
.front()); tl
.pop_front();
486 /* Now, just loop through the remaining entries in the list. */
491 case 'T': /* Change "t mode" */
492 tmode
= atoi(ps
->substr(1).c_str());
494 case 'B': /* Nelson-reclassified radical */
495 k
.radicalNelson
= (unsigned char)atoi(ps
->substr(1).c_str());
497 case 'C': /* Classical radical (KangXi Zidian) */
498 k
.radical
= (unsigned char)atoi(ps
->substr(1).c_str());
500 case 'F': /* Frequency */
501 k
.freq
= atoi(ps
->substr(1).c_str());
503 case 'G': /* Grade level */
504 k
.grade
= atoi(ps
->substr(1).c_str());
506 case 'S': /* Stroke count */
508 k
.strokeCount
= atoi(ps
->substr(1).c_str());
510 k
.misstrokes
.push_back(atoi(ps
->substr(1).c_str()));
512 case 'U': /* Unicode value */
513 k
.codepoint
["ucs"] = ps
->substr(1);
515 /* Dictionary codes for most of the following */
517 /* New Japanese-English Character Dictionary (Halpern) */
518 k
.dictCode
["halpern_njecd"] = ps
->substr(1);
521 /* Modern Reader's Japanese-English Character Dictionary (Nelson) */
522 k
.dictCode
["nelson_c"] = ps
->substr(1);
525 /* The New Nelson's Japanese-English Character Dictionary */
526 k
.dictCode
["nelson_n"] = ps
->substr(1);
530 /* Thanks to changes in permissible SKIP code usage (change to
531 Creative Commons licensing in January 2008), we can now use
532 this without problems. */
533 k
.queryCode
["skip"] = ps
->substr(1);
535 case 'I': /* Spahn/Hadamitzky dictionaries */
537 /* Kanji & Kana (Spahn, Hadamitzky) */
538 k
.dictCode
["sh_kk"] = ps
->substr(2);
540 /* Query Code: Kanji Dictionary (Spahn, Hadamitzky) */
541 k
.queryCode
["sh_desc"] = ps
->substr(1);
545 /* Four Corner code */
546 k
.queryCode
["four_corner"] = ps
->substr(1);
550 /* Morohashi Daikanwajiten Index */
551 k
.dictCode
["moro"].insert(0, ps
->substr(2));
552 } else if((*ps
)[1]=='P') {
553 /* Morohashi Daikanwajiten Volume/Page */
556 .append(ps
->substr(2));
560 /* A Guide to Remembering Japanese Characters (Henshall) */
561 k
.dictCode
["henshall"] = ps
->substr(1);
564 /* Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage") */
565 k
.dictCode
["gakken"] = ps
->substr(1);
568 /* Remembering the Kanji (Heisig) */
569 k
.dictCode
["heisig"] = ps
->substr(1);
572 /* Japanese Names (O'Neill) */
573 k
.dictCode
["oneill_names"] = ps
->substr(1);
578 /* Japanese for Busy People (AJLT) */
579 k
.dictCode
["busy_people"] = ps
->substr(1);
582 /* The Kanji Way to Japanese Language Power (Crowley) */
583 k
.dictCode
["crowley"] = ps
->substr(1);
586 /* Japanese Kanji Flashcards (White Rabbit Press) */
587 k
.dictCode
["jf_cards"] = ps
->substr(1);
590 /* Kodansha Compact Kanji Guide */
591 k
.dictCode
["kodansha_compact"] = ps
->substr(1);
594 /* A Guide To Reading and Writing Japanese (Henshall) */
595 k
.dictCode
["henshall3"] = ps
->substr(1);
598 /* Kanji in Context (Nishiguchi and Kono) */
599 k
.dictCode
["kanji_in_context"] = ps
->substr(1);
602 /* Kodansha Kanji Learner's Dictionary (Halpern) */
603 k
.dictCode
["halpern_kkld"] = ps
->substr(1);
606 /* Essential Kanji (O'Neill) */
607 k
.dictCode
["oneill_kk"] = ps
->substr(1);
610 /* Query Code: 2001 Kanji (De Roo) */
611 k
.queryCode
["deroo"] = ps
->substr(1);
614 /* A Guide to Reading and Writing Japanese (Sakade) */
615 k
.dictCode
["sakade"] = ps
->substr(1);
618 /* Tuttle Kanji Cards (Kask) */
619 k
.dictCode
["tutt_cards"] = ps
->substr(1);
624 oss
<< ERR_PREF
<< "Unhandled: " << *ps
;
625 el
.Push(EL_Error
, oss
.str());
630 /* Crossreferences and miscodes */
635 k
.variant
["deroo"]=ps
->substr(2);
639 k
.variant
["njecd"]=ps
->substr(2);
643 k
.variant
["s_h"]=ps
->substr(2);
646 /* XJ# = JIS hex code: 0=jis208, 1=jis212, 2=jis213 */
649 k
.variant
["jis208"]=JisHexToKuten(ps
->substr(3));
652 k
.variant
["jis212"]=JisHexToKuten(ps
->substr(3));
655 k
.variant
["jis213"]=JisHexToKuten(ps
->substr(3));
661 k
.variant
["nelson_c"]=ps
->substr(2);
665 k
.variant
["oneill"]=ps
->substr(2);
670 oss
<< ERR_PREF
<< "Unknown entry \"" << *ps
<< "\" found!";
671 el
.Push(EL_Error
, oss
.str());
676 sTemp
= ps
->substr(0,3);
678 k
.skipMisclass
.push_back(
679 pair
<string
,string
>("stroke_and_posn", ps
->substr(3)));
680 else if(sTemp
== "ZPP") {
681 k
.skipMisclass
.push_back(
682 pair
<string
,string
>("posn", ps
->substr(3)));
683 } else if(sTemp
== "ZRP") {
684 k
.skipMisclass
.push_back(
685 pair
<string
,string
>("stroke_diff", ps
->substr(3)));
686 } else if(sTemp
== "ZSP") {
687 k
.skipMisclass
.push_back(
688 pair
<string
,string
>("stroke_count", ps
->substr(3)));
691 oss
<< ERR_PREF
<< "Unknown entry \"" << *ps
<< "\" found!";
692 el
.Push(EL_Error
, oss
.str());
695 /* Korean/Pinyin (Chinese) romanization */
697 k
.korean_r
.push_back(ps
->substr(1));
700 k
.pinyin
.push_back(ps
->substr(1));
705 /* Make sure we grab the whole meaning entry - pop more tokens and
706 append if necessary. */
707 while(*(sTemp
.rbegin()) != '}') {
709 if(tl
.size()==0) break;
710 sTemp
.append(1, ' ');
711 sTemp
.append(tl
.front());
713 if(*(sTemp
.rbegin()) != '}') {
714 /* Shouldn't happen, but I want to be safe. */
716 oss
<< ERR_PREF
<< "Unable to find ending '}' character!\n"
717 << "Entry responsible: [" << kanjidicEntry
<< "]";
718 el
.Push(EL_Error
, oss
.str());
719 /* Strip only the starting {, since } is not present. */
720 sTemp
= sTemp
.substr(1, sTemp
.length()-1);
722 /* Strip {} from around the string. */
723 sTemp
= sTemp
.substr(1, sTemp
.length()-2);
725 k
.meaning
["en"].push_back(sTemp
);
730 /* Check for readings */
731 /* The first character may be 〜, but if so, it -will- be
732 followed by a kana character. */
733 wsTemp
= utfconv_mw(*ps
);
735 cKanaTest
= wsTemp
[1];
736 else cKanaTest
= wsTemp
[0];
738 if(IsHiragana(cKanaTest
)) {
739 k
.kunyomi
.push_back(*ps
);
740 } else if(IsKatakana(cKanaTest
)) {
741 k
.onyomi
.push_back(*ps
);
745 << "UNHANDLED entry \"" << *ps
<< "\" encountered!";
746 el
.Push(EL_Error
, oss
.str());
751 k
.nanori
.push_back(*ps
);
760 << "Unknown tmode value (" << tmode
<< ") encountered!";
761 el
.Push(EL_Error
, oss
.str());
771 /* This could be sped up: copy the first UTF-8 character into a string, then
772 run a conversion on that. Trivial though. */
773 void KDict::KanjidicParser(char* kanjidicRawData
, const char* jisStd
) {
774 char* token
= strtok(kanjidicRawData
, "\n");
777 if( (strlen(token
)>0) && (token
[0]!='#') ) {
778 wToken
= utfconv_mw(token
);
779 /* Convert token to proper format */
780 wToken
= ConvertKanjidicEntry(wToken
);
781 /* Create new KInfo object.
782 If one already exists for this character, copy over the
785 BoostHM
<wchar_t, KInfo
>::iterator it
= kdictData
.find(wToken
[0]);
786 if(it
!=kdictData
.end()) k
= it
->second
;
787 /* Fill the KInfo structure */
788 KanjidicToKInfo(utfconv_wm(wToken
), k
, jisStd
);
790 /* Add to hash table */
791 if(!kdictData
.assign(wToken
[0], k
)) {
793 string temp
= utfconv_wm(wToken
);
794 oss
<< ERR_PREF
<< "Error assigning (" << temp
[0]<< ", "
795 << temp
<< ") to hash table!";
796 el
.Push(EL_Error
, oss
.str());
799 token
= strtok(NULL
, "\n");
804 /* Currently: nothing here. */
808 * Performs transformations on a KANJIDIC string for our internal usage.
809 * Currently, this includes the following:
810 * - Changing あ.いう notation to あ(いう), a la JWPce/JFC.
811 * - Changing -あい notation to 〜あい, also a la JWPce/JFC.
813 wstring
KDict::ConvertKanjidicEntry(const wstring
& s
) {
814 size_t index
, lastIndex
;
817 /* First conversion: あ.いう to あ(いう) */
818 index
= temp
.find(L
'.', 0);
819 while(index
!=wstring::npos
) {
820 /* Proceed if the character preceding the "." is hiragana/katakana. */
821 if(IsFurigana(temp
[index
-1])) {
823 index
= temp
.find(L
' ', index
+1);
824 if(index
==wstring::npos
) {
825 temp
.append(1, L
')');
828 temp
.insert(index
, 1, L
')');
831 index
= temp
.find(L
'.', lastIndex
+1);
834 /* Second conversion: - to 〜, when a neighboring
835 character is hiragana/katakana */
836 index
= temp
.find(L
'-', 0);
837 while(index
!=wstring::npos
) {
838 /* Proceed if the character before or after
839 the "-" is hiragana/katakana. */
840 if(IsFurigana(temp
[index
-1]) || IsFurigana(temp
[index
+1]))
844 index
= temp
.find(L
'-', lastIndex
+1);
847 /* Return the converted string */
851 wstring
KDict::KInfoToHtml(const KInfo
& kInfo
) {
852 Preferences
* prefs
= Preferences::Get();
853 return KInfoToHtml(kInfo
,
854 prefs
->kanjidicOptions
,
855 prefs
->kanjidicDictionaries
);
858 wstring
KDict::KInfoToHtml(const KInfo
& kInfo
,
859 long options
, long dictionaries
) {
860 /* return wstring(L"<p>")
863 #warning KInfoToHtml currently is unimplemented!
865 wostringstream result
;
866 wostringstream header
;
867 wstring onyomi
, kunyomi
, nanori
, radicalReading
, english
;
868 wstring dictionaryInfo
;
869 wstring lowRelevance
;
871 long grade
= -1, frequency
= -1, tmode
= 0;
873 wstring koreanRomanization
, pinyinRomanization
, crossReferences
, miscodes
;
874 wstring sTemp
, token
;
875 list
<wstring
> t
= StrTokenize
<wchar_t>(kanjidicStr
, L
" ");
878 /* Special processing for the first 2 entries of the line. */
880 /* header = "<h1><font size=\"-6\">" + args[0] + "</font></h1>"; */
881 /*header.append(L"<p style=\"font-size:32pt\">") */
882 header
<< L
"<p><font size=\"7\">" << t
.front() << L
"</font></p>";
884 lowRelevance
.append(L
"<li>JIS code: 0x")
890 /* NEW! Temporary code for loading in SODs and SODAs from KanjiCafe! */
891 if(options
& (KDO_SOD_STATIC
| KDO_SOD_ANIM
) != 0) {
893 /* Get a UTF8-encoded string for the kanji. */
894 utfStr
= utfconv_wm(kanjidicStr
.substr(0,1));
896 /* Convert to a low-to-high-byte hex string. */
898 for(unsigned int i
=0;i
<utfStr
.length();i
++) {
899 ss
<< hex
<< setw(2) << setfill('0')
900 << (unsigned int)((unsigned char)utfStr
[i
]);
904 /* Load static SOD, if present */
905 if((options
& KDO_SOD_STATIC
) != 0) {
906 Preferences
* p
= Preferences::Get();
908 string sodDir
= p
->GetSetting("sod_dir");
909 if(sodDir
.length()==0) sodDir
= "sods";
910 fn
<< sodDir
<< DSCHAR
911 << "sod-utf8-hex" << DSCHAR
912 << ss
.str() << ".png";
915 printf("DEBUG: Checking for existance of file \"%s\"...\n",
918 ifstream
f(fn
.str().c_str());
921 if(sod
.str().length()>0) sod
<< L
"<br />";
922 sod
<< L
"<img src=\"" << utfconv_mw(fn
.str()) << L
"\" />";
925 /* Load animated SOD, if present */
926 if((options
& KDO_SOD_ANIM
) != 0) {
928 fn
<< "sods" << DSCHAR
929 << "soda-utf8-hex" << DSCHAR
930 << ss
.str() << ".gif";
932 printf("DEBUG: Checking for existance of file \"%s\"...\n",
935 ifstream
f(fn
.str().c_str());
938 if(sod
.str().length()>0) sod
<< L
"<br />";
939 sod
<< L
"<img src=\"" << utfconv_mw(fn
.str()) << L
"\" />";
942 /* Append the chart(s) in a paragraph object. */
943 if(sod
.str().length()>0) {
944 header
<< L
"<p>" << sod
.str() <<
945 L
"<br /><font size=\"1\">(Kanji stroke order graphics "
946 L
"used under license from KanjiCafe.com.)</font></p>";
955 /* If a preceding character is detected, strip it */
956 if(c
== L
'(' || c
== L
'〜') {
957 sTemp
= sTemp
.substr(1);
962 /* Onyomi reading detected */
963 /*if(onyomi.length()>0) onyomi.append(L" "); */
964 if(onyomi
.length()>0) onyomi
.append(L
" ");
965 onyomi
.append(token
); /* Copy the original string, including ()'s and 〜's */
968 else if(IsHiragana(c
)) {
969 /* Kunyomi reading detected */
970 if(kunyomi
.length()>0) kunyomi
.append(L
" ");
971 kunyomi
.append(token
); /* Copy the original string, including ()'s and 〜's */
974 } else if(tmode
==1) {
976 /* Nanori reading detected */
977 if(nanori
.length()>0) nanori
.append(L
" ");
978 nanori
.append(token
); /* Copy the original string, including ()'s and 〜's */
981 } else if(tmode
==2) {
983 /* Special radical reading detected */
984 if(radicalReading
.length()>0) radicalReading
.append(L
" ");
985 radicalReading
.append(token
);
990 /* English meaning detected
991 Special handling is needed to take care of spaces, though.
992 We'll "cheat" and mess with our iterator a bit if a space is detected. */
993 while(t
.size()>0 && sTemp
[sTemp
.length()-1] != L
'}') {
994 sTemp
.append(L
" ").append(t
.front());
997 if(english
.length()>0) english
.append(L
", ");
998 english
.append(sTemp
.substr(1,sTemp
.length()-2)); /* Strip the {} */
1002 case L
'T': /* Change "t mode" */
1003 /*wstring(sTemp.substr(1)).ToLong(&tmode);*/
1004 wistringstream(sTemp
.substr(1)) >> tmode
;
1006 if(tmode
>2) printf("WARNING: T-mode set to %d.\nT-modes above 2 are not currently documented!", (int)tmode
);
1009 case L
'B': /* Bushu radical */
1010 lowRelevance
.append(L
"<li>Bushu radical: ").append(sTemp
.substr(1)).append(L
"</li>");
1012 case L
'C': /* Classical radical */
1013 lowRelevance
.append(L
"<li>Classical radical: ").append(sTemp
.substr(1)).append(L
"</li>");
1015 case L
'F': /* Frequency */
1016 /*wstring(sTemp.substr(1)).ToLong(&frequency);*/
1017 wistringstream(sTemp
.substr(1)) >> frequency
;
1019 case L
'G': /* Grade level */
1020 /*wstring(sTemp.substr(1)).ToLong(&grade);*/
1021 wistringstream(sTemp
.substr(1)) >> grade
;
1023 case L
'S': /* Stroke count */
1024 if(strokes
.length()==0) {
1025 strokes
= sTemp
.substr(1);
1026 } else if(!strokes
.find(L
' ')!=wstring::npos
) {
1027 strokes
.append(L
" (Miscounts: ")
1028 .append(sTemp
.substr(1))
1031 strokes
= strokes
.substr(0, strokes
.length()-1)
1033 .append(sTemp
.substr(1))
1037 case L
'U': /* Unicode value */
1038 lowRelevance
.append(L
"<li>Unicode: 0x").append(sTemp
.substr(1)).append(L
"</li>");
1040 /* From here, it's all dictionary codes */
1042 if((dictionaries
& KDD_NJECD
)!=0)
1043 dictionaryInfo
.append(L
"<li>New Japanese-English Character Dictionary (Halpern): ")
1044 .append(sTemp
.substr(1)).append(L
"</li>");
1047 if((dictionaries
& KDD_MRJECD
)!=0)
1048 dictionaryInfo
.append(L
"<li>Modern Reader's Japanese-English Character Dictionary (Nelson): ")
1049 .append(sTemp
.substr(1)).append(L
"</li>");
1052 if((dictionaries
& KDD_NNJECD
)!=0)
1053 dictionaryInfo
.append(L
"<li>The New Nelson's Japanese-English Character Dictionary: ")
1054 .append(sTemp
.substr(1)).append(L
"</li>");
1058 /* Thanks to changes in permissible SKIP code usage (change to
1059 Creative Commons licensing in January 2008), we can now use
1060 this without problems. */
1061 if((dictionaries
& KDD_SKIP
)!=0)
1062 dictionaryInfo
.append(L
"<li>SKIP code: ")
1063 .append(sTemp
.substr(1)).append(L
"</li>");
1065 case L
'I': /* Spahn/Hadamitzky dictionaries */
1066 if(sTemp
[1]==L
'N') {
1067 if((dictionaries
& KDD_KK
)!=0) {
1068 dictionaryInfo
.append(L
"<li>Kanji & Kana (Spahn, Hadamitzky): ")
1069 .append(sTemp
.substr(2)).append(L
"</li>");
1072 if((dictionaries
& KDD_KD
)!=0) {
1073 dictionaryInfo
.append(L
"<li>Kanji Dictionary (Spahn, Hadamitzky): ")
1074 .append(sTemp
.substr(1)).append(L
"</li>");
1079 if((dictionaries
& KDD_FC
)!=0) {
1080 dictionaryInfo
.append(L
"<li>Four Corner code: ")
1081 .append(sTemp
.substr(1)).append(L
"</li>");
1087 if((dictionaries
& KDD_MOROI
)!=0) {
1088 dictionaryInfo
.append(L
"<li>Morohashi Daikanwajiten Index: ")
1089 .append(sTemp
.substr(2)).append(L
"</li>");
1091 } else if(c2
==L
'P') {
1092 if((dictionaries
& KDD_MOROVP
)!=0) {
1093 dictionaryInfo
.append(L
"<li>Morohashi Daikanwajiten Volume/Page: ")
1094 .append(sTemp
.substr(2)).append(L
"</li>");
1099 if((dictionaries
& KDD_GRJC
)!=0) {
1100 dictionaryInfo
.append(L
"<li>A Guide to Remembering Japanese Characters (Henshal): ")
1101 .append(sTemp
.substr(1)).append(L
"</li>");
1105 if((dictionaries
& KDD_GKD
)!=0) {
1106 dictionaryInfo
.append(L
"<li>Gakken Kanji Dictionary (\"A New Dictionary of Kanji Usage\"): ")
1107 .append(sTemp
.substr(1)).append(L
"</li>");
1111 if((dictionaries
& KDD_RTK
)!=0) {
1112 dictionaryInfo
.append(L
"<li>Remembering the Kanji (Heisig): ")
1113 .append(sTemp
.substr(1)).append(L
"</li>");
1117 if((dictionaries
& KDD_JN
)!=0) {
1118 dictionaryInfo
.append(L
"<li>Japanese Names (O'Neill): ")
1119 .append(sTemp
.substr(1)).append(L
"</li>");
1126 if((dictionaries
& KDD_JBP
)!=0) {
1127 dictionaryInfo
.append(L
"<li>Japanese for Busy People (AJLT): ")
1128 .append(sTemp
.substr(2)).append(L
"</li>");
1132 if((dictionaries
& KDD_KWJLP
)!=0) {
1133 dictionaryInfo
.append(L
"<li>The Kanji Way to Japanese Language Power (Crowley): ")
1134 .append(sTemp
.substr(2)).append(L
"</li>");
1138 if((dictionaries
& KDD_JKF
)!=0) {
1139 dictionaryInfo
.append(L
"<li>Japanese Kanji Flashcards (White Rabbit Press): ")
1140 .append(sTemp
.substr(2)).append(L
"</li>");
1144 if((dictionaries
& KDD_KCKG
)!=0) {
1145 dictionaryInfo
.append(L
"<li>Kodansha Compact Kanji Guide: ")
1146 .append(sTemp
.substr(2)).append(L
"</li>");
1150 if((dictionaries
& KDD_GTRWJH
)!=0) {
1151 dictionaryInfo
.append(L
"<li>A Guide To Reading and Writing Japanese (Hensall): ")
1152 .append(sTemp
.substr(2)).append(L
"</li>");
1156 if((dictionaries
& KDD_KIC
)!=0) {
1157 dictionaryInfo
.append(L
"<li>Kanji in Context (Nishiguchi and Kono): ")
1158 .append(sTemp
.substr(2)).append(L
"</li>");
1162 if((dictionaries
& KDD_KLD
)!=0) {
1163 dictionaryInfo
.append(L
"<li>Kanji Learner's Dictionary (Halpern): ")
1164 .append(sTemp
.substr(2)).append(L
"</li>");
1168 if((dictionaries
& KDD_EK
)!=0) {
1169 dictionaryInfo
.append(L
"<li>Essential Kanji (O'Neill): ")
1170 .append(sTemp
.substr(2)).append(L
"</li>");
1174 if((dictionaries
& KDD_DR
)!=0) {
1175 dictionaryInfo
.append(L
"<li>2001 Kanji (De Roo): ")
1176 .append(sTemp
.substr(2)).append(L
"</li>");
1180 if((dictionaries
& KDD_GTRWJS
)!=0) {
1181 dictionaryInfo
.append(L
"<li>A Guide to Reading and Writing Japanese (Sakade): ")
1182 .append(sTemp
.substr(2)).append(L
"</li>");
1186 if((dictionaries
& KDD_TKC
)!=0) {
1187 dictionaryInfo
.append(L
"<li>Tuttle Kanji Cards (Kask): ")
1188 .append(sTemp
.substr(2)).append(L
"</li>");
1192 if(unhandled
.length()>0) unhandled
.append(L
" ");
1193 unhandled
.append(sTemp
);
1197 /* Crossreferences and miscodes */
1199 if(crossReferences
.length()>0) crossReferences
.append(L
", ");
1200 crossReferences
.append(sTemp
.substr(1));
1203 if(miscodes
.length()>0) miscodes
.append(L
", ");
1204 miscodes
.append(sTemp
.substr(1));
1206 /* Korean/Pinyin (Chinese) romanization */
1208 if(koreanRomanization
.length()>0) koreanRomanization
.append(L
", ");
1209 koreanRomanization
.append(sTemp
.substr(1));
1212 if(pinyinRomanization
.length()>0) pinyinRomanization
.append(L
", ");
1213 pinyinRomanization
.append(sTemp
.substr(1));
1216 if(unhandled
.length()>0) unhandled
.append(L
" ");
1217 unhandled
.append(sTemp
);
1221 } /* while(t.HasMoreTokens()) */
1223 if(header
.str().length() > 0) result
<< header
.str();
1225 printf("DEBUG: header=[%ls]\n", header
.str().c_str());
1228 if((options
& KDO_READINGS
) != 0) {
1229 if(onyomi
.length() > 0)
1230 result
<< L
"<li>Onyomi Readings: " << onyomi
<< L
"</li>";
1231 if(kunyomi
.length() > 0)
1232 result
<< L
"<li>Kunyomi Readings: " << kunyomi
<< L
"</li>";
1233 if(nanori
.length() > 0)
1234 result
<< L
"<li>Nanori Readings: " << nanori
<< L
"</li>";
1235 if(radicalReading
.length() > 0)
1236 result
<< L
"<li>Special Radical Reading: " << radicalReading
<<
1239 if((options
& KDO_MEANINGS
) != 0) {
1240 if(english
.length() > 0)
1241 result
<< L
"<li>English Meanings: " << english
<< L
"</li>";
1243 if((options
& KDO_HIGHIMPORTANCE
) != 0) {
1244 if(strokes
.length() > 0)
1245 result
<< L
"<li>Stroke count: " << strokes
<< L
"</li>";
1247 result
<< L
"<li>Stroke count: not specified in KANJIDIC</li>";
1248 result
<< L
"<li>Grade Level: ";
1249 if(grade
<=6 && grade
>= 1) { /* Jouyou (Grade #) */
1250 result
<< L
"Jouyou (Grade " << grade
<< L
")";
1251 } else if(grade
==8) { /* Jouyou (General usage) */
1252 result
<< L
"Jouyou (General usage)";
1253 } else if(grade
==9) { /* Jinmeiyou (Characters for names) */
1254 result
<< L
"Jinmeiyou (Characters for names)";
1255 } else if(grade
==-1) { /* No flag specified in kanjidic string */
1256 result
<< L
"Unspecified";
1258 result
<< L
"Unhandled grade level (Grade " << grade
<< L
")";
1262 result
<< L
"<li>Frequency Ranking: " << frequency
<< L
"</li>";
1263 else result
<< L
"<li>Frequency Ranking: Unspecified</li>";
1265 if((options
& KDO_DICTIONARIES
) != 0) {
1266 if(dictionaryInfo
.length()>0)
1267 result
<< L
"<li>Dictionary Codes:<ul>" << dictionaryInfo
1270 if((options
& KDO_VOCABCROSSREF
) != 0) {
1271 vector
<wstring
> *vList
= &(jben
->vocabList
->GetVocabList());
1272 wchar_t thisKanji
= kanjidicStr
[0];
1273 vector
<wstring
> crossRefList
;
1274 vector
<wstring
>::iterator vIt
;
1275 for(vIt
=vList
->begin(); vIt
!=vList
->end(); vIt
++) {
1276 if(vIt
->find(thisKanji
)!=wstring::npos
) {
1277 crossRefList
.push_back(*vIt
);
1280 if(crossRefList
.size()>0) {
1281 result
<< L
"<li>This kanji is used by words in your study list:<br><font size=\"7\">";
1282 vIt
= crossRefList
.begin();
1284 for(++vIt
; vIt
!=crossRefList
.end(); vIt
++) {
1285 result
<< L
" " << *vIt
;
1287 result
<< L
"</font></li>";
1290 if((options
& KDO_LOWIMPORTANCE
) != 0) {
1291 if(koreanRomanization
.length()>0) lowRelevance
.append(L
"<li>Korean romanization: ").append(koreanRomanization
).append(L
"</li>");
1292 if(pinyinRomanization
.length()>0) lowRelevance
.append(L
"<li>Pinyin romanization: ").append(pinyinRomanization
).append(L
"</li>");
1293 if(crossReferences
.length()>0) lowRelevance
.append(L
"<li>Cross reference codes: ").append(crossReferences
).append(L
"</li>");
1294 if(miscodes
.length()>0) lowRelevance
.append(L
"<li>Miscodes: ").append(miscodes
).append(L
"</li>");
1295 if(lowRelevance
.length()>0)
1296 result
<< L
"<li>Extra Information:<ul>" << lowRelevance
1299 if((options
& KDO_UNHANDLED
) != 0) {
1300 if(unhandled
.length()>0)
1301 result
<< L
"<li>Unhandled: " << unhandled
<< L
"</li>";
1305 return result
.str();
1310 const BoostHM
<wchar_t,KInfo
>* KDict::GetHashTable() const {
1314 bool KDict::MainDataLoaded() const {
1315 if(kdictData
.size()>0) return true;
1319 const KInfo
* KDict::GetEntry(const wchar_t key
) const {
1320 BoostHM
<wchar_t, KInfo
>::const_iterator kci
= kdictData
.find(key
);
1321 if(kci
!= kdictData
.end())
1322 return &(kci
->second
);