kdict.cpp

   1 /*
   2 Project: J-Ben
   3 Author:  Paul Goins
   4 Website: http://www.vultaire.net/software/jben/
   5 License: GNU General Public License (GPL) version 2
   6          (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
   7
   8 File: kdict.cpp
   9
  10 This program is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2 of the License, or
  13 (at your option) any later version.
  14
  15 This program is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with this program.  If not, see <http://www.gnu.org/licenses/>
  22 */
  23
  24 #include "jben.h"
  25 #include "kdict.h"
  26 #include "file_utils.h"
  27 #include "jutils.h"
  28 #include "encoding_convert.h"
  29 #include "string_utils.h"
  30 #include "errorlog.h"
  31 #include <sstream>
  32 #include <iomanip>
  33 #include <fstream>
  34 #include <list>
  35 using namespace std;
  36
  37 KDict* KDict::kdictSingleton = NULL;
  38
  39 const KDict *KDict::Get() {
  40         if(!kdictSingleton)
  41                 kdictSingleton = new KDict;
  42         return kdictSingleton;
  43 }
  44
  45 KDict::KDict() {
  46         Preferences *p = Preferences::Get();
  47         LoadKanjidic(p->GetSetting("kdict_kanjidic").c_str());
  48         LoadKradfile(p->GetSetting("kdict_kradfile").c_str());
  49         LoadRadkfile(p->GetSetting("kdict_radkfile").c_str());
  50 }
  51
  52 void KDict::Destroy() {
  53         if(kdictSingleton) {
  54                 delete kdictSingleton;
  55                 kdictSingleton = NULL;
  56         }
  57 }
  58
  59 int KDict::LoadKanjidic(const char *filename) {
  60         char *rawData = NULL;
  61         unsigned int size;
  62         int returnCode=KD_FAILURE;
  63
  64         ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
  65         if(ifile) {
  66                 size = ifile.tellg();
  67                 ifile.seekg(0);
  68                 rawData = new char[size+1];
  69                 rawData[size] = '\0';
  70                 ifile.read(rawData, size);
  71                 if(strlen(rawData)!=size) {
  72                         ostringstream os;
  73                         os << "kanjidic file size: "
  74                            << strlen(rawData)
  75                            << ", read-in string: "
  76                            << size << "\n";
  77                         el.Push(EL_Warning, os.str());
  78                 }
  79
  80                 /* Create the kanjidic object with our string data. */
  81                 this->KanjidicParser(rawData);
  82
  83                 returnCode = KD_SUCCESS;
  84         }
  85         else
  86                 returnCode = KD_FAILURE;
  87
  88         if(rawData) delete[] rawData;
  89         return returnCode;
  90 }
  91
  92 int KDict::LoadKradfile(const char *filename) {
  93         int returnCode = KD_FAILURE;
  94         stringbuf sb;
  95         ifstream f(filename, ios::in|ios::binary);
  96         if(f.is_open()) {
  97                 f >> &sb;
  98                 f.close();
  99
 100                 list<wstring> data =
 101                         StrTokenize<wchar_t>(utfconv_mw(sb.str()), L"\n");
 102                 while(data.size()>0) {
 103                         wstring token = data.front();
 104                         data.pop_front();
 105                         if(token.length()>0 && token[0]!=L'#') {
 106                                 /* KRADFILE-specific stuff here */
 107                                 /* Get rid of the spaces in the string */
 108                                 token = TextReplace<wchar_t>(token, L" ", L"");
 109                                 /* Now we can easily pull in the data */
 110                                 if(!kradData.assign(token[0], token.substr(2))) {
 111                                         ostringstream os;
 112                                         os << "KRADFILE: Error assigning ("
 113                                            << utfconv_wm(token.substr(0,1))
 114                                            << ", "
 115                                            << utfconv_wm(token.substr(2))
 116                                            << ") to hash table!\n";
 117                                         el.Push(EL_Error, os.str());
 118                                 }
 119                         }
 120                 }
 121
 122                 returnCode = KD_SUCCESS;
 123         }
 124         return returnCode;
 125 }
 126
 127 int KDict::LoadRadkfile(const char *filename) {
 128         int returnCode = KD_FAILURE;
 129         stringbuf sb;
 130         ifstream f(filename, ios::in|ios::binary);
 131         if(f.is_open()) {
 132                 f >> &sb;
 133                 f.close();
 134
 135                 /* RADKFILE entries all start with $.
 136                    Split on $, and discard the first entry since it is the explanation
 137                    preceding the first entry. */
 138                 list<wstring> data =
 139                         StrTokenize<wchar_t>(utfconv_mw(sb.str()), L"$");
 140                 data.pop_front();
 141
 142                 while(data.size()>0) {
 143                         wstring entry = data.front();
 144                         data.pop_front();
 145                         if(entry.length()>0 && entry[0]!=L'#') {
 146                                 /* RADKFILE-specific stuff here */
 147                                 list<wstring> entryData =
 148                                         StrTokenize<wchar_t>(entry, L"\n", false, 2);
 149                                 if(entryData.size()!=2) {
 150                                         cerr << "Error: entryData.size() == "
 151                                                  << entryData.size() << "!!" << endl;
 152                                 } else {
 153                                         wchar_t key;
 154                                         int strokeCount;
 155                                         wstring value;
 156                                         /* entryData.front() contains our key.
 157                                            It's a space delimited string,
 158                                            first token is our kanji, second is the stroke count.
 159                                            A third token may be present, but is irrelevant. */
 160                                         list<wstring> keyData =
 161                                                 StrTokenize<wchar_t>(entryData.front(), L" ");
 162                                         wistringstream wiss;
 163                                         wiss.str(keyData.front());
 164                                         wiss >> key;
 165                                         keyData.pop_front();
 166                                         wiss.str(keyData.front());
 167                                         wiss >> strokeCount;
 168
 169                                         /* entryData.back() contains the characters our key
 170                                            maps to. */
 171                                         /* Get rid of the spaces in the string */
 172                                         value = entryData.back();
 173                                         value = TextReplace<wchar_t>(value, L"\n", L"");
 174                                         value = TextReplace<wchar_t>(value, L" ", L"");
 175
 176                                         if(!radkData.assign(key, value)) {
 177                                                 ostringstream os;
 178                                                 os << "RADKFILE: Error assigning ("
 179                                                    << utfconv_wm(wstring().append(1,key))
 180                                                    << ", "
 181                                                    << utfconv_wm(value)
 182                                                    << ") to hash table!\n";
 183                                                 el.Push(EL_Error, os.str());
 184                                         }
 185                                         if(!radkDataStrokes.assign(key, strokeCount)) {
 186                                                 ostringstream os;
 187                                                 os << "RADKFILE: Error assigning ("
 188                                                    << utfconv_wm(wstring().append(1,key))
 189                                                    << ", " << strokeCount << ") to hash table!\n";
 190                                                 el.Push(EL_Error, os.str());
 191                                         }
 192                                 }
 193                         }
 194                 }
 195
 196                 returnCode = KD_SUCCESS;
 197         }
 198         return returnCode;
 199 }
 200
 201 /* This could be sped up: copy the first UTF-8 character into a string, then
 202    run a conversion on that.  Trivial though. */
 203 void KDict::KanjidicParser(char *kanjidicRawData) {
 204         char *token = strtok(kanjidicRawData, "\n");
 205         wstring wToken;
 206         while(token) {
 207                 if( (strlen(token)>0) && (token[0]!='#') ) {
 208                         wToken = utfconv_mw(token);
 209                         /* Convert token to proper format */
 210                         wToken = ConvertKanjidicEntry(wToken);
 211                         /* Add to hash table */
 212                         if(!kanjidicData.assign(wToken[0], token)) {
 213                                 ostringstream os;
 214                                 string temp = utfconv_wm(wToken);
 215                                 os << "Error assigning (" << temp[0]
 216                                    << ", " << temp << ") to hash table!\n";
 217                                 el.Push(EL_Error, os.str());
 218                         }
 219                 }
 220                 token = strtok(NULL, "\n");
 221         }
 222 }
 223
 224 KDict::~KDict() {
 225         /* Currently: nothing here. */
 226 }
 227
 228 /* This function returns a wstring containing the desired line of the
 229    kanjidic hash table.  A conversion from string to wstring is included
 230    in this call since standardstrings are only used for more compressed
 231    internal storage.  This is followed by a slight reformatting of the
 232    string for better presentation. */
 233 wstring KDict::GetKanjidicStr(wchar_t c) const {
 234         BoostHM<wchar_t,string>::iterator it = kanjidicData.find(c);
 235         if(it==kanjidicData.end()) return L"";
 236         wstring s;
 237         s = utfconv_mw(it->second);
 238         return ConvertKanjidicEntry(s);
 239 }
 240
 241 /*
 242  * Performs transformations on a KANJIDIC string for our internal usage.
 243  * Currently, this includes the following:
 244  * - Changing あ.いう notation to あ(いう), a la JWPce/JFC.
 245  * - Changing -あい notation to 〜あい, also a la JWPce/JFC.
 246  */
 247 wstring KDict::ConvertKanjidicEntry(const wstring& s) {
 248         size_t index, lastIndex;
 249         wstring temp = s;
 250
 251         /* First conversion: あ.いう to あ(いう) */
 252         index = temp.find(L'.', 0);
 253         while(index!=wstring::npos) {
 254                 /* Proceed if the character preceding the "." is hiragana/katakana. */
 255                 if(IsFurigana(temp[index-1])) {
 256                         temp[index] = L'(';
 257                         index = temp.find(L' ', index+1);
 258                         if(index==wstring::npos) {
 259                                 temp.append(1, L')');
 260                                 break;
 261                         } else
 262                                 temp.insert(index, 1, L')');
 263                 }
 264                 lastIndex = index;
 265                 index = temp.find(L'.', lastIndex+1);
 266         }
 267
 268         /* Second conversion: - to 〜, when a neighboring character is hiragana/katakana */
 269         index = temp.find(L'-', 0);
 270         while(index!=wstring::npos) {
 271                 /* Proceed if the character before or after the "-" is hiragana/katakana. */
 272                 if(IsFurigana(temp[index-1]) || IsFurigana(temp[index+1]))
 273                         temp[index]=L'〜';
 274
 275                 lastIndex = index;
 276                 index = temp.find(L'-', lastIndex+1);
 277         }
 278
 279         /* Return the converted string */
 280         return temp;
 281 }
 282
 283 wstring KDict::KanjidicToHtml(const wstring& kanjidicStr) {
 284         Preferences *prefs = Preferences::Get();
 285         return KanjidicToHtml(kanjidicStr,
 286                                                   prefs->kanjidicOptions,
 287                                                   prefs->kanjidicDictionaries);
 288 }
 289
 290 wstring KDict::KanjidicToHtml(const wstring& kanjidicStr,
 291                                                           long options, long dictionaries) {
 292 /*      return wstring(L"<p>")
 293                 .append(s[0])
 294                 .append(L"</p>");*/
 295
 296         wostringstream result;
 297         wostringstream header;
 298         wstring onyomi, kunyomi, nanori, radicalReading, english;
 299         wstring dictionaryInfo;
 300         wstring lowRelevance;
 301         wstring unhandled;
 302         long grade = -1, frequency = -1, tmode = 0;
 303         wstring strokes;
 304         wstring koreanRomanization, pinyinRomanization, crossReferences, miscodes;
 305         wstring sTemp, token;
 306         list<wstring> t = StrTokenize<wchar_t>(kanjidicStr, L" ");
 307         wchar_t c, c2;
 308
 309         /* Special processing for the first 2 entries of the line. */
 310         if(t.size()>1) {
 311                 /* header = "<h1><font size=\"-6\">" + args[0] + "</font></h1>"; */
 312                 /*header.append(L"<p style=\"font-size:32pt\">") */
 313                 header << L"<p><font size=\"7\">" << t.front() << L"</font></p>";
 314                 t.pop_front();
 315                 lowRelevance.append(L"<li>JIS code: 0x")
 316                         .append(t.front())
 317                         .append(L"</li>");
 318                 t.pop_front();
 319         }
 320
 321         /* NEW!  Temporary code for loading in SODs and SODAs from KanjiCafe! */
 322         if(options & (KDO_SOD_STATIC | KDO_SOD_ANIM) != 0) {
 323                 string utfStr;
 324                 /* Get a UTF8-encoded string for the kanji. */
 325                 utfStr = utfconv_wm(kanjidicStr.substr(0,1));
 326
 327                 /* Convert to a low-to-high-byte hex string. */
 328                 ostringstream ss;
 329                 for(unsigned int i=0;i<utfStr.length();i++) {
 330                         ss << hex << setw(2) << setfill('0')
 331                            << (unsigned int)((unsigned char)utfStr[i]);
 332                 }
 333
 334                 wstringstream sod;
 335                 /* Load static SOD, if present */
 336                 if((options & KDO_SOD_STATIC) != 0) {
 337                         ostringstream fn;
 338                         fn << "sods" << DIRSEP
 339                            << "sod-utf8-hex" << DIRSEP
 340                            << ss.str() << ".png";
 341 #ifdef DEBUG
 342                         printf("DEBUG: Checking for existance of file \"%s\"...\n", fn.str().c_str());
 343 #endif
 344                         ifstream f(fn.str().c_str());
 345                         if(f.is_open()) {
 346                                 f.close();
 347                                 if(sod.str().length()>0) sod << L"<br />";
 348                                 sod << L"<img src=\"" << utfconv_mw(fn.str()) << L"\" />";
 349                         }
 350                 }
 351                 /* Load animated SOD, if present */
 352                 if((options & KDO_SOD_ANIM) != 0) {
 353                         ostringstream fn;
 354                         fn << "sods" << DIRSEP
 355                            << "soda-utf8-hex" << DIRSEP
 356                            << ss.str() << ".gif";
 357 #ifdef DEBUG
 358                         printf("DEBUG: Checking for existance of file \"%s\"...\n", fn.str().c_str());
 359 #endif
 360                         ifstream f(fn.str().c_str());
 361                         if(f.is_open()) {
 362                                 f.close();
 363                                 if(sod.str().length()>0) sod << L"<br />";
 364                                 sod << L"<img src=\"" << utfconv_mw(fn.str()) << L"\" />";
 365                         }
 366                 }
 367                 /* Append the chart(s) in a paragraph object. */
 368                 if(sod.str().length()>0) {
 369                         header << L"<p>" << sod.str() <<
 370                                 L"<br /><font size=\"1\">(Kanji stroke order graphics used under license from KanjiCafe.com.)</font></p>";
 371                 }
 372         }
 373
 374         while(t.size()>0) {
 375                 token = t.front();
 376                 t.pop_front();
 377                 sTemp = token;
 378                 c = sTemp[0];
 379                 /* If a preceding character is detected, strip it */
 380                 if(c == L'(' || c == L'〜') {
 381                         sTemp = sTemp.substr(1);
 382                         c = sTemp[0];
 383                 }
 384                 if(tmode==0) {
 385                         if(IsKatakana(c)) {
 386                                 /* Onyomi reading detected */
 387                                 /*if(onyomi.length()>0) onyomi.append(L"　"); */
 388                                 if(onyomi.length()>0) onyomi.append(L"&nbsp; ");
 389                                 onyomi.append(token);   /* Copy the original string, including ()'s and 〜's */
 390                                 continue;
 391                         }
 392                         else if(IsHiragana(c)) {
 393                                 /* Kunyomi reading detected */
 394                                 if(kunyomi.length()>0) kunyomi.append(L"&nbsp; ");
 395                                 kunyomi.append(token);  /* Copy the original string, including ()'s and 〜's */
 396                                 continue;
 397                         }
 398                 } else if(tmode==1) {
 399                         if(IsFurigana(c)) {
 400                                 /* Nanori reading detected */
 401                                 if(nanori.length()>0) nanori.append(L"&nbsp; ");
 402                                 nanori.append(token);   /* Copy the original string, including ()'s and 〜's */
 403                                 continue;
 404                         }
 405                 } else if(tmode==2) {
 406                         if(IsFurigana(c)) {
 407                                 /* Special radical reading detected */
 408                                 if(radicalReading.length()>0) radicalReading.append(L"&nbsp; ");
 409                                 radicalReading.append(token);
 410                                 continue;
 411                         }
 412                 }
 413                 if(c == L'{') {
 414                         /* English meaning detected
 415                            Special handling is needed to take care of spaces, though.
 416                            We'll "cheat" and mess with our iterator a bit if a space is detected. */
 417                         while(t.size()>0 && sTemp[sTemp.length()-1] != L'}') {
 418                                 sTemp.append(L" ").append(t.front());
 419                                 t.pop_front();
 420                         }
 421                         if(english.length()>0) english.append(L", ");
 422                         english.append(sTemp.substr(1,sTemp.length()-2));  /* Strip the {} */
 423                 }
 424                 else {
 425                         switch(c) {
 426                         case L'T':  /* Change "t mode" */
 427                                 /*wstring(sTemp.substr(1)).ToLong(&tmode);*/
 428                                 wistringstream(sTemp.substr(1)) >> tmode;
 429 #ifdef DEBUG
 430                                 if(tmode>2) printf("WARNING: T-mode set to %d.\nT-modes above 2 are not currently documented!", (int)tmode);
 431 #endif
 432                                 break;
 433                         case L'B':  /* Bushu radical */
 434                                 lowRelevance.append(L"<li>Bushu radical: ").append(sTemp.substr(1)).append(L"</li>");
 435                                 break;
 436                         case L'C':  /* Classical radical */
 437                                 lowRelevance.append(L"<li>Classical radical: ").append(sTemp.substr(1)).append(L"</li>");
 438                                 break;
 439                         case L'F':  /* Frequency */
 440                                 /*wstring(sTemp.substr(1)).ToLong(&frequency);*/
 441                                 wistringstream(sTemp.substr(1)) >> frequency;
 442                                 break;
 443                         case L'G':  /* Grade level */
 444                                 /*wstring(sTemp.substr(1)).ToLong(&grade);*/
 445                                 wistringstream(sTemp.substr(1)) >> grade;
 446                                 break;
 447                         case L'S':  /* Stroke count */
 448                                 if(strokes.length()==0) {
 449                                         strokes = sTemp.substr(1);
 450                                 } else if(!strokes.find(L' ')!=wstring::npos) {
 451                                         strokes.append(L" (Miscounts: ")
 452                                                 .append(sTemp.substr(1))
 453                                                 .append(L")");
 454                                 } else {
 455                                         strokes = strokes.substr(0, strokes.length()-1)
 456                                                 .append(L", ")
 457                                                 .append(sTemp.substr(1))
 458                                                 .append(L")");
 459                                 }
 460                                 break;
 461                         case L'U':  /* Unicode value */
 462                                 lowRelevance.append(L"<li>Unicode: 0x").append(sTemp.substr(1)).append(L"</li>");
 463                                 break;
 464                         /* From here, it's all dictionary codes */
 465                         case L'H':
 466                                 if((dictionaries & KDD_NJECD)!=0)
 467                                         dictionaryInfo.append(L"<li>New Japanese-English Character Dictionary (Halpern): ")
 468                                                 .append(sTemp.substr(1)).append(L"</li>");
 469                                 break;
 470                         case L'N':
 471                                 if((dictionaries & KDD_MRJECD)!=0)
 472                                         dictionaryInfo.append(L"<li>Modern Reader's Japanese-English Character Dictionary (Nelson): ")
 473                                                 .append(sTemp.substr(1)).append(L"</li>");
 474                                 break;
 475                         case L'V':
 476                                 if((dictionaries & KDD_NNJECD)!=0)
 477                                         dictionaryInfo.append(L"<li>The New Nelson's Japanese-English Character Dictionary: ")
 478                                                 .append(sTemp.substr(1)).append(L"</li>");
 479                                 break;
 480                         case L'P':
 481                                 /* SKIP codes. */
 482                                 /* Thanks to changes in permissible SKIP code usage (change to
 483                                    Creative Commons licensing in January 2008), we can now use
 484                                    this without problems. */
 485                                 if((dictionaries & KDD_SKIP)!=0)
 486                                         dictionaryInfo.append(L"<li>SKIP code: ")
 487                                                 .append(sTemp.substr(1)).append(L"</li>");
 488                                 break;
 489                         case L'I':  /* Spahn/Hadamitzky dictionaries */
 490                                 if(sTemp[1]==L'N') {
 491                                         if((dictionaries & KDD_KK)!=0) {
 492                                                 dictionaryInfo.append(L"<li>Kanji & Kana (Spahn, Hadamitzky): ")
 493                                                         .append(sTemp.substr(2)).append(L"</li>");
 494                                         }
 495                                 } else {
 496                                         if((dictionaries & KDD_KD)!=0) {
 497                                                 dictionaryInfo.append(L"<li>Kanji Dictionary (Spahn, Hadamitzky): ")
 498                                                         .append(sTemp.substr(1)).append(L"</li>");
 499                                         }
 500                                 }
 501                                 break;
 502                         case L'Q':
 503                                 if((dictionaries & KDD_FC)!=0) {
 504                                         dictionaryInfo.append(L"<li>Four Corner code: ")
 505                                                 .append(sTemp.substr(1)).append(L"</li>");
 506                                 }
 507                                 break;
 508                         case L'M':
 509                                 c2 = sTemp[1];
 510                                 if(c2==L'N') {
 511                                         if((dictionaries & KDD_MOROI)!=0) {
 512                                                 dictionaryInfo.append(L"<li>Morohashi Daikanwajiten Index: ")
 513                                                         .append(sTemp.substr(2)).append(L"</li>");
 514                                         }
 515                                 } else if(c2==L'P') {
 516                                         if((dictionaries & KDD_MOROVP)!=0) {
 517                                                 dictionaryInfo.append(L"<li>Morohashi Daikanwajiten Volume/Page: ")
 518                                                         .append(sTemp.substr(2)).append(L"</li>");
 519                                         }
 520                                 }
 521                                 break;
 522                         case L'E':
 523                                 if((dictionaries & KDD_GRJC)!=0) {
 524                                         dictionaryInfo.append(L"<li>A Guide to Remembering Japanese Characters (Henshal): ")
 525                                                 .append(sTemp.substr(1)).append(L"</li>");
 526                                 }
 527                                 break;
 528                         case L'K':
 529                                 if((dictionaries & KDD_GKD)!=0) {
 530                                         dictionaryInfo.append(L"<li>Gakken Kanji Dictionary (\"A New Dictionary of Kanji Usage\"): ")
 531                                                 .append(sTemp.substr(1)).append(L"</li>");
 532                                 }
 533                                 break;
 534                         case L'L':
 535                                 if((dictionaries & KDD_RTK)!=0) {
 536                                         dictionaryInfo.append(L"<li>Remembering the Kanji (Heisig): ")
 537                                                 .append(sTemp.substr(1)).append(L"</li>");
 538                                 }
 539                                 break;
 540                         case L'O':
 541                                 if((dictionaries & KDD_JN)!=0) {
 542                                         dictionaryInfo.append(L"<li>Japanese Names (O'Neill): ")
 543                                                 .append(sTemp.substr(1)).append(L"</li>");
 544                                 }
 545                                 break;
 546                         case L'D':
 547                                 c2 = sTemp[1];
 548                                 switch(c2) {
 549                                 case L'B':
 550                                         if((dictionaries & KDD_JBP)!=0) {
 551                                                 dictionaryInfo.append(L"<li>Japanese for Busy People (AJLT): ")
 552                                                         .append(sTemp.substr(2)).append(L"</li>");
 553                                         }
 554                                         break;
 555                                 case L'C':
 556                                         if((dictionaries & KDD_KWJLP)!=0) {
 557                                                 dictionaryInfo.append(L"<li>The Kanji Way to Japanese Language Power (Crowley): ")
 558                                                         .append(sTemp.substr(2)).append(L"</li>");
 559                                         }
 560                                         break;
 561                                 case L'F':
 562                                         if((dictionaries & KDD_JKF)!=0) {
 563                                                 dictionaryInfo.append(L"<li>Japanese Kanji Flashcards (White Rabbit Press): ")
 564                                                         .append(sTemp.substr(2)).append(L"</li>");
 565                                         }
 566                                         break;
 567                                 case L'G':
 568                                         if((dictionaries & KDD_KCKG)!=0) {
 569                                                 dictionaryInfo.append(L"<li>Kodansha Compact Kanji Guide: ")
 570                                                         .append(sTemp.substr(2)).append(L"</li>");
 571                                         }
 572                                         break;
 573                                 case L'H':
 574                                         if((dictionaries & KDD_GTRWJH)!=0) {
 575                                                 dictionaryInfo.append(L"<li>A Guide To Reading and Writing Japanese (Hensall): ")
 576                                                         .append(sTemp.substr(2)).append(L"</li>");
 577                                         }
 578                                         break;
 579                                 case L'J':
 580                                         if((dictionaries & KDD_KIC)!=0) {
 581                                                 dictionaryInfo.append(L"<li>Kanji in Context (Nishiguchi and Kono): ")
 582                                                         .append(sTemp.substr(2)).append(L"</li>");
 583                                         }
 584                                         break;
 585                                 case L'K':
 586                                         if((dictionaries & KDD_KLD)!=0) {
 587                                                 dictionaryInfo.append(L"<li>Kanji Learner's Dictionary (Halpern): ")
 588                                                         .append(sTemp.substr(2)).append(L"</li>");
 589                                         }
 590                                         break;
 591                                 case L'O':
 592                                         if((dictionaries & KDD_EK)!=0) {
 593                                                 dictionaryInfo.append(L"<li>Essential Kanji (O'Neill): ")
 594                                                         .append(sTemp.substr(2)).append(L"</li>");
 595                                         }
 596                                         break;
 597                                 case L'R':
 598                                         if((dictionaries & KDD_DR)!=0) {
 599                                                 dictionaryInfo.append(L"<li>2001 Kanji (De Roo): ")
 600                                                         .append(sTemp.substr(2)).append(L"</li>");
 601                                         }
 602                                         break;
 603                                 case L'S':
 604                                         if((dictionaries & KDD_GTRWJS)!=0) {
 605                                                 dictionaryInfo.append(L"<li>A Guide to Reading and Writing Japanese (Sakade): ")
 606                                                         .append(sTemp.substr(2)).append(L"</li>");
 607                                         }
 608                                         break;
 609                                 case L'T':
 610                                         if((dictionaries & KDD_TKC)!=0) {
 611                                                 dictionaryInfo.append(L"<li>Tuttle Kanji Cards (Kask): ")
 612                                                         .append(sTemp.substr(2)).append(L"</li>");
 613                                         }
 614                                         break;
 615                                 default:
 616                                         if(unhandled.length()>0) unhandled.append(L" ");
 617                                         unhandled.append(sTemp);
 618                                         break;
 619                                 }
 620                                 break;
 621                         /* Crossreferences and miscodes */
 622                         case L'X':
 623                                 if(crossReferences.length()>0) crossReferences.append(L", ");
 624                                 crossReferences.append(sTemp.substr(1));
 625                                 break;
 626                         case L'Z':
 627                                 if(miscodes.length()>0) miscodes.append(L", ");
 628                                 miscodes.append(sTemp.substr(1));
 629                                 break;
 630                         /* Korean/Pinyin (Chinese) romanization */
 631                         case L'W':
 632                                 if(koreanRomanization.length()>0) koreanRomanization.append(L", ");
 633                                 koreanRomanization.append(sTemp.substr(1));
 634                                 break;
 635                         case L'Y':
 636                                 if(pinyinRomanization.length()>0) pinyinRomanization.append(L", ");
 637                                 pinyinRomanization.append(sTemp.substr(1));
 638                                 break;
 639                         default:
 640                                 if(unhandled.length()>0) unhandled.append(L" ");
 641                                 unhandled.append(sTemp);
 642                                 break;
 643                         }
 644                 }
 645         } /* while(t.HasMoreTokens()) */
 646
 647         if(header.str().length() > 0) result << header.str();
 648 #ifdef DEBUG
 649         printf("DEBUG: header=[%ls]\n", header.str().c_str());
 650 #endif
 651         result << L"<ul>";
 652         if((options & KDO_READINGS) != 0) {
 653                 if(onyomi.length() > 0)
 654                         result << L"<li>Onyomi Readings: " << onyomi << L"</li>";
 655                 if(kunyomi.length() > 0)
 656                         result << L"<li>Kunyomi Readings: " << kunyomi << L"</li>";
 657                 if(nanori.length() > 0)
 658                         result << L"<li>Nanori Readings: " << nanori << L"</li>";
 659                 if(radicalReading.length() > 0)
 660                         result << L"<li>Special Radical Reading: " << radicalReading <<
 661                                 L"</li>";
 662         }
 663         if((options & KDO_MEANINGS) != 0) {
 664                 if(english.length() > 0)
 665                         result << L"<li>English Meanings: " << english << L"</li>";
 666         }
 667         if((options & KDO_HIGHIMPORTANCE) != 0) {
 668                 if(strokes.length() > 0)
 669                         result << L"<li>Stroke count: " << strokes << L"</li>";
 670                 else
 671                         result << L"<li>Stroke count: not specified in KANJIDIC</li>";
 672                 result << L"<li>Grade Level: ";
 673                 if(grade<=6 && grade >= 1) {  /* Jouyou (Grade #) */
 674                         result << L"Jouyou (Grade " << grade << L")";
 675                 } else if(grade==8) {  /* Jouyou (General usage) */
 676                         result << L"Jouyou (General usage)";
 677                 } else if(grade==9) {  /* Jinmeiyou (Characters for names) */
 678                         result << L"Jinmeiyou (Characters for names)";
 679                 } else if(grade==-1) {  /* No flag specified in kanjidic string */
 680                         result << L"Unspecified";
 681                 } else {
 682                         result << L"Unhandled grade level (Grade " << grade << L")";
 683                 }
 684                 result << L"</li>";
 685                 if(frequency!=-1)
 686                         result << L"<li>Frequency Ranking: " << frequency << L"</li>";
 687                 else result << L"<li>Frequency Ranking: Unspecified</li>";
 688         }
 689         if((options & KDO_DICTIONARIES) != 0) {
 690                 if(dictionaryInfo.length()>0)
 691                         result << L"<li>Dictionary Codes:<ul>" << dictionaryInfo
 692                                    << L"</ul></li>";
 693         }
 694         if((options & KDO_VOCABCROSSREF) != 0) {
 695                 vector<wstring> *vList = &(jben->vocabList->GetVocabList());
 696                 wchar_t thisKanji = kanjidicStr[0];
 697                 vector<wstring> crossRefList;
 698                 vector<wstring>::iterator vIt;
 699                 for(vIt=vList->begin(); vIt!=vList->end(); vIt++) {
 700                         if(vIt->find(thisKanji)!=wstring::npos) {
 701                                 crossRefList.push_back(*vIt);
 702                         }
 703                 }
 704                 if(crossRefList.size()>0) {
 705                         result << L"<li>This kanji is used by words in your study list:<br><font size=\"7\">";
 706                         vIt = crossRefList.begin();
 707                         result << *vIt;
 708                         for(++vIt; vIt!=crossRefList.end(); vIt++) {
 709                                 result << L"&nbsp; " << *vIt;
 710                         }
 711                         result << L"</font></li>";
 712                 }
 713         }
 714         if((options & KDO_LOWIMPORTANCE) != 0) {
 715                 if(koreanRomanization.length()>0) lowRelevance.append(L"<li>Korean romanization: ").append(koreanRomanization).append(L"</li>");
 716                 if(pinyinRomanization.length()>0) lowRelevance.append(L"<li>Pinyin romanization: ").append(pinyinRomanization).append(L"</li>");
 717                 if(crossReferences.length()>0) lowRelevance.append(L"<li>Cross reference codes: ").append(crossReferences).append(L"</li>");
 718                 if(miscodes.length()>0) lowRelevance.append(L"<li>Miscodes: ").append(miscodes).append(L"</li>");
 719                 if(lowRelevance.length()>0)
 720                         result << L"<li>Extra Information:<ul>" << lowRelevance
 721                                    << L"</ul></li>";
 722         }
 723         if((options & KDO_UNHANDLED) != 0) {
 724                 if(unhandled.length()>0)
 725                         result << L"<li>Unhandled: " << unhandled << L"</li>";
 726         }
 727         result << L"</ul>";
 728
 729         return result.str();
 730 }
 731
 732 int KDict::GetIntField(wchar_t kanji, const wstring& marker) const {
 733         wstring markerStr, kanjiEntry, temp;
 734         size_t index=0;
 735         long value=-1;
 736         int markerLen;
 737
 738         markerStr.append(L" ").append(marker);
 739         markerLen=markerStr.length();
 740
 741         kanjiEntry = GetKanjidicStr(kanji);
 742         if(kanjiEntry.length()>0) {
 743                 index = kanjiEntry.find(markerStr);
 744                 if(index!=wstring::npos) {
 745                         temp = kanjiEntry.substr(
 746                                 index+markerLen,
 747                                 kanjiEntry.find(L" ", index+1) - index - (markerLen-1));
 748                         /*temp.ToLong(&value);*/
 749                         wistringstream(temp) >> value;
 750                 }
 751         }
 752
 753         return (int)value;
 754 }
 755
 756 const BoostHM<wchar_t,string>* KDict::GetHashTable() const {
 757         return &kanjidicData;
 758 }
 759
 760 enum {
 761         KDR_Onyomi=1,
 762         KDR_Kunyomi,
 763         KDR_English
 764 };
 765
 766 wstring KDict::GetOnyomiStr(wchar_t c) const {
 767         return GetKanjidicReading(c, KDR_Onyomi);
 768 }
 769
 770 wstring KDict::GetKunyomiStr(wchar_t c) const {
 771         return GetKanjidicReading(c, KDR_Kunyomi);
 772 }
 773
 774 wstring KDict::GetEnglishStr(wchar_t c) const {
 775         return GetKanjidicReading(c, KDR_English);
 776 }
 777
 778 wstring KDict::GetKanjidicReading(wchar_t c, int readingType) const {
 779         wostringstream result;
 780         wstring kanjidicStr = GetKanjidicStr(c);
 781
 782         long tmode = 0;
 783         wstring sTemp, token;
 784         list<wstring> t = StrTokenize<wchar_t>(kanjidicStr, L" ");
 785
 786         /* The first two tokens are guaranteed not to be what we're looking for.  Skip them. */
 787         if(t.size()>1) {
 788                 t.pop_front();
 789                 t.pop_front();
 790         }
 791         while(t.size()>0) {
 792                 token = t.front();
 793                 t.pop_front();
 794                 sTemp = token;
 795                 c = sTemp[0];
 796                 /* If a preceding character is detected, strip it */
 797                 if(c == L'(' || c == L'〜') {
 798                         sTemp = sTemp.substr(1);
 799                         c = sTemp[0];
 800                 }
 801                 if(tmode==0) {
 802                         if(IsKatakana(c) && readingType==KDR_Onyomi) {
 803                                 /* Onyomi reading detected */
 804                                 if(result.str().length()>0) result << L"  ";
 805                                 result << token;   /* Copy the original string,
 806                                                                           including ()'s and 〜's */
 807                                 continue;
 808                         }
 809                         else if(IsHiragana(c) && readingType==KDR_Kunyomi) {
 810                                 /* Kunyomi reading detected */
 811                                 if(result.str().length()>0) result << L"  ";
 812                                 result << token;   /* Copy the original string,
 813                                                                           including ()'s and 〜's */
 814                                 continue;
 815                         }
 816                 }
 817                 if(c == L'{' && readingType==KDR_English) {
 818                         /* English meaning detected
 819                            Special handling is needed to take care of spaces, though.
 820                            We'll "cheat" and mess with our iterator a bit if a space is detected. */
 821                         while(t.size()>0 && sTemp[sTemp.length()-1] != L'}') {
 822                                 sTemp.append(L" ").append(t.front());
 823                                 t.pop_front();
 824                         }
 825                         if(result.str().length()>0) result << L", ";
 826                         result << sTemp.substr(1,sTemp.length()-2);  /* Strip the {} */
 827                 }
 828                 else if(c==L'T') {
 829                         /*wstring(sTemp.substr(1)).ToLong(&tmode);*/
 830                         wistringstream(sTemp.substr(1)) >> tmode;
 831                 }
 832         }
 833
 834         return result.str();
 835 }
 836
 837 bool KDict::MainDataLoaded() const {
 838         if(kanjidicData.size()>0) return true;
 839         return false;
 840 }