src/wdict.cpp

   1 /*
   2 Project: J-Ben
   3 Author:  Paul Goins
   4 Website: http://www.vultaire.net/software/jben/
   5 License: GNU General Public License (GPL) version 2
   6          (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
   7
   8 File: wdict.cpp
   9
  10 This program is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2 of the License, or
  13 (at your option) any later version.
  14
  15 This program is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with this program.  If not, see <http://www.gnu.org/licenses/>
  22 */
  23
  24 #include "wdict.h"
  25 #include "file_utils.h"
  26 #include "jutils.h"
  27 #include "string_utils.h"
  28 #include "encoding_convert.h"
  29 #include "errorlog.h"
  30 #include "preferences.h"
  31 #include <set>
  32 #include <list>
  33 #include <algorithm>
  34 #include <cstring>
  35 #include <fstream>
  36 #include <string>
  37 #include <sstream>
  38 using namespace std;
  39
  40 #ifdef __WXMSW__
  41 #       define FALLBACK_DICTDIR "dicts\\"
  42 #else
  43 #       define FALLBACK_DICTDIR "dicts/"
  44 #endif
  45
  46 /* SEARCH_MAX is our hard-coded cutoff point for searches.  It should be high
  47    enough not to interfere with normal "single page" operation, but it should
  48    also prevent the user from doing something too stupid and having to wait a
  49    minute or so because they searched for the letter "e" by mistake.
  50
  51    The most commonly used kanji in EDICT2 appears to be 人, at 1889 characters.
  52    Thus, let's make our panic breakoff point at 2000 characters. */
  53 #define SEARCH_MAX 2000
  54
  55 WDict* WDict::wdictSingleton = NULL;
  56
  57 const WDict *WDict::Get() {
  58         if(!wdictSingleton)
  59                 wdictSingleton = new WDict;
  60         return wdictSingleton;
  61 }
  62
  63 WDict::WDict() {
  64         Preferences *p = Preferences::Get();
  65         if(LoadEdict2(p->GetSetting("wdict_edict2").c_str())!=ED_SUCCESS)
  66                 LoadEdict2(FALLBACK_DICTDIR "edict2");
  67 }
  68
  69 void WDict::Destroy() {
  70         if(wdictSingleton) {
  71                 delete wdictSingleton;
  72                 wdictSingleton = NULL;
  73         }
  74 }
  75
  76 int WDict::LoadEdict2(const char *filename) {
  77         char *rawData = NULL;
  78         unsigned int size;
  79         int returnCode = 0xDEADBEEF;
  80
  81         ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
  82         if(ifile) {
  83                 size = ifile.tellg();
  84                 ifile.seekg(0);
  85                 rawData = new char[size+1];
  86                 rawData[size] = '\0';
  87                 ifile.read(rawData, size);
  88 #ifdef DEBUG
  89                 if(strlen(rawData)!=size) {
  90                         ostringstream os;
  91                         os << "edict file size: " << strlen(rawData) << ", read-in string: " << size;
  92                         el.Push(EL_Warning, os.str());
  93                 }
  94 #endif
  95
  96                 /* Create the kanjidic object with our string data. */
  97                 this->Edict2Parser(rawData);
  98
  99                 returnCode = ED_SUCCESS;
 100         }
 101         else
 102                 returnCode = ED_FAILURE;
 103
 104         if(rawData) delete[] rawData;
 105         return returnCode;
 106 }
 107
 108 /* EDICT2 parser for WDict.  Takes a wstring containing the contents of
 109    an EDICT- or EDICT2-formatted dictionary, and adds its contents to an
 110    internal data struct.  This function also indexes the data, although ideally
 111    the indexing functionality should be externalized so it may be called later,
 112    like if another dictionary is added into the same WDict object at a later
 113    point. */
 114 void WDict::Edict2Parser(char *edictRawData) {
 115         char *token;
 116         wstring wToken;
 117
 118         int vIndex = -1; /* edict vector index */
 119         wstring sTemp;
 120
 121         /* Store raw EDICT data, plus store references by kanji/reading into ordered
 122            set */
 123         token = strtok(edictRawData, "\n");
 124         while(token) {
 125                 if(strlen(token)>0) {
 126                         /* 0. Make wstring copy of the token */
 127                         wToken = utfconv_mw(token);
 128                         /* 1. Store full string in vector */
 129                         edictData.push_back(token);
 130                         vIndex++;
 131
 132                 }
 133                 token = strtok(NULL, "\n");
 134         } /* while has more tokens */
 135 }
 136
 137 WDict::~WDict() {
 138         /* Currently, nothing needs to be done here. */
 139 }
 140
 141 bool WDict::Search(const wstring& query, list<int>& results,
 142                                    unsigned int searchType) const {
 143         list<int> priorityResults[4];
 144         bool englishSearch;
 145         bool isFurigana;  /* Not sure this is necessary - currently set but not
 146                                                  used.  May be a performance accelerator at cost of
 147                                                  code complexity. */
 148         int priorityExact, priorityBeginsWith, priorityEndsWith, priorityOther;
 149         vector<string>::const_iterator vIt;
 150
 151         if(query.length()==0) {
 152 #ifdef DEBUG
 153                 printf("[%s:%d] Empty string passed into WDict::Search.  (Not a problem!)\n", __FILE__, __LINE__);
 154 #endif
 155                 return false;
 156         }
 157
 158         /* Get our search priorities set up */
 159         int i;
 160         unsigned int uTemp;
 161
 162         /* Default priority is -1, "not used" */
 163         /* Lowest priority is 0, and highest will be 3. */
 164         /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
 165         priorityExact = priorityBeginsWith = priorityEndsWith = priorityOther = -1;
 166         for(i=0;i<4;i++) {
 167                 uTemp = (searchType >> ((3-i)*8)) & 0xFF;
 168                 if(uTemp == EDS_EXACT) priorityExact = i;
 169                 else if(uTemp == EDS_BEGIN) priorityBeginsWith = i;
 170                 else if(uTemp == EDS_END) priorityEndsWith = i;
 171                 else if(uTemp == EDS_ANY) priorityOther = i;
 172                 else if(uTemp == 0) { /* Do nothing; no preferred search method for
 173                                                                  this level */ }
 174                 else {
 175 #ifdef DEBUG
 176                         fprintf(stderr, "Unknown search type for priority level %d: %X\n", i+1, uTemp);
 177 #endif
 178                 }
 179         }
 180
 181         /* Store first char.  This determines whether we're doing an E-J or J-E
 182            search. */
 183         wchar_t firstChar = query[0];
 184         /* Using a very, very simple check: is it just a 7-bit char? */
 185         englishSearch = ( ((unsigned)firstChar) <= 0x7F );
 186         if(!englishSearch) {
 187                 isFurigana=true;
 188                 for(wstring::const_iterator stringIt = query.begin();
 189                   stringIt!=query.end(); stringIt++) {
 190                         isFurigana = (IsFurigana(*stringIt));
 191                         if(!isFurigana) break;
 192                 }
 193         }
 194
 195         /* Main search code begins below */
 196         /* NOTE: I think this can be cleaned up.  I don't think the vector for
 197            entryData below is needed; a simple string should suffice and a loop can
 198            be removed.  I'll look at this later since I'm busy with something else
 199            at the moment. */
 200
 201         vector<string> entryData; /* Stores the English/Japanese components of
 202                                                                  an EDICT string. */
 203         string utfQuery, lwrQuery, lwrData;
 204         vector<string>::iterator vSubIt;
 205         size_t indexSubstr, indexDataStart, indexDataEnd;
 206         int priorityLevel;
 207         char c;
 208
 209         utfQuery = utfconv_wm(query);
 210         lwrQuery = ToLower(utfQuery); /* For English searching, store a
 211                                                                                 lowercase query */
 212         i = 0;
 213
 214         for(vIt=edictData.begin(); vIt!=edictData.end(); vIt++) {
 215                 priorityLevel = -1; /* -1 == not a match*/
 216                 if(englishSearch) {
 217                         GetEnglish(*vIt, entryData);
 218                 } else {
 219                         GetJapanese(*vIt, entryData);
 220                 }
 221
 222                 for(vSubIt=entryData.begin(); vSubIt!=entryData.end(); vSubIt++) {
 223                         if(englishSearch) {
 224                                 /* English searching requires 2 special conditions:
 225                                    1. Case-insensitive searching (maybe optional, later)
 226                                    2. Recognition of word bounds (so we don't match character
 227                                       sequences inside of a word.) */
 228
 229                                 /* Convert target string to lower case */
 230                                 lwrData = ToLower(*vSubIt);
 231
 232                                 /* Find the first match that is bounded by non-alpha characters
 233                                    or beginning/end of string. */
 234                                 indexSubstr = lwrData.find(lwrQuery, 0);
 235                                 while(indexSubstr!=string::npos) {
 236 #ifdef DEBUG
 237 #if 0
 238 /*                                      printf("Checking possible match:\n"
 239                                                    "Query:       [%s]\n"
 240                                                    "Data string: [%s]\n"
 241                                                    "Index of match: %d\n",
 242                                                    lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
 243                                         ostringstream oss;
 244                                         oss << "Checking possible match:\n"
 245                                                 << "Query:       " << lwrQuery << "\n"
 246                                                 << "Data string: " << lwrData << "\n"
 247                                                 << "Index of match: " << indexSubstr;
 248                                         el.Push(EL_Info, oss.str());
 249 #endif
 250 #endif
 251                                         if(
 252                                                 /* Check for beginning of data string or preceding
 253                                                    non-alpha char */
 254                                                 (indexSubstr==0 || !isalpha(lwrData[indexSubstr-1])) &&
 255                                                 /* Check for end of data string or following non-alpha
 256                                                    char */
 257                                                 (indexSubstr+lwrQuery.length() == lwrData.length() ||
 258                                                  !isalpha(lwrData[indexSubstr+lwrQuery.length()]))
 259                                                 ) break;
 260                                         /* If the match didn't meet all the above criteria, try to
 261                                            find the next one. */
 262 #ifdef DEBUG
 263                                         printf("Match not good.  Displaying verbose data:\n");
 264                                         if(indexSubstr==0)
 265                                                 printf("* Beginning of query matches beginning of data. (OK)\n");
 266                                         else if(!isalpha(lwrData[indexSubstr-1]))
 267                                                 printf("* Preceding character '%c' is non-alpha. (OK)\n",
 268                                                            lwrData[indexSubstr-1]);
 269                                         else
 270                                                 printf("* Start match is invalid. (FAIL)\n");
 271                                         if(indexSubstr+lwrQuery.length() == lwrData.length())
 272                                                 printf("* End of query matches end of data. (OK)\n");
 273                                         else if(!isalpha(lwrData[indexSubstr+lwrQuery.length()]))
 274                                                 printf("* Following character '%c' is non-alpha. (OK)\n",
 275                                                            lwrData[indexSubstr+lwrQuery.length()]);
 276                                         else
 277                                                 printf("* End match is invalid. (FAIL)\n");
 278
 279 #endif
 280                                         indexSubstr = lwrData.find(lwrQuery, indexSubstr+1);
 281                                 }
 282                         } else {
 283                                 indexSubstr = vSubIt->find(utfQuery, 0);
 284 #ifdef DEBUG
 285 #if 0
 286 /*                                      printf("Checking possible match:\n"
 287                                                    "Query:       [%s]\n"
 288                                                    "Data string: [%s]\n"
 289                                                    "Index of match: %d\n",
 290                                                    lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
 291                                         ostringstream oss;
 292                                         oss << "Checking possible match:\n"
 293                                                 << "Query:       " << utfQuery << "\n"
 294                                                 << "Data string: " << *vSubIt << "\n"
 295                                                 << "Index of match: " << indexSubstr;
 296                                         el.Push(EL_Info, oss.str());
 297 #endif
 298 #endif
 299                         }
 300                         if(indexSubstr!=string::npos) {
 301                                 /* A match was found.
 302                                    Sort by type of match (exact, begin, end, other)
 303                                    LOGIC:
 304                                    - Search for a "begins with".
 305                                    - If it matches, check for "exact" (string length will work).
 306                                    - Check for an "ends with" (parens may be a prob??)
 307                                    - Dump all others into "other" */
 308
 309                                 /* FIRST: We need to get our dictionary bounds.  Check for an
 310                                    opening parenthesis, and if present, skip past it. */
 311                                 indexDataStart = 0;
 312                                 c = (*vSubIt)[indexDataStart];
 313                                 if(c == '(' || c == '{') {
 314                                         /* Parens found.  Loop until we reach the beginning of the
 315                                            real data. */
 316                                         while(1) {
 317                                                 /* Get first non-space char past the end parenthesis. */
 318                                                 if(c=='(') {
 319                                                         if(GetIndexAfterParens(*vSubIt, indexDataStart,
 320                                                                                                    indexDataStart, '(', ')')) {
 321                                                                 while(isspace((*vSubIt)[indexDataStart]))
 322                                                                         indexDataStart++;
 323                                                         }
 324                                                 } else if(c=='{') {
 325                                                         if(GetIndexAfterParens(*vSubIt, indexDataStart,
 326                                                                                                    indexDataStart, '{', '}')) {
 327                                                                 while(isspace((*vSubIt)[indexDataStart]))
 328                                                                         indexDataStart++;
 329                                                         }
 330                                                 } else break;
 331                                                 c = (*vSubIt)[indexDataStart];
 332                                         }
 333                                 }
 334
 335                                 /* Get the ending bound.
 336                                    NOTE: Currently this is only done for Japanese entries.
 337                                    English entries ending with ()'s will not omit them.  This is
 338                                    deliberate. */
 339                                 indexDataEnd = vSubIt->length()-1;
 340                                 if(!englishSearch) {
 341                                         c = (*vSubIt)[indexDataEnd];
 342                                         if(c == ')' || c == '}') {
 343                                                 /* Parens found.  Loop until we reach the beginning of
 344                                                    the real data. */
 345                                                 while(1) {
 346                                                         /* Get first non-space char past the end
 347                                                            parenthesis. */
 348                                                         if(c==')') {
 349                                                                 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
 350                                                                                                                 indexDataEnd, '(', ')')) {
 351                                                                         while(isspace((*vSubIt)[indexDataEnd]))
 352                                                                                 indexDataEnd--;
 353                                                                 }
 354                                                         } else if(c=='}') {
 355                                                                 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
 356                                                                                                                 indexDataEnd, '{', '}')) {
 357                                                                         while(isspace((*vSubIt)[indexDataEnd]))
 358                                                                                 indexDataEnd--;
 359                                                                 }
 360                                                         } else break;
 361                                                         c = (*vSubIt)[indexDataEnd];
 362                                                 }
 363                                         }
 364                                 }
 365
 366                                 /* Now, we apply the logic we specified at the beginning of this
 367                                    block. */
 368                                 if(indexSubstr==indexDataStart) {
 369                                         priorityLevel = max(priorityLevel, priorityBeginsWith);
 370                                         if(utfQuery.length()==indexDataEnd+1 - indexDataStart) {
 371                                                 priorityLevel = max(priorityLevel, priorityExact);
 372                                         } else {
 373                                         }
 374                                 } else if(indexSubstr == indexDataEnd+1 - utfQuery.length()) {
 375                                         priorityLevel = max(priorityLevel, priorityEndsWith);
 376                                 } else {
 377                                         priorityLevel = max(priorityLevel, priorityOther);
 378                                 }
 379                         }
 380                 }
 381                 /* Add to appropriate list */
 382                 if(priorityLevel>=0) {
 383                         if(priorityResults[0].size()
 384                           +priorityResults[1].size()
 385                           +priorityResults[2].size()
 386                           +priorityResults[3].size()< SEARCH_MAX) {
 387                                 priorityResults[priorityLevel].push_back(i);
 388                         } else {
 389                                 ostringstream os;
 390                                 os << "Over " << SEARCH_MAX
 391                                    << " results were found.  The search has been stopped.";
 392                                 el.Push(EL_Info, os.str());
 393                                 break;
 394                         }
 395                 }
 396
 397                 entryData.clear();
 398                 i++;
 399         }
 400
 401         /* Combine results into one list, based upon priority. */
 402         list<int>::iterator lIt;
 403         for(i=3;i>=0;i--) {
 404                 for(lIt=priorityResults[i].begin();
 405                   lIt!=priorityResults[i].end(); lIt++) {
 406                         results.push_back(*lIt);
 407                 }
 408         }
 409
 410 #ifdef DEBUG
 411         printf("Search result count: %d\n", results.size());
 412 #endif
 413         if(results.size()>0) return true;
 414         return false;
 415 }
 416
 417 wstring WDict::ResultToHTML(const wstring& rawResult) {
 418         wstring token, subToken, jStr, eStr, htmlStr;
 419         list<wstring> tk = StrTokenize(rawResult, L"\n");
 420         size_t indexSlash, indexNextSlash, indexBreak;
 421         while(tk.size()>0) {
 422                 token = tk.front();
 423                 tk.pop_front();
 424                 htmlStr.append(L"<p>");
 425
 426                 indexSlash = token.find_first_of(L'/');
 427                 if(indexSlash==wstring::npos) {
 428                         /* Fail-safe: just display the raw string */
 429                         htmlStr.append(token);
 430                 } else {
 431                         htmlStr.append(L"<b>Japanese:</b> <font size=\"6\">");
 432                         /*htmlStr.append(token.substr(0,indexSlash));*/
 433                         jStr = token.substr(0,indexSlash);
 434
 435                         indexBreak = jStr.find_first_of(L';');
 436                         while(indexBreak!=wstring::npos) {
 437                                 /*jStr[indexBreak]=L", ";*/
 438                                 jStr.replace(indexBreak,1,L", ",0,2);
 439                                 indexBreak = jStr.find_first_of(L';');
 440                         }
 441
 442                         htmlStr.append(jStr);
 443                         htmlStr.append(L"</font><br>");
 444
 445                         htmlStr.append(L"<b>English:</b> ");
 446                         eStr.clear();
 447                         while(indexSlash!=wstring::npos) {
 448                                 indexNextSlash = token.find_first_of(L'/', indexSlash+1);
 449                                 if(indexNextSlash==wstring::npos)
 450                                         subToken = token.substr(indexSlash+1);
 451                                 else
 452                                         subToken = token.substr(indexSlash+1,
 453                                                                                         indexNextSlash-1 - indexSlash);
 454                                 if(subToken.length()>0) {
 455                                         if(eStr.length()>0)
 456                                                 eStr.append(L"; ");
 457                                         eStr.append(subToken);
 458                                 }
 459                                 indexSlash = indexNextSlash;
 460                         }
 461                         htmlStr.append(eStr);
 462                 }
 463                 htmlStr.append(L"</p>");
 464         }
 465
 466         return htmlStr;
 467 }
 468
 469 void WDict::GetEnglish(const string& edictStr, vector<string>& dest) {
 470         char *tokenizedString = new char[edictStr.length()+1];
 471         char *token;
 472
 473         strcpy(tokenizedString, edictStr.c_str());
 474         token = strtok(tokenizedString, "/");
 475         /* Skip to the second token, since the first is just the Japanese readings */
 476         if(token) {
 477                 token = strtok(NULL, "/");
 478         }
 479
 480         while(token) {
 481                 if(strlen(token)>0) dest.push_back(token);
 482                 token = strtok(NULL, "/");
 483         }
 484
 485         delete[] tokenizedString;
 486 }
 487
 488 void WDict::GetJapanese(const string& edictStr, vector<string>& dest) {
 489         /* Grab the portion of the string relevant for Japanese readings */
 490         size_t indexFinal = edictStr.find_first_of('/');
 491         if(indexFinal==string::npos) indexFinal = edictStr.length();
 492         string jStr = edictStr.substr(0, indexFinal);
 493         string temp;
 494
 495         /* The data is too complex for a simple tokenization because strings within
 496            parentheses may contain characters normally used for breaking up the
 497            tokens.  So, the logic here is a little more complex. */
 498         size_t index, indexBreak, indexParen, indexStart=0;
 499         size_t len=jStr.length();
 500
 501         index = indexStart;
 502         while(indexStart<len) {
 503                 while(true) {
 504                         indexBreak = jStr.find_first_of(";[] ", index);
 505                         indexParen = jStr.find_first_of('(', index);
 506
 507                         /* Valid String Breaks */
 508                         /* If no parentheses are found, then indexBreak indicates our
 509                            bounds properly. */
 510                         if(indexParen==string::npos) break;
 511                         /* If parentheses ARE found, then we want to process them...
 512                            UNLESS a break char is found before the parenthesis. */
 513                         if(indexBreak!=string::npos && indexBreak<indexParen) break;
 514
 515                         /* Skip the parentheses and set index equal to the index following
 516                            the ')' character. */
 517                         if(!GetIndexAfterParens(jStr, indexParen, index)) {
 518                                 indexBreak = string::npos;
 519                                 break;
 520                         }
 521                 }
 522
 523                 if(indexBreak==string::npos) {
 524                         temp = jStr.substr(indexStart);
 525                 } else {
 526                         temp = jStr.substr(indexStart, indexBreak-indexStart);
 527                 }
 528                 if(temp.length()>0) dest.push_back(temp);
 529
 530                 /* Return if either indexBreak or index == string::npos.
 531                    index==string::npos:
 532                      This happens either if indexStart == string::npos, or if
 533                          parsing parentheses and we can't find a closing parenthesis.
 534                    indexBreak==string::npos:
 535                      This happens if the substring continues to the end of the source
 536                          string. */
 537                 if(index==string::npos
 538                 || indexBreak==string::npos) {
 539                         return;
 540                 }
 541
 542                 /* Iterate relevant vars for next iteration */
 543                 indexStart = indexBreak+1;
 544                 index = indexStart;
 545         }
 546 }
 547
 548 string WDict::GetEdictString(int i) const { return edictData[i]; }
 549
 550 bool WDict::MainDataLoaded() const {
 551         if(edictData.size()>0) return true;
 552         return false;
 553 }