edict.cpp

   1 /*
   2 Project: J-Ben
   3 Author:  Paul Goins
   4 Website: http://www.vultaire.net/software/jben/
   5 License: GNU General Public License (GPL) version 2
   6          (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
   7
   8 File: edict.cpp
   9
  10 This program is free software; you can redistribute it and/or modify
  11 it under the terms of the GNU General Public License as published by
  12 the Free Software Foundation; either version 2 of the License, or
  13 (at your option) any later version.
  14
  15 This program is distributed in the hope that it will be useful,
  16 but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 GNU General Public License for more details.
  19
  20 You should have received a copy of the GNU General Public License
  21 along with this program.  If not, see <http://www.gnu.org/licenses/>
  22 */
  23
  24 #include "edict.h"
  25 #include "file_utils.h"
  26 #include "wx/tokenzr.h"
  27 #include "jutils.h"
  28 #include "string_utils.h"
  29 #include <set>
  30 #include <list>
  31 #include <algorithm>
  32 #include <cstring>
  33 #include <fstream>
  34 #include <string>
  35 using namespace std;
  36
   37 /* SEARCH_MAX is our hard-coded cutoff point for searches.  It should be high
   38    enough not to interfere with normal "single page" operation, but it should
   39    also keep the user from having to wait a minute or so because they
   40    searched for something like the letter "e" by mistake.
   41
   42    The most commonly used kanji in EDICT2 appears to be 人, at 1889 occurrences.
   43    Thus, let's make our panic breakoff point 2000 search results. */
   44 #define SEARCH_MAX 2000
  45
  46 Edict *Edict::LoadEdict(const char *filename, int& returnCode) {
  47         Edict *e=NULL;
  48         char *rawData = NULL;
  49         unsigned int size;
  50
  51         ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
  52         if(ifile) {
  53                 size = ifile.tellg();
  54                 ifile.seekg(0);
  55                 rawData = new char[size+1];
  56                 rawData[size] = '\0';
  57                 ifile.read(rawData, size);
  58 #ifdef DEBUG
  59                 if(strlen(rawData)!=size)
  60                         fprintf(stderr,
  61                           "WARNING: edict file size: %d, read-in string: %d\n",
  62                           strlen(rawData),
  63                           size);
  64 #endif
  65
  66                 /* Create the kanjidic object with our string data. */
  67                 e = new Edict(rawData);
  68
  69                 returnCode = ED_SUCCESS;
  70         }
  71         else
  72                 returnCode = ED_FAILURE;
  73
  74         if(rawData) delete[] rawData;
  75         return e;
  76 }
  77
  78 /* Default constructor for Edict.  Takes a wxString containing the contents of
  79    an EDICT- or EDICT2-formatted dictionary, and adds its contents to an
  80    internal data struct.  This function also indexes the data, although ideally
  81    the indexing functionality should be externalized so it may be called later,
  82    like if another dictionary is added into the same Edict object at a later
  83    point. */
  84 Edict::Edict(char *edictRawData) {
  85         char *token;
  86         wxString wxToken;
  87
  88         int vIndex = -1; /* edict vector index */
  89         wxString sTemp;
  90
  91         /* Store raw EDICT data, plus store references by kanji/reading into ordered
  92            set */
  93         token = strtok(edictRawData, "\n");
  94         while(token) {
  95                 if(strlen(token)>0) {
  96                         /* 0. Make wxString copy of the token */
  97                         UTF8ToWx(token, wxToken);
  98                         /* 1. Store full string in vector */
  99                         edictData.push_back(token);
 100                         vIndex++;
 101
 102                 }
 103                 token = strtok(NULL, "\n");
 104         } /* while has more tokens */
 105 }
 106
 107 Edict::~Edict() {
 108         /* Currently, nothing needs to be done here. */
 109 }
 110
 111 /* This function walks through the string, watching the parentheses, and copying
 112    only the portions which are outside parentheses.  Nested parentheses are
 113    handled. */
 114 wxString Edict::StripParenFields(const wxString& src) {
 115         wxString result;
 116         int parenCount = 0;
 117         size_t length, index, lastIndex, startValid;
 118
 119         startValid = 0;
 120         length = src.length();
 121         index = src.find_first_of(_T("()"));
 122         while(index != wxString::npos) {
 123                 if(src[index]==_T('(')) {
 124                         parenCount++;
 125                         if(parenCount==1) {
 126                                 /* Append the valid string up until parens were entered. */
 127                                 result.append(src.substr(startValid, index - startValid));
 128                         }
 129                 } else {
 130                         parenCount--;
 131                         if(parenCount==0) {
 132                                 /* Parens have been exited.  Reset our valid index. */
 133                                 startValid = index+1;
 134                         }
 135                         if(parenCount<0) parenCount=0; /* We'll skip extra )'s */
 136                 }
 137                 lastIndex = index;
 138                 index = src.find_first_of(_T("()"), lastIndex+1);
 139         }
 140         if(parenCount>0) {
 141 #ifdef DEBUG
 142                 fprintf(stderr, "WARNING: %s:%d, StripParenFields: Unclosed '(' detected.\n\tString: %ls\n",
 143                         __FILE__, __LINE__, src.c_str());
 144 #endif
 145         } else {
 146                 /* Append any remainder of the original string */
 147                 if(startValid!=wxString::npos && startValid < length) {
 148                         result.append(src.substr(startValid));
 149                 }
 150         }
 151
 152         return result;
 153 }
 154
 155 bool Edict::Search(const wxString& query, list<int>& results,
 156                                    unsigned int searchType) const {
 157         list<int> priorityResults[4];
 158         bool englishSearch;
 159         bool isFurigana;  /* Not sure this is necessary - currently set but not
 160                                                  used.  May be a performance accelerator at cost of
 161                                                  code complexity. */
 162         int priorityExact, priorityBeginsWith, priorityEndsWith, priorityOther;
 163         vector<string>::const_iterator vIt;
 164
 165         if(query.length()==0) {
 166 #ifdef DEBUG
 167                 printf("[%s:%d] Empty string passed into Edict::Search.  (Not a problem!)\n", __FILE__, __LINE__);
 168 #endif
 169                 return false;
 170         }
 171
 172         /* Get our search priorities set up */
 173         int i;
 174         unsigned int uTemp;
 175
 176         /* Default priority is -1, "not used" */
 177         /* Lowest priority is 0, and highest will be 3. */
 178         /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
 179         priorityExact = priorityBeginsWith = priorityEndsWith = priorityOther = -1;
 180         for(i=0;i<4;i++) {
 181                 uTemp = (searchType >> ((3-i)*8)) & 0xFF;
 182                 if(uTemp == EDS_EXACT) priorityExact = i;
 183                 else if(uTemp == EDS_BEGIN) priorityBeginsWith = i;
 184                 else if(uTemp == EDS_END) priorityEndsWith = i;
 185                 else if(uTemp == EDS_ANY) priorityOther = i;
 186                 else if(uTemp == 0) { /* Do nothing; no preferred search method for
 187                                                                  this level */ }
 188                 else {
 189 #ifdef DEBUG
 190                         fprintf(stderr, "Unknown search type for priority level %d: %X\n", i+1, uTemp);
 191 #endif
 192                 }
 193         }
 194
 195         /* Store first char.  This determines whether we're doing an E-J or J-E
 196            search. */
 197         wxChar firstChar = query[0];
 198         /* Using a very, very simple check: is it just a 7-bit char? */
 199         englishSearch = ( ((unsigned)firstChar) <= 0x7F );
 200         if(!englishSearch) {
 201                 isFurigana=true;
 202                 for(wxString::const_iterator stringIt = query.begin();
 203                   stringIt!=query.end(); stringIt++) {
 204                         isFurigana = (IsFurigana(*stringIt));
 205                         if(!isFurigana) break;
 206                 }
 207         }
 208
 209         /* Main search code begins below */
 210         /* NOTE: I think this can be cleaned up.  I don't think the vector for
 211            entryData below is needed; a simple string should suffice and a loop can
 212            be removed.  I'll look at this later since I'm busy with something else
 213            at the moment. */
 214
 215         vector<string> entryData; /* Stores the English/Japanese components of
 216                                                                  an EDICT string. */
 217         string utfQuery, lwrQuery, lwrData;
 218         vector<string>::iterator vSubIt;
 219         size_t indexSubstr, indexDataStart, indexDataEnd;
 220         int priorityLevel;
 221         char c;
 222
 223         WxToUTF8(query, utfQuery);
 224         lwrQuery = StrToLower(utfQuery); /* For English searching, store a
 225                                                                                 lowercase query */
 226         i = 0;
 227
 228         for(vIt=edictData.begin(); vIt!=edictData.end(); vIt++) {
 229                 priorityLevel = -1; /* -1 == not a match*/
 230                 if(englishSearch) {
 231                         GetEnglish(*vIt, entryData);
 232                 } else {
 233                         GetJapanese(*vIt, entryData);
 234                 }
 235
 236                 for(vSubIt=entryData.begin(); vSubIt!=entryData.end(); vSubIt++) {
 237                         if(englishSearch) {
 238                                 /* English searching requires 2 special conditions:
 239                                    1. Case-insensitive searching (maybe optional, later)
 240                                    2. Recognition of word bounds (so we don't match character
 241                                       sequences inside of a word.) */
 242
 243                                 /* Convert target string to lower case */
 244                                 lwrData = StrToLower(*vSubIt);
 245
 246                                 /* Find the first match that is bounded by non-alpha characters
 247                                    or beginning/end of string. */
 248                                 indexSubstr = lwrData.find(lwrQuery, 0);
 249                                 while(indexSubstr!=string::npos) {
 250 #ifdef DEBUG
 251                                         printf("Checking possible match:\n"
 252                                                    "Query:       [%s]\n"
 253                                                    "Data string: [%s]\n"
 254                                                    "Index of match: %d\n",
 255                                                    lwrQuery.c_str(), lwrData.c_str(), indexSubstr);
 256 #endif
 257                                         if(
 258                                                 /* Check for beginning of data string or preceding
 259                                                    non-alpha char */
 260                                                 (indexSubstr==0 || !isalpha(lwrData[indexSubstr-1])) &&
 261                                                 /* Check for end of data string or following non-alpha
 262                                                    char */
 263                                                 (indexSubstr+lwrQuery.length() == lwrData.length() ||
 264                                                  !isalpha(lwrData[indexSubstr+lwrQuery.length()]))
 265                                                 ) break;
 266                                         /* If the match didn't meet all the above criteria, try to
 267                                            find the next one. */
 268 #ifdef DEBUG
 269                                         printf("Match not good.  Displaying verbose data:\n");
 270                                         if(indexSubstr==0)
 271                                                 printf("* Beginning of query matches beginning of data. (OK)\n");
 272                                         else if(!isalpha(lwrData[indexSubstr-1]))
 273                                                 printf("* Preceding character '%c' is non-alpha. (OK)\n",
 274                                                            lwrData[indexSubstr-1]);
 275                                         else
 276                                                 printf("* Start match is invalid. (FAIL)\n");
 277                                         if(indexSubstr+lwrQuery.length() == lwrData.length())
 278                                                 printf("* End of query matches end of data. (OK)\n");
 279                                         else if(!isalpha(lwrData[indexSubstr+lwrQuery.length()]))
 280                                                 printf("* Following character '%c' is non-alpha. (OK)\n",
 281                                                            lwrData[indexSubstr+lwrQuery.length()]);
 282                                         else
 283                                                 printf("* End match is invalid. (FAIL)\n");
 284
 285 #endif
 286                                         indexSubstr = lwrData.find(lwrQuery, indexSubstr+1);
 287                                 }
 288                         } else {
 289                                 indexSubstr = vSubIt->find(utfQuery, 0);
 290                         }
 291                         if(indexSubstr!=string::npos) {
 292                                 /* A match was found.
 293                                    Sort by type of match (exact, begin, end, other)
 294                                    LOGIC:
 295                                    - Search for a "begins with".
 296                                    - If it matches, check for "exact" (string length will work).
 297                                    - Check for an "ends with" (parens may be a prob??)
 298                                    - Dump all others into "other" */
 299
 300                                 /* FIRST: We need to get our dictionary bounds.  Check for an
 301                                    opening parenthesis, and if present, skip past it. */
 302                                 indexDataStart = 0;
 303                                 c = (*vSubIt)[indexDataStart];
 304                                 if(c == '(' || c == '{') {
 305                                         /* Parens found.  Loop until we reach the beginning of the
 306                                            real data. */
 307                                         while(1) {
 308                                                 /* Get first non-space char past the end parenthesis. */
 309                                                 if(c=='(') {
 310                                                         if(GetIndexAfterParens(*vSubIt, indexDataStart,
 311                                                                                                    indexDataStart, '(', ')')) {
 312                                                                 while(isspace((*vSubIt)[indexDataStart]))
 313                                                                         indexDataStart++;
 314                                                         }
 315                                                 } else if(c=='{') {
 316                                                         if(GetIndexAfterParens(*vSubIt, indexDataStart,
 317                                                                                                    indexDataStart, '{', '}')) {
 318                                                                 while(isspace((*vSubIt)[indexDataStart]))
 319                                                                         indexDataStart++;
 320                                                         }
 321                                                 } else break;
 322                                                 c = (*vSubIt)[indexDataStart];
 323                                         }
 324                                 }
 325
 326                                 /* Get the ending bound.
 327                                    NOTE: Currently this is only done for Japanese entries.
 328                                    English entries ending with ()'s will not omit them.  This is
 329                                    deliberate. */
 330                                 indexDataEnd = vSubIt->length()-1;
 331                                 if(!englishSearch) {
 332                                         c = (*vSubIt)[indexDataEnd];
 333                                         if(c == ')' || c == '}') {
 334                                                 /* Parens found.  Loop until we reach the beginning of
 335                                                    the real data. */
 336                                                 while(1) {
 337                                                         /* Get first non-space char past the end
 338                                                            parenthesis. */
 339                                                         if(c==')') {
 340                                                                 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
 341                                                                                                                 indexDataEnd, '(', ')')) {
 342                                                                         while(isspace((*vSubIt)[indexDataEnd]))
 343                                                                                 indexDataEnd--;
 344                                                                 }
 345                                                         } else if(c=='}') {
 346                                                                 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
 347                                                                                                                 indexDataEnd, '{', '}')) {
 348                                                                         while(isspace((*vSubIt)[indexDataEnd]))
 349                                                                                 indexDataEnd--;
 350                                                                 }
 351                                                         } else break;
 352                                                         c = (*vSubIt)[indexDataEnd];
 353                                                 }
 354                                         }
 355                                 }
 356
 357                                 /* Now, we apply the logic we specified at the beginning of this
 358                                    block. */
 359                                 if(indexSubstr==indexDataStart) {
 360                                         priorityLevel = max(priorityLevel, priorityBeginsWith);
 361                                         if(utfQuery.length()==indexDataEnd+1 - indexDataStart) {
 362                                                 priorityLevel = max(priorityLevel, priorityExact);
 363                                         } else {
 364                                         }
 365                                 } else if(indexSubstr == indexDataEnd+1 - utfQuery.length()) {
 366                                         priorityLevel = max(priorityLevel, priorityEndsWith);
 367                                 } else {
 368                                         priorityLevel = max(priorityLevel, priorityOther);
 369                                 }
 370                         }
 371                 }
 372                 /* Add to appropriate list */
 373                 if(priorityLevel>=0) {
 374                         if(priorityResults[0].size()
 375                           +priorityResults[1].size()
 376                           +priorityResults[2].size()
 377                           +priorityResults[3].size()< SEARCH_MAX) {
 378                                 priorityResults[priorityLevel].push_back(i);
 379                         } else {
 380 #ifdef DEBUG
 381                                 printf("PANIC: SEARCH_MAX results reached!\n");
 382 #endif
 383                                 wxMessageBox(wxString::Format(_T("Over %d results were found.  The search has been stopped."), SEARCH_MAX),
 384                                                          _T("Excessive search results"),
 385                                                          wxOK | wxICON_INFORMATION, NULL);
 386                                 break;
 387                         }
 388                 }
 389
 390                 entryData.clear();
 391                 i++;
 392         }
 393
 394         /* Combine results into one list, based upon priority. */
 395         list<int>::iterator lIt;
 396         for(i=3;i>=0;i--) {
 397                 for(lIt=priorityResults[i].begin();
 398                   lIt!=priorityResults[i].end(); lIt++) {
 399                         results.push_back(*lIt);
 400                 }
 401         }
 402
 403 #ifdef DEBUG
 404         printf("Search result count: %d\n", results.size());
 405 #endif
 406         if(results.size()>0) return true;
 407         return false;
 408 }
 409
 410 wxString Edict::ResultToHTML(const wxString& rawResult) {
 411         wxString token, subToken, jStr, eStr, htmlStr;
 412         wxStringTokenizer tk(rawResult, _T("\n"));
 413         size_t indexSlash, indexNextSlash, indexBreak;
 414         while(tk.HasMoreTokens()) {
 415                 token = tk.GetNextToken();
 416                 htmlStr.append(_T("<p>"));
 417
 418                 indexSlash = token.find_first_of(_T('/'));
 419                 if(indexSlash==wxString::npos) {
 420                         /* Fail-safe: just display the raw string */
 421                         htmlStr.append(token);
 422                 } else {
 423                         htmlStr.append(_T("<b>Japanese:</b> <font size=\"6\">"));
 424                         /*htmlStr.append(token.substr(0,indexSlash));*/
 425                         jStr = token.substr(0,indexSlash);
 426
 427                         indexBreak = jStr.find_first_of(_T(';'));
 428                         while(indexBreak!=wxString::npos) {
 429                                 /*jStr[indexBreak]=_T(", ");*/
 430                                 jStr.replace(indexBreak,1,_T(", "),0,2);
 431                                 indexBreak = jStr.find_first_of(_T(';'));
 432                         }
 433
 434                         htmlStr.append(jStr);
 435                         htmlStr.append(_T("</font><br>"));
 436
 437                         htmlStr.append(_T("<b>English:</b> "));
 438                         eStr.clear();
 439                         while(indexSlash!=wxString::npos) {
 440                                 indexNextSlash = token.find_first_of(_T('/'), indexSlash+1);
 441                                 if(indexNextSlash==wxString::npos)
 442                                         subToken = token.substr(indexSlash+1);
 443                                 else
 444                                         subToken = token.substr(indexSlash+1,
 445                                                                                         indexNextSlash-1 - indexSlash);
 446                                 if(subToken.length()>0) {
 447                                         if(eStr.length()>0)
 448                                                 eStr.append(_T("; "));
 449                                         eStr.append(subToken);
 450                                 }
 451                                 indexSlash = indexNextSlash;
 452                         }
 453                         htmlStr.append(eStr);
 454                 }
 455                 htmlStr.append(_T("</p>"));
 456         }
 457
 458         return htmlStr;
 459 }
 460
 461 void Edict::GetEnglish(const string& edictStr, vector<string>& dest) {
 462         char *tokenizedString = new char[edictStr.length()+1];
 463         char *token;
 464
 465         strcpy(tokenizedString, edictStr.c_str());
 466         token = strtok(tokenizedString, "/");
 467         /* Skip to the second token, since the first is just the Japanese readings */
 468         if(token) {
 469                 token = strtok(NULL, "/");
 470         }
 471
 472         while(token) {
 473                 if(strlen(token)>0) dest.push_back(token);
 474                 token = strtok(NULL, "/");
 475         }
 476
 477         delete[] tokenizedString;
 478 }
 479
 480 void Edict::GetJapanese(const string& edictStr, vector<string>& dest) {
 481         /* Grab the portion of the string relevant for Japanese readings */
 482         size_t indexFinal = edictStr.find_first_of('/');
 483         if(indexFinal==string::npos) indexFinal = edictStr.length();
 484         string jStr = edictStr.substr(0, indexFinal);
 485         string temp;
 486
 487         /* The data is too complex for a simple tokenization because strings within
 488            parentheses may contain characters normally used for breaking up the
 489            tokens.  So, the logic here is a little more complex. */
 490         size_t index, indexBreak, indexParen, indexStart=0;
 491         size_t len=jStr.length();
 492
 493         index = indexStart;
 494         while(indexStart<len) {
 495                 while(true) {
 496                         indexBreak = jStr.find_first_of(";[] ", index);
 497                         indexParen = jStr.find_first_of('(', index);
 498
 499                         /* Valid String Breaks */
 500                         /* If no parentheses are found, then indexBreak indicates our
 501                            bounds properly. */
 502                         if(indexParen==string::npos) break;
 503                         /* If parentheses ARE found, then we want to process them...
 504                            UNLESS a break char is found before the parenthesis. */
 505                         if(indexBreak!=string::npos && indexBreak<indexParen) break;
 506
 507                         /* Skip the parentheses and set index equal to the index following
 508                            the ')' character. */
 509                         if(!GetIndexAfterParens(jStr, indexParen, index)) {
 510                                 indexBreak = string::npos;
 511                                 break;
 512                         }
 513                 }
 514
 515                 if(indexBreak==string::npos) {
 516                         temp = jStr.substr(indexStart);
 517                 } else {
 518                         temp = jStr.substr(indexStart, indexBreak-indexStart);
 519                 }
 520                 if(temp.length()>0) dest.push_back(temp);
 521
 522                 /* Return if either indexBreak or index == string::npos.
 523                    index==string::npos:
 524                      This happens either if indexStart == string::npos, or if
 525                          parsing parentheses and we can't find a closing parenthesis.
 526                    indexBreak==string::npos:
 527                      This happens if the substring continues to the end of the source
 528                          string. */
 529                 if(index==string::npos
 530                 || indexBreak==string::npos) {
 531                         return;
 532                 }
 533
 534                 /* Iterate relevant vars for next iteration */
 535                 indexStart = indexBreak+1;
 536                 index = indexStart;
 537         }
 538 }
 539
 540 string Edict::GetEdictString(int i) const { return edictData[i]; }