Major directory structure changes, plus a bug fix.
[jben.git] / src / wdict.cpp
blob4295fcb8ead223e9a88de2f961571e5df871e007
/*
Project: J-Ben
Author:  Paul Goins
Website: http://www.vultaire.net/software/jben/
License: GNU General Public License (GPL) version 2
         (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)

File: wdict.cpp

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>
*/
24 #include "wdict.h"
25 #include "file_utils.h"
26 #include "jutils.h"
27 #include "string_utils.h"
28 #include "encoding_convert.h"
29 #include "errorlog.h"
30 #include "preferences.h"
31 #include <set>
32 #include <list>
33 #include <algorithm>
34 #include <cstring>
35 #include <fstream>
36 #include <string>
37 #include <sstream>
38 using namespace std;
/* Directory prefix used for the dictionary file that is tried when the
   configured EDICT2 path cannot be loaded.  It is deliberately a
   string-literal macro so that it can be joined to a file name by literal
   concatenation, as in FALLBACK_DICTDIR "edict2".  The __WXMSW__ branch uses
   a backslash as the path separator. */
#ifdef __WXMSW__
#	define FALLBACK_DICTDIR "dicts\\"
#else
#	define FALLBACK_DICTDIR "dicts/"
#endif
/* SEARCH_MAX is the hard-coded cutoff for the number of results a single
   search may collect.  It should be high enough not to interfere with normal
   "single page" operation, but low enough that an accidental search (for
   example, for the single letter "e") does not leave the user waiting for a
   minute or so.

   The most commonly used kanji in EDICT2 appears to be 人, at 1889
   characters, so the panic break-off point is set just above that, at
   2000. */
#define SEARCH_MAX 2000
55 WDict* WDict::wdictSingleton = NULL;
57 const WDict *WDict::Get() {
58 if(!wdictSingleton)
59 wdictSingleton = new WDict;
60 return wdictSingleton;
63 WDict::WDict() {
64 Preferences *p = Preferences::Get();
65 if(LoadEdict2(p->GetSetting("wdict_edict2").c_str())!=ED_SUCCESS)
66 LoadEdict2(FALLBACK_DICTDIR "edict2");
69 void WDict::Destroy() {
70 if(wdictSingleton) {
71 delete wdictSingleton;
72 wdictSingleton = NULL;
76 int WDict::LoadEdict2(const char *filename) {
77 char *rawData = NULL;
78 unsigned int size;
79 int returnCode = 0xDEADBEEF;
81 ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
82 if(ifile) {
83 size = ifile.tellg();
84 ifile.seekg(0);
85 rawData = new char[size+1];
86 rawData[size] = '\0';
87 ifile.read(rawData, size);
88 #ifdef DEBUG
89 if(strlen(rawData)!=size) {
90 ostringstream os;
91 os << "edict file size: " << strlen(rawData) << ", read-in string: " << size;
92 el.Push(EL_Warning, os.str());
94 #endif
96 /* Create the kanjidic object with our string data. */
97 this->Edict2Parser(rawData);
99 returnCode = ED_SUCCESS;
101 else
102 returnCode = ED_FAILURE;
104 if(rawData) delete[] rawData;
105 return returnCode;
108 /* EDICT2 parser for WDict. Takes a wstring containing the contents of
109 an EDICT- or EDICT2-formatted dictionary, and adds its contents to an
110 internal data struct. This function also indexes the data, although ideally
111 the indexing functionality should be externalized so it may be called later,
112 like if another dictionary is added into the same WDict object at a later
113 point. */
114 void WDict::Edict2Parser(char *edictRawData) {
115 char *token;
116 wstring wToken;
118 int vIndex = -1; /* edict vector index */
119 wstring sTemp;
121 /* Store raw EDICT data, plus store references by kanji/reading into ordered
122 set */
123 token = strtok(edictRawData, "\n");
124 while(token) {
125 if(strlen(token)>0) {
126 /* 0. Make wstring copy of the token */
127 wToken = utfconv_mw(token);
128 /* 1. Store full string in vector */
129 edictData.push_back(token);
130 vIndex++;
133 token = strtok(NULL, "\n");
134 } /* while has more tokens */
137 WDict::~WDict() {
138 /* Currently, nothing needs to be done here. */
141 bool WDict::Search(const wstring& query, list<int>& results,
142 unsigned int searchType) const {
143 list<int> priorityResults[4];
144 bool englishSearch;
145 bool isFurigana; /* Not sure this is necessary - currently set but not
146 used. May be a performance accelerator at cost of
147 code complexity. */
148 int priorityExact, priorityBeginsWith, priorityEndsWith, priorityOther;
149 vector<string>::const_iterator vIt;
151 if(query.length()==0) {
152 #ifdef DEBUG
153 printf("[%s:%d] Empty string passed into WDict::Search. (Not a problem!)\n", __FILE__, __LINE__);
154 #endif
155 return false;
158 /* Get our search priorities set up */
159 int i;
160 unsigned int uTemp;
162 /* Default priority is -1, "not used" */
163 /* Lowest priority is 0, and highest will be 3. */
164 /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
165 priorityExact = priorityBeginsWith = priorityEndsWith = priorityOther = -1;
166 for(i=0;i<4;i++) {
167 uTemp = (searchType >> ((3-i)*8)) & 0xFF;
168 if(uTemp == EDS_EXACT) priorityExact = i;
169 else if(uTemp == EDS_BEGIN) priorityBeginsWith = i;
170 else if(uTemp == EDS_END) priorityEndsWith = i;
171 else if(uTemp == EDS_ANY) priorityOther = i;
172 else if(uTemp == 0) { /* Do nothing; no preferred search method for
173 this level */ }
174 else {
175 #ifdef DEBUG
176 fprintf(stderr, "Unknown search type for priority level %d: %X\n", i+1, uTemp);
177 #endif
181 /* Store first char. This determines whether we're doing an E-J or J-E
182 search. */
183 wchar_t firstChar = query[0];
184 /* Using a very, very simple check: is it just a 7-bit char? */
185 englishSearch = ( ((unsigned)firstChar) <= 0x7F );
186 if(!englishSearch) {
187 isFurigana=true;
188 for(wstring::const_iterator stringIt = query.begin();
189 stringIt!=query.end(); stringIt++) {
190 isFurigana = (IsFurigana(*stringIt));
191 if(!isFurigana) break;
195 /* Main search code begins below */
196 /* NOTE: I think this can be cleaned up. I don't think the vector for
197 entryData below is needed; a simple string should suffice and a loop can
198 be removed. I'll look at this later since I'm busy with something else
199 at the moment. */
201 vector<string> entryData; /* Stores the English/Japanese components of
202 an EDICT string. */
203 string utfQuery, lwrQuery, lwrData;
204 vector<string>::iterator vSubIt;
205 size_t indexSubstr, indexDataStart, indexDataEnd;
206 int priorityLevel;
207 char c;
209 utfQuery = utfconv_wm(query);
210 lwrQuery = ToLower(utfQuery); /* For English searching, store a
211 lowercase query */
212 i = 0;
214 for(vIt=edictData.begin(); vIt!=edictData.end(); vIt++) {
215 priorityLevel = -1; /* -1 == not a match*/
216 if(englishSearch) {
217 GetEnglish(*vIt, entryData);
218 } else {
219 GetJapanese(*vIt, entryData);
222 for(vSubIt=entryData.begin(); vSubIt!=entryData.end(); vSubIt++) {
223 if(englishSearch) {
224 /* English searching requires 2 special conditions:
225 1. Case-insensitive searching (maybe optional, later)
226 2. Recognition of word bounds (so we don't match character
227 sequences inside of a word.) */
229 /* Convert target string to lower case */
230 lwrData = ToLower(*vSubIt);
232 /* Find the first match that is bounded by non-alpha characters
233 or beginning/end of string. */
234 indexSubstr = lwrData.find(lwrQuery, 0);
235 while(indexSubstr!=string::npos) {
236 #ifdef DEBUG
237 #if 0
238 /* printf("Checking possible match:\n"
239 "Query: [%s]\n"
240 "Data string: [%s]\n"
241 "Index of match: %d\n",
242 lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
243 ostringstream oss;
244 oss << "Checking possible match:\n"
245 << "Query: " << lwrQuery << "\n"
246 << "Data string: " << lwrData << "\n"
247 << "Index of match: " << indexSubstr;
248 el.Push(EL_Info, oss.str());
249 #endif
250 #endif
252 /* Check for beginning of data string or preceding
253 non-alpha char */
254 (indexSubstr==0 || !isalpha(lwrData[indexSubstr-1])) &&
255 /* Check for end of data string or following non-alpha
256 char */
257 (indexSubstr+lwrQuery.length() == lwrData.length() ||
258 !isalpha(lwrData[indexSubstr+lwrQuery.length()]))
259 ) break;
260 /* If the match didn't meet all the above criteria, try to
261 find the next one. */
262 #ifdef DEBUG
263 printf("Match not good. Displaying verbose data:\n");
264 if(indexSubstr==0)
265 printf("* Beginning of query matches beginning of data. (OK)\n");
266 else if(!isalpha(lwrData[indexSubstr-1]))
267 printf("* Preceding character '%c' is non-alpha. (OK)\n",
268 lwrData[indexSubstr-1]);
269 else
270 printf("* Start match is invalid. (FAIL)\n");
271 if(indexSubstr+lwrQuery.length() == lwrData.length())
272 printf("* End of query matches end of data. (OK)\n");
273 else if(!isalpha(lwrData[indexSubstr+lwrQuery.length()]))
274 printf("* Following character '%c' is non-alpha. (OK)\n",
275 lwrData[indexSubstr+lwrQuery.length()]);
276 else
277 printf("* End match is invalid. (FAIL)\n");
279 #endif
280 indexSubstr = lwrData.find(lwrQuery, indexSubstr+1);
282 } else {
283 indexSubstr = vSubIt->find(utfQuery, 0);
284 #ifdef DEBUG
285 #if 0
286 /* printf("Checking possible match:\n"
287 "Query: [%s]\n"
288 "Data string: [%s]\n"
289 "Index of match: %d\n",
290 lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
291 ostringstream oss;
292 oss << "Checking possible match:\n"
293 << "Query: " << utfQuery << "\n"
294 << "Data string: " << *vSubIt << "\n"
295 << "Index of match: " << indexSubstr;
296 el.Push(EL_Info, oss.str());
297 #endif
298 #endif
300 if(indexSubstr!=string::npos) {
301 /* A match was found.
302 Sort by type of match (exact, begin, end, other)
303 LOGIC:
304 - Search for a "begins with".
305 - If it matches, check for "exact" (string length will work).
306 - Check for an "ends with" (parens may be a prob??)
307 - Dump all others into "other" */
309 /* FIRST: We need to get our dictionary bounds. Check for an
310 opening parenthesis, and if present, skip past it. */
311 indexDataStart = 0;
312 c = (*vSubIt)[indexDataStart];
313 if(c == '(' || c == '{') {
314 /* Parens found. Loop until we reach the beginning of the
315 real data. */
316 while(1) {
317 /* Get first non-space char past the end parenthesis. */
318 if(c=='(') {
319 if(GetIndexAfterParens(*vSubIt, indexDataStart,
320 indexDataStart, '(', ')')) {
321 while(isspace((*vSubIt)[indexDataStart]))
322 indexDataStart++;
324 } else if(c=='{') {
325 if(GetIndexAfterParens(*vSubIt, indexDataStart,
326 indexDataStart, '{', '}')) {
327 while(isspace((*vSubIt)[indexDataStart]))
328 indexDataStart++;
330 } else break;
331 c = (*vSubIt)[indexDataStart];
335 /* Get the ending bound.
336 NOTE: Currently this is only done for Japanese entries.
337 English entries ending with ()'s will not omit them. This is
338 deliberate. */
339 indexDataEnd = vSubIt->length()-1;
340 if(!englishSearch) {
341 c = (*vSubIt)[indexDataEnd];
342 if(c == ')' || c == '}') {
343 /* Parens found. Loop until we reach the beginning of
344 the real data. */
345 while(1) {
346 /* Get first non-space char past the end
347 parenthesis. */
348 if(c==')') {
349 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
350 indexDataEnd, '(', ')')) {
351 while(isspace((*vSubIt)[indexDataEnd]))
352 indexDataEnd--;
354 } else if(c=='}') {
355 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
356 indexDataEnd, '{', '}')) {
357 while(isspace((*vSubIt)[indexDataEnd]))
358 indexDataEnd--;
360 } else break;
361 c = (*vSubIt)[indexDataEnd];
366 /* Now, we apply the logic we specified at the beginning of this
367 block. */
368 if(indexSubstr==indexDataStart) {
369 priorityLevel = max(priorityLevel, priorityBeginsWith);
370 if(utfQuery.length()==indexDataEnd+1 - indexDataStart) {
371 priorityLevel = max(priorityLevel, priorityExact);
372 } else {
374 } else if(indexSubstr == indexDataEnd+1 - utfQuery.length()) {
375 priorityLevel = max(priorityLevel, priorityEndsWith);
376 } else {
377 priorityLevel = max(priorityLevel, priorityOther);
381 /* Add to appropriate list */
382 if(priorityLevel>=0) {
383 if(priorityResults[0].size()
384 +priorityResults[1].size()
385 +priorityResults[2].size()
386 +priorityResults[3].size()< SEARCH_MAX) {
387 priorityResults[priorityLevel].push_back(i);
388 } else {
389 ostringstream os;
390 os << "Over " << SEARCH_MAX
391 << " results were found. The search has been stopped.";
392 el.Push(EL_Info, os.str());
393 break;
397 entryData.clear();
398 i++;
401 /* Combine results into one list, based upon priority. */
402 list<int>::iterator lIt;
403 for(i=3;i>=0;i--) {
404 for(lIt=priorityResults[i].begin();
405 lIt!=priorityResults[i].end(); lIt++) {
406 results.push_back(*lIt);
410 #ifdef DEBUG
411 printf("Search result count: %d\n", results.size());
412 #endif
413 if(results.size()>0) return true;
414 return false;
417 wstring WDict::ResultToHTML(const wstring& rawResult) {
418 wstring token, subToken, jStr, eStr, htmlStr;
419 list<wstring> tk = StrTokenize(rawResult, L"\n");
420 size_t indexSlash, indexNextSlash, indexBreak;
421 while(tk.size()>0) {
422 token = tk.front();
423 tk.pop_front();
424 htmlStr.append(L"<p>");
426 indexSlash = token.find_first_of(L'/');
427 if(indexSlash==wstring::npos) {
428 /* Fail-safe: just display the raw string */
429 htmlStr.append(token);
430 } else {
431 htmlStr.append(L"<b>Japanese:</b> <font size=\"6\">");
432 /*htmlStr.append(token.substr(0,indexSlash));*/
433 jStr = token.substr(0,indexSlash);
435 indexBreak = jStr.find_first_of(L';');
436 while(indexBreak!=wstring::npos) {
437 /*jStr[indexBreak]=L", ";*/
438 jStr.replace(indexBreak,1,L", ",0,2);
439 indexBreak = jStr.find_first_of(L';');
442 htmlStr.append(jStr);
443 htmlStr.append(L"</font><br>");
445 htmlStr.append(L"<b>English:</b> ");
446 eStr.clear();
447 while(indexSlash!=wstring::npos) {
448 indexNextSlash = token.find_first_of(L'/', indexSlash+1);
449 if(indexNextSlash==wstring::npos)
450 subToken = token.substr(indexSlash+1);
451 else
452 subToken = token.substr(indexSlash+1,
453 indexNextSlash-1 - indexSlash);
454 if(subToken.length()>0) {
455 if(eStr.length()>0)
456 eStr.append(L"; ");
457 eStr.append(subToken);
459 indexSlash = indexNextSlash;
461 htmlStr.append(eStr);
463 htmlStr.append(L"</p>");
466 return htmlStr;
469 void WDict::GetEnglish(const string& edictStr, vector<string>& dest) {
470 char *tokenizedString = new char[edictStr.length()+1];
471 char *token;
473 strcpy(tokenizedString, edictStr.c_str());
474 token = strtok(tokenizedString, "/");
475 /* Skip to the second token, since the first is just the Japanese readings */
476 if(token) {
477 token = strtok(NULL, "/");
480 while(token) {
481 if(strlen(token)>0) dest.push_back(token);
482 token = strtok(NULL, "/");
485 delete[] tokenizedString;
488 void WDict::GetJapanese(const string& edictStr, vector<string>& dest) {
489 /* Grab the portion of the string relevant for Japanese readings */
490 size_t indexFinal = edictStr.find_first_of('/');
491 if(indexFinal==string::npos) indexFinal = edictStr.length();
492 string jStr = edictStr.substr(0, indexFinal);
493 string temp;
495 /* The data is too complex for a simple tokenization because strings within
496 parentheses may contain characters normally used for breaking up the
497 tokens. So, the logic here is a little more complex. */
498 size_t index, indexBreak, indexParen, indexStart=0;
499 size_t len=jStr.length();
501 index = indexStart;
502 while(indexStart<len) {
503 while(true) {
504 indexBreak = jStr.find_first_of(";[] ", index);
505 indexParen = jStr.find_first_of('(', index);
507 /* Valid String Breaks */
508 /* If no parentheses are found, then indexBreak indicates our
509 bounds properly. */
510 if(indexParen==string::npos) break;
511 /* If parentheses ARE found, then we want to process them...
512 UNLESS a break char is found before the parenthesis. */
513 if(indexBreak!=string::npos && indexBreak<indexParen) break;
515 /* Skip the parentheses and set index equal to the index following
516 the ')' character. */
517 if(!GetIndexAfterParens(jStr, indexParen, index)) {
518 indexBreak = string::npos;
519 break;
523 if(indexBreak==string::npos) {
524 temp = jStr.substr(indexStart);
525 } else {
526 temp = jStr.substr(indexStart, indexBreak-indexStart);
528 if(temp.length()>0) dest.push_back(temp);
530 /* Return if either indexBreak or index == string::npos.
531 index==string::npos:
532 This happens either if indexStart == string::npos, or if
533 parsing parentheses and we can't find a closing parenthesis.
534 indexBreak==string::npos:
535 This happens if the substring continues to the end of the source
536 string. */
537 if(index==string::npos
538 || indexBreak==string::npos) {
539 return;
542 /* Iterate relevant vars for next iteration */
543 indexStart = indexBreak+1;
544 index = indexStart;
548 string WDict::GetEdictString(int i) const { return edictData[i]; }
550 bool WDict::MainDataLoaded() const {
551 if(edictData.size()>0) return true;
552 return false;