4 Website: http://www.vultaire.net/software/jben/
5 License: GNU General Public License (GPL) version 2
6 (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>
25 #include "file_utils.h"
27 #include "string_utils.h"
28 #include "encoding_convert.h"
30 #include "preferences.h"
41 # define FALLBACK_DICTDIR "dicts\\"
43 # define FALLBACK_DICTDIR "dicts/"
/* SEARCH_MAX is our hard-coded cutoff point for searches. It should be high
   enough not to interfere with normal "single page" operation, but it should
   also prevent the user from doing something too stupid and having to wait a
   minute or so because they searched for the letter "e" by mistake.

   The most commonly used kanji in EDICT2 appears to be 人, at 1889 matching
   entries. Thus, let's make our panic breakoff point at 2000 results. */
53 #define SEARCH_MAX 2000
55 WDict
* WDict::wdictSingleton
= NULL
;
57 const WDict
*WDict::Get() {
59 wdictSingleton
= new WDict
;
60 return wdictSingleton
;
64 Preferences
*p
= Preferences::Get();
65 if(LoadEdict2(p
->GetSetting("wdict_edict2").c_str())!=ED_SUCCESS
)
66 LoadEdict2(FALLBACK_DICTDIR
"edict2");
69 void WDict::Destroy() {
71 delete wdictSingleton
;
72 wdictSingleton
= NULL
;
76 int WDict::LoadEdict2(const char *filename
) {
79 int returnCode
= 0xDEADBEEF;
81 ifstream
ifile(filename
, ios::ate
); /* "at end" to get our file size */
85 rawData
= new char[size
+1];
87 ifile
.read(rawData
, size
);
89 if(strlen(rawData
)!=size
) {
91 os
<< "edict file size: " << strlen(rawData
) << ", read-in string: " << size
;
92 el
.Push(EL_Warning
, os
.str());
/* Parse the raw dictionary data into this WDict object. */
97 this->Edict2Parser(rawData
);
99 returnCode
= ED_SUCCESS
;
102 returnCode
= ED_FAILURE
;
104 if(rawData
) delete[] rawData
;
/* EDICT2 parser for WDict. Takes a NUL-terminated character buffer containing
   the contents of an EDICT- or EDICT2-formatted dictionary, and adds its
   contents to an internal data struct. The buffer is split into lines in place
   with strtok, so it is modified by the call. This function also indexes the
   data, although ideally the indexing functionality should be externalized so
   it may be called later, like if another dictionary is added into the same
   WDict object at a later time. */
114 void WDict::Edict2Parser(char *edictRawData
) {
118 int vIndex
= -1; /* edict vector index */
121 /* Store raw EDICT data, plus store references by kanji/reading into ordered
123 token
= strtok(edictRawData
, "\n");
125 if(strlen(token
)>0) {
126 /* 0. Make wstring copy of the token */
127 wToken
= utfconv_mw(token
);
128 /* 1. Store full string in vector */
129 edictData
.push_back(token
);
133 token
= strtok(NULL
, "\n");
134 } /* while has more tokens */
138 /* Currently, nothing needs to be done here. */
141 bool WDict::Search(const wstring
& query
, list
<int>& results
,
142 unsigned int searchType
) const {
143 list
<int> priorityResults
[4];
145 bool isFurigana
; /* Not sure this is necessary - currently set but not
146 used. May be a performance accelerator at cost of
148 int priorityExact
, priorityBeginsWith
, priorityEndsWith
, priorityOther
;
149 vector
<string
>::const_iterator vIt
;
151 if(query
.length()==0) {
153 printf("[%s:%d] Empty string passed into WDict::Search. (Not a problem!)\n", __FILE__
, __LINE__
);
158 /* Get our search priorities set up */
162 /* Default priority is -1, "not used" */
163 /* Lowest priority is 0, and highest will be 3. */
164 /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
165 priorityExact
= priorityBeginsWith
= priorityEndsWith
= priorityOther
= -1;
167 uTemp
= (searchType
>> ((3-i
)*8)) & 0xFF;
168 if(uTemp
== EDS_EXACT
) priorityExact
= i
;
169 else if(uTemp
== EDS_BEGIN
) priorityBeginsWith
= i
;
170 else if(uTemp
== EDS_END
) priorityEndsWith
= i
;
171 else if(uTemp
== EDS_ANY
) priorityOther
= i
;
172 else if(uTemp
== 0) { /* Do nothing; no preferred search method for
176 fprintf(stderr
, "Unknown search type for priority level %d: %X\n", i
+1, uTemp
);
181 /* Store first char. This determines whether we're doing an E-J or J-E
183 wchar_t firstChar
= query
[0];
184 /* Using a very, very simple check: is it just a 7-bit char? */
185 englishSearch
= ( ((unsigned)firstChar
) <= 0x7F );
188 for(wstring::const_iterator stringIt
= query
.begin();
189 stringIt
!=query
.end(); stringIt
++) {
190 isFurigana
= (IsFurigana(*stringIt
));
191 if(!isFurigana
) break;
195 /* Main search code begins below */
196 /* NOTE: I think this can be cleaned up. I don't think the vector for
197 entryData below is needed; a simple string should suffice and a loop can
198 be removed. I'll look at this later since I'm busy with something else
201 vector
<string
> entryData
; /* Stores the English/Japanese components of
203 string utfQuery
, lwrQuery
, lwrData
;
204 vector
<string
>::iterator vSubIt
;
205 size_t indexSubstr
, indexDataStart
, indexDataEnd
;
209 utfQuery
= utfconv_wm(query
);
210 lwrQuery
= ToLower(utfQuery
); /* For English searching, store a
214 for(vIt
=edictData
.begin(); vIt
!=edictData
.end(); vIt
++) {
215 priorityLevel
= -1; /* -1 == not a match*/
217 GetEnglish(*vIt
, entryData
);
219 GetJapanese(*vIt
, entryData
);
222 for(vSubIt
=entryData
.begin(); vSubIt
!=entryData
.end(); vSubIt
++) {
224 /* English searching requires 2 special conditions:
225 1. Case-insensitive searching (maybe optional, later)
226 2. Recognition of word bounds (so we don't match character
227 sequences inside of a word.) */
229 /* Convert target string to lower case */
230 lwrData
= ToLower(*vSubIt
);
232 /* Find the first match that is bounded by non-alpha characters
233 or beginning/end of string. */
234 indexSubstr
= lwrData
.find(lwrQuery
, 0);
235 while(indexSubstr
!=string::npos
) {
238 /* printf("Checking possible match:\n"
240 "Data string: [%s]\n"
241 "Index of match: %d\n",
242 lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
244 oss
<< "Checking possible match:\n"
245 << "Query: " << lwrQuery
<< "\n"
246 << "Data string: " << lwrData
<< "\n"
247 << "Index of match: " << indexSubstr
;
248 el
.Push(EL_Info
, oss
.str());
252 /* Check for beginning of data string or preceding
254 (indexSubstr
==0 || !isalpha(lwrData
[indexSubstr
-1])) &&
255 /* Check for end of data string or following non-alpha
257 (indexSubstr
+lwrQuery
.length() == lwrData
.length() ||
258 !isalpha(lwrData
[indexSubstr
+lwrQuery
.length()]))
260 /* If the match didn't meet all the above criteria, try to
261 find the next one. */
263 printf("Match not good. Displaying verbose data:\n");
265 printf("* Beginning of query matches beginning of data. (OK)\n");
266 else if(!isalpha(lwrData
[indexSubstr
-1]))
267 printf("* Preceding character '%c' is non-alpha. (OK)\n",
268 lwrData
[indexSubstr
-1]);
270 printf("* Start match is invalid. (FAIL)\n");
271 if(indexSubstr
+lwrQuery
.length() == lwrData
.length())
272 printf("* End of query matches end of data. (OK)\n");
273 else if(!isalpha(lwrData
[indexSubstr
+lwrQuery
.length()]))
274 printf("* Following character '%c' is non-alpha. (OK)\n",
275 lwrData
[indexSubstr
+lwrQuery
.length()]);
277 printf("* End match is invalid. (FAIL)\n");
280 indexSubstr
= lwrData
.find(lwrQuery
, indexSubstr
+1);
283 indexSubstr
= vSubIt
->find(utfQuery
, 0);
286 /* printf("Checking possible match:\n"
288 "Data string: [%s]\n"
289 "Index of match: %d\n",
290 lwrQuery.c_str(), lwrData.c_str(), indexSubstr);*/
292 oss
<< "Checking possible match:\n"
293 << "Query: " << utfQuery
<< "\n"
294 << "Data string: " << *vSubIt
<< "\n"
295 << "Index of match: " << indexSubstr
;
296 el
.Push(EL_Info
, oss
.str());
300 if(indexSubstr
!=string::npos
) {
301 /* A match was found.
302 Sort by type of match (exact, begin, end, other)
304 - Search for a "begins with".
305 - If it matches, check for "exact" (string length will work).
306 - Check for an "ends with" (parens may be a prob??)
307 - Dump all others into "other" */
309 /* FIRST: We need to get our dictionary bounds. Check for an
310 opening parenthesis, and if present, skip past it. */
312 c
= (*vSubIt
)[indexDataStart
];
313 if(c
== '(' || c
== '{') {
314 /* Parens found. Loop until we reach the beginning of the
317 /* Get first non-space char past the end parenthesis. */
319 if(GetIndexAfterParens(*vSubIt
, indexDataStart
,
320 indexDataStart
, '(', ')')) {
321 while(isspace((*vSubIt
)[indexDataStart
]))
325 if(GetIndexAfterParens(*vSubIt
, indexDataStart
,
326 indexDataStart
, '{', '}')) {
327 while(isspace((*vSubIt
)[indexDataStart
]))
331 c
= (*vSubIt
)[indexDataStart
];
335 /* Get the ending bound.
336 NOTE: Currently this is only done for Japanese entries.
337 English entries ending with ()'s will not omit them. This is
339 indexDataEnd
= vSubIt
->length()-1;
341 c
= (*vSubIt
)[indexDataEnd
];
342 if(c
== ')' || c
== '}') {
343 /* Parens found. Loop until we reach the beginning of
346 /* Get first non-space char past the end
349 if(GetIndexBeforeParens(*vSubIt
, indexDataEnd
,
350 indexDataEnd
, '(', ')')) {
351 while(isspace((*vSubIt
)[indexDataEnd
]))
355 if(GetIndexBeforeParens(*vSubIt
, indexDataEnd
,
356 indexDataEnd
, '{', '}')) {
357 while(isspace((*vSubIt
)[indexDataEnd
]))
361 c
= (*vSubIt
)[indexDataEnd
];
366 /* Now, we apply the logic we specified at the beginning of this
368 if(indexSubstr
==indexDataStart
) {
369 priorityLevel
= max(priorityLevel
, priorityBeginsWith
);
370 if(utfQuery
.length()==indexDataEnd
+1 - indexDataStart
) {
371 priorityLevel
= max(priorityLevel
, priorityExact
);
374 } else if(indexSubstr
== indexDataEnd
+1 - utfQuery
.length()) {
375 priorityLevel
= max(priorityLevel
, priorityEndsWith
);
377 priorityLevel
= max(priorityLevel
, priorityOther
);
381 /* Add to appropriate list */
382 if(priorityLevel
>=0) {
383 if(priorityResults
[0].size()
384 +priorityResults
[1].size()
385 +priorityResults
[2].size()
386 +priorityResults
[3].size()< SEARCH_MAX
) {
387 priorityResults
[priorityLevel
].push_back(i
);
390 os
<< "Over " << SEARCH_MAX
391 << " results were found. The search has been stopped.";
392 el
.Push(EL_Info
, os
.str());
401 /* Combine results into one list, based upon priority. */
402 list
<int>::iterator lIt
;
404 for(lIt
=priorityResults
[i
].begin();
405 lIt
!=priorityResults
[i
].end(); lIt
++) {
406 results
.push_back(*lIt
);
411 printf("Search result count: %d\n", results
.size());
413 if(results
.size()>0) return true;
417 wstring
WDict::ResultToHTML(const wstring
& rawResult
) {
418 wstring token
, subToken
, jStr
, eStr
, htmlStr
;
419 list
<wstring
> tk
= StrTokenize(rawResult
, L
"\n");
420 size_t indexSlash
, indexNextSlash
, indexBreak
;
424 htmlStr
.append(L
"<p>");
426 indexSlash
= token
.find_first_of(L
'/');
427 if(indexSlash
==wstring::npos
) {
428 /* Fail-safe: just display the raw string */
429 htmlStr
.append(token
);
431 htmlStr
.append(L
"<b>Japanese:</b> <font size=\"6\">");
432 /*htmlStr.append(token.substr(0,indexSlash));*/
433 jStr
= token
.substr(0,indexSlash
);
435 indexBreak
= jStr
.find_first_of(L
';');
436 while(indexBreak
!=wstring::npos
) {
437 /*jStr[indexBreak]=L", ";*/
438 jStr
.replace(indexBreak
,1,L
", ",0,2);
439 indexBreak
= jStr
.find_first_of(L
';');
442 htmlStr
.append(jStr
);
443 htmlStr
.append(L
"</font><br>");
445 htmlStr
.append(L
"<b>English:</b> ");
447 while(indexSlash
!=wstring::npos
) {
448 indexNextSlash
= token
.find_first_of(L
'/', indexSlash
+1);
449 if(indexNextSlash
==wstring::npos
)
450 subToken
= token
.substr(indexSlash
+1);
452 subToken
= token
.substr(indexSlash
+1,
453 indexNextSlash
-1 - indexSlash
);
454 if(subToken
.length()>0) {
457 eStr
.append(subToken
);
459 indexSlash
= indexNextSlash
;
461 htmlStr
.append(eStr
);
463 htmlStr
.append(L
"</p>");
469 void WDict::GetEnglish(const string
& edictStr
, vector
<string
>& dest
) {
470 char *tokenizedString
= new char[edictStr
.length()+1];
473 strcpy(tokenizedString
, edictStr
.c_str());
474 token
= strtok(tokenizedString
, "/");
475 /* Skip to the second token, since the first is just the Japanese readings */
477 token
= strtok(NULL
, "/");
481 if(strlen(token
)>0) dest
.push_back(token
);
482 token
= strtok(NULL
, "/");
485 delete[] tokenizedString
;
488 void WDict::GetJapanese(const string
& edictStr
, vector
<string
>& dest
) {
489 /* Grab the portion of the string relevant for Japanese readings */
490 size_t indexFinal
= edictStr
.find_first_of('/');
491 if(indexFinal
==string::npos
) indexFinal
= edictStr
.length();
492 string jStr
= edictStr
.substr(0, indexFinal
);
495 /* The data is too complex for a simple tokenization because strings within
496 parentheses may contain characters normally used for breaking up the
497 tokens. So, the logic here is a little more complex. */
498 size_t index
, indexBreak
, indexParen
, indexStart
=0;
499 size_t len
=jStr
.length();
502 while(indexStart
<len
) {
504 indexBreak
= jStr
.find_first_of(";[] ", index
);
505 indexParen
= jStr
.find_first_of('(', index
);
507 /* Valid String Breaks */
508 /* If no parentheses are found, then indexBreak indicates our
510 if(indexParen
==string::npos
) break;
511 /* If parentheses ARE found, then we want to process them...
512 UNLESS a break char is found before the parenthesis. */
513 if(indexBreak
!=string::npos
&& indexBreak
<indexParen
) break;
515 /* Skip the parentheses and set index equal to the index following
516 the ')' character. */
517 if(!GetIndexAfterParens(jStr
, indexParen
, index
)) {
518 indexBreak
= string::npos
;
523 if(indexBreak
==string::npos
) {
524 temp
= jStr
.substr(indexStart
);
526 temp
= jStr
.substr(indexStart
, indexBreak
-indexStart
);
528 if(temp
.length()>0) dest
.push_back(temp
);
530 /* Return if either indexBreak or index == string::npos.
532 This happens either if indexStart == string::npos, or if
533 parsing parentheses and we can't find a closing parenthesis.
534 indexBreak==string::npos:
535 This happens if the substring continues to the end of the source
537 if(index
==string::npos
538 || indexBreak
==string::npos
) {
542 /* Iterate relevant vars for next iteration */
543 indexStart
= indexBreak
+1;
548 string
WDict::GetEdictString(int i
) const { return edictData
[i
]; }
550 bool WDict::MainDataLoaded() const {
551 if(edictData
.size()>0) return true;