4 Website: http://www.vultaire.net/software/jben/
5 License: GNU General Public License (GPL) version 2
6 (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>
25 #include "file_utils.h"
26 #include "wx/tokenzr.h"
28 #include "string_utils.h"
/* SEARCH_MAX is our hard-coded cutoff point for searches.  It should be high
   enough not to interfere with normal "single page" operation, but it should
   also prevent the user from doing something too stupid and having to wait a
   minute or so because they searched for the letter "e" by mistake.

   The most commonly used kanji in EDICT2 appears to be 人, with 1889
   occurrences.  Thus, let's make our panic breakoff point 2000 results. */
44 #define SEARCH_MAX 2000
46 Edict
*Edict::LoadEdict(const char *filename
, int& returnCode
) {
51 ifstream
ifile(filename
, ios::ate
); /* "at end" to get our file size */
55 rawData
= new char[size
+1];
57 ifile
.read(rawData
, size
);
59 if(strlen(rawData
)!=size
)
61 "WARNING: edict file size: %d, read-in string: %d\n",
/* Create the Edict object with our string data. */
67 e
= new Edict(rawData
);
69 returnCode
= ED_SUCCESS
;
72 returnCode
= ED_FAILURE
;
74 if(rawData
) delete[] rawData
;
/* Constructor for Edict.  Takes a character buffer containing the contents of
   an EDICT- or EDICT2-formatted dictionary, and adds its contents to an
   internal data struct.  The buffer is split into lines in place with strtok,
   so it is modified by this call.  This function also indexes the data,
   although ideally the indexing functionality should be externalized so it may
   be called later, like if another dictionary is added into the same Edict
   object at a later time. */
84 Edict::Edict(char *edictRawData
) {
88 int vIndex
= -1; /* edict vector index */
91 /* Store raw EDICT data, plus store references by kanji/reading into ordered
93 token
= strtok(edictRawData
, "\n");
96 /* 0. Make wxString copy of the token */
97 UTF8ToWx(token
, wxToken
);
98 /* 1. Store full string in vector */
99 edictData
.push_back(token
);
103 token
= strtok(NULL
, "\n");
104 } /* while has more tokens */
108 /* Currently, nothing needs to be done here. */
/* This function walks through the string, watching the parentheses, and copying
   only the portions which are outside parentheses.  Nested parentheses are
   handled by keeping a count of the currently open parentheses; extra closing
   parentheses are skipped. */
114 wxString
Edict::StripParenFields(const wxString
& src
) {
117 size_t length
, index
, lastIndex
, startValid
;
120 length
= src
.length();
121 index
= src
.find_first_of(_T("()"));
122 while(index
!= wxString::npos
) {
123 if(src
[index
]==_T('(')) {
126 /* Append the valid string up until parens were entered. */
127 result
.append(src
.substr(startValid
, index
- startValid
));
132 /* Parens have been exited. Reset our valid index. */
133 startValid
= index
+1;
135 if(parenCount
<0) parenCount
=0; /* We'll skip extra )'s */
138 index
= src
.find_first_of(_T("()"), lastIndex
+1);
142 fprintf(stderr
, "WARNING: %s:%d, StripParenFields: Unclosed '(' detected.\n\tString: %ls\n",
143 __FILE__
, __LINE__
, src
.c_str());
146 /* Append any remainder of the original string */
147 if(startValid
!=wxString::npos
&& startValid
< length
) {
148 result
.append(src
.substr(startValid
));
155 bool Edict::Search(const wxString
& query
, list
<int>& results
,
156 unsigned int searchType
) const {
157 list
<int> priorityResults
[4];
159 bool isFurigana
; /* Not sure this is necessary - currently set but not
160 used. May be a performance accelerator at cost of
162 int priorityExact
, priorityBeginsWith
, priorityEndsWith
, priorityOther
;
163 vector
<string
>::const_iterator vIt
;
165 if(query
.length()==0) {
167 printf("[%s:%d] Empty string passed into Edict::Search. (Not a problem!)\n", __FILE__
, __LINE__
);
172 /* Get our search priorities set up */
176 /* Default priority is -1, "not used" */
177 /* Lowest priority is 0, and highest will be 3. */
178 /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
179 priorityExact
= priorityBeginsWith
= priorityEndsWith
= priorityOther
= -1;
181 uTemp
= (searchType
>> ((3-i
)*8)) & 0xFF;
182 if(uTemp
== EDS_EXACT
) priorityExact
= i
;
183 else if(uTemp
== EDS_BEGIN
) priorityBeginsWith
= i
;
184 else if(uTemp
== EDS_END
) priorityEndsWith
= i
;
185 else if(uTemp
== EDS_ANY
) priorityOther
= i
;
186 else if(uTemp
== 0) { /* Do nothing; no preferred search method for
190 fprintf(stderr
, "Unknown search type for priority level %d: %X\n", i
+1, uTemp
);
195 /* Store first char. This determines whether we're doing an E-J or J-E
197 wxChar firstChar
= query
[0];
198 /* Using a very, very simple check: is it just a 7-bit char? */
199 englishSearch
= ( ((unsigned)firstChar
) <= 0x7F );
202 for(wxString::const_iterator stringIt
= query
.begin();
203 stringIt
!=query
.end(); stringIt
++) {
204 isFurigana
= (IsFurigana(*stringIt
));
205 if(!isFurigana
) break;
209 /* Main search code begins below */
210 /* NOTE: I think this can be cleaned up. I don't think the vector for
211 entryData below is needed; a simple string should suffice and a loop can
212 be removed. I'll look at this later since I'm busy with something else
215 vector
<string
> entryData
; /* Stores the English/Japanese components of
217 string utfQuery
, lwrQuery
, lwrData
;
218 vector
<string
>::iterator vSubIt
;
219 size_t indexSubstr
, indexDataStart
, indexDataEnd
;
223 WxToUTF8(query
, utfQuery
);
224 lwrQuery
= StrToLower(utfQuery
); /* For English searching, store a
228 for(vIt
=edictData
.begin(); vIt
!=edictData
.end(); vIt
++) {
229 priorityLevel
= -1; /* -1 == not a match*/
231 GetEnglish(*vIt
, entryData
);
233 GetJapanese(*vIt
, entryData
);
236 for(vSubIt
=entryData
.begin(); vSubIt
!=entryData
.end(); vSubIt
++) {
238 /* English searching requires 2 special conditions:
239 1. Case-insensitive searching (maybe optional, later)
240 2. Recognition of word bounds (so we don't match character
241 sequences inside of a word.) */
243 /* Convert target string to lower case */
244 lwrData
= StrToLower(*vSubIt
);
246 /* Find the first match that is bounded by non-alpha characters
247 or beginning/end of string. */
248 indexSubstr
= lwrData
.find(lwrQuery
, 0);
249 while(indexSubstr
!=string::npos
) {
251 printf("Checking possible match:\n"
253 "Data string: [%s]\n"
254 "Index of match: %d\n",
255 lwrQuery
.c_str(), lwrData
.c_str(), indexSubstr
);
258 /* Check for beginning of data string or preceding
260 (indexSubstr
==0 || !isalpha(lwrData
[indexSubstr
-1])) &&
261 /* Check for end of data string or following non-alpha
263 (indexSubstr
+lwrQuery
.length() == lwrData
.length() ||
264 !isalpha(lwrData
[indexSubstr
+lwrQuery
.length()]))
266 /* If the match didn't meet all the above criteria, try to
267 find the next one. */
269 printf("Match not good. Displaying verbose data:\n");
271 printf("* Beginning of query matches beginning of data. (OK)\n");
272 else if(!isalpha(lwrData
[indexSubstr
-1]))
273 printf("* Preceding character '%c' is non-alpha. (OK)\n",
274 lwrData
[indexSubstr
-1]);
276 printf("* Start match is invalid. (FAIL)\n");
277 if(indexSubstr
+lwrQuery
.length() == lwrData
.length())
278 printf("* End of query matches end of data. (OK)\n");
279 else if(!isalpha(lwrData
[indexSubstr
+lwrQuery
.length()]))
280 printf("* Following character '%c' is non-alpha. (OK)\n",
281 lwrData
[indexSubstr
+lwrQuery
.length()]);
283 printf("* End match is invalid. (FAIL)\n");
286 indexSubstr
= lwrData
.find(lwrQuery
, indexSubstr
+1);
289 indexSubstr
= vSubIt
->find(utfQuery
, 0);
291 if(indexSubstr
!=string::npos
) {
292 /* A match was found.
293 Sort by type of match (exact, begin, end, other)
295 - Search for a "begins with".
296 - If it matches, check for "exact" (string length will work).
297 - Check for an "ends with" (parens may be a prob??)
298 - Dump all others into "other" */
300 /* FIRST: We need to get our dictionary bounds. Check for an
301 opening parenthesis, and if present, skip past it. */
303 c
= (*vSubIt
)[indexDataStart
];
304 if(c
== '(' || c
== '{') {
305 /* Parens found. Loop until we reach the beginning of the
308 /* Get first non-space char past the end parenthesis. */
310 if(GetIndexAfterParens(*vSubIt
, indexDataStart
,
311 indexDataStart
, '(', ')')) {
312 while(isspace((*vSubIt
)[indexDataStart
]))
316 if(GetIndexAfterParens(*vSubIt
, indexDataStart
,
317 indexDataStart
, '{', '}')) {
318 while(isspace((*vSubIt
)[indexDataStart
]))
322 c
= (*vSubIt
)[indexDataStart
];
326 /* Get the ending bound.
327 NOTE: Currently this is only done for Japanese entries.
328 English entries ending with ()'s will not omit them. This is
330 indexDataEnd
= vSubIt
->length()-1;
332 c
= (*vSubIt
)[indexDataEnd
];
333 if(c
== ')' || c
== '}') {
334 /* Parens found. Loop until we reach the beginning of
337 /* Get first non-space char past the end
340 if(GetIndexBeforeParens(*vSubIt
, indexDataEnd
,
341 indexDataEnd
, '(', ')')) {
342 while(isspace((*vSubIt
)[indexDataEnd
]))
346 if(GetIndexBeforeParens(*vSubIt
, indexDataEnd
,
347 indexDataEnd
, '{', '}')) {
348 while(isspace((*vSubIt
)[indexDataEnd
]))
352 c
= (*vSubIt
)[indexDataEnd
];
357 /* Now, we apply the logic we specified at the beginning of this
359 if(indexSubstr
==indexDataStart
) {
360 priorityLevel
= max(priorityLevel
, priorityBeginsWith
);
361 if(utfQuery
.length()==indexDataEnd
+1 - indexDataStart
) {
362 priorityLevel
= max(priorityLevel
, priorityExact
);
365 } else if(indexSubstr
== indexDataEnd
+1 - utfQuery
.length()) {
366 priorityLevel
= max(priorityLevel
, priorityEndsWith
);
368 priorityLevel
= max(priorityLevel
, priorityOther
);
372 /* Add to appropriate list */
373 if(priorityLevel
>=0) {
374 if(priorityResults
[0].size()
375 +priorityResults
[1].size()
376 +priorityResults
[2].size()
377 +priorityResults
[3].size()< SEARCH_MAX
) {
378 priorityResults
[priorityLevel
].push_back(i
);
381 printf("PANIC: SEARCH_MAX results reached!\n");
383 wxMessageBox(wxString::Format(_T("Over %d results were found. The search has been stopped."), SEARCH_MAX
),
384 _T("Excessive search results"),
385 wxOK
| wxICON_INFORMATION
, NULL
);
394 /* Combine results into one list, based upon priority. */
395 list
<int>::iterator lIt
;
397 for(lIt
=priorityResults
[i
].begin();
398 lIt
!=priorityResults
[i
].end(); lIt
++) {
399 results
.push_back(*lIt
);
404 printf("Search result count: %d\n", results
.size());
406 if(results
.size()>0) return true;
410 wxString
Edict::ResultToHTML(const wxString
& rawResult
) {
411 wxString token
, subToken
, jStr
, eStr
, htmlStr
;
412 wxStringTokenizer
tk(rawResult
, _T("\n"));
413 size_t indexSlash
, indexNextSlash
, indexBreak
;
414 while(tk
.HasMoreTokens()) {
415 token
= tk
.GetNextToken();
416 htmlStr
.append(_T("<p>"));
418 indexSlash
= token
.find_first_of(_T('/'));
419 if(indexSlash
==wxString::npos
) {
420 /* Fail-safe: just display the raw string */
421 htmlStr
.append(token
);
423 htmlStr
.append(_T("<b>Japanese:</b> <font size=\"6\">"));
424 /*htmlStr.append(token.substr(0,indexSlash));*/
425 jStr
= token
.substr(0,indexSlash
);
427 indexBreak
= jStr
.find_first_of(_T(';'));
428 while(indexBreak
!=wxString::npos
) {
429 /*jStr[indexBreak]=_T(", ");*/
430 jStr
.replace(indexBreak
,1,_T(", "),0,2);
431 indexBreak
= jStr
.find_first_of(_T(';'));
434 htmlStr
.append(jStr
);
435 htmlStr
.append(_T("</font><br>"));
437 htmlStr
.append(_T("<b>English:</b> "));
439 while(indexSlash
!=wxString::npos
) {
440 indexNextSlash
= token
.find_first_of(_T('/'), indexSlash
+1);
441 if(indexNextSlash
==wxString::npos
)
442 subToken
= token
.substr(indexSlash
+1);
444 subToken
= token
.substr(indexSlash
+1,
445 indexNextSlash
-1 - indexSlash
);
446 if(subToken
.length()>0) {
448 eStr
.append(_T("; "));
449 eStr
.append(subToken
);
451 indexSlash
= indexNextSlash
;
453 htmlStr
.append(eStr
);
455 htmlStr
.append(_T("</p>"));
461 void Edict::GetEnglish(const string
& edictStr
, vector
<string
>& dest
) {
462 char *tokenizedString
= new char[edictStr
.length()+1];
465 strcpy(tokenizedString
, edictStr
.c_str());
466 token
= strtok(tokenizedString
, "/");
467 /* Skip to the second token, since the first is just the Japanese readings */
469 token
= strtok(NULL
, "/");
473 if(strlen(token
)>0) dest
.push_back(token
);
474 token
= strtok(NULL
, "/");
477 delete[] tokenizedString
;
480 void Edict::GetJapanese(const string
& edictStr
, vector
<string
>& dest
) {
481 /* Grab the portion of the string relevant for Japanese readings */
482 size_t indexFinal
= edictStr
.find_first_of('/');
483 if(indexFinal
==string::npos
) indexFinal
= edictStr
.length();
484 string jStr
= edictStr
.substr(0, indexFinal
);
487 /* The data is too complex for a simple tokenization because strings within
488 parentheses may contain characters normally used for breaking up the
489 tokens. So, the logic here is a little more complex. */
490 size_t index
, indexBreak
, indexParen
, indexStart
=0;
491 size_t len
=jStr
.length();
494 while(indexStart
<len
) {
496 indexBreak
= jStr
.find_first_of(";[] ", index
);
497 indexParen
= jStr
.find_first_of('(', index
);
499 /* Valid String Breaks */
500 /* If no parentheses are found, then indexBreak indicates our
502 if(indexParen
==string::npos
) break;
503 /* If parentheses ARE found, then we want to process them...
504 UNLESS a break char is found before the parenthesis. */
505 if(indexBreak
!=string::npos
&& indexBreak
<indexParen
) break;
507 /* Skip the parentheses and set index equal to the index following
508 the ')' character. */
509 if(!GetIndexAfterParens(jStr
, indexParen
, index
)) {
510 indexBreak
= string::npos
;
515 if(indexBreak
==string::npos
) {
516 temp
= jStr
.substr(indexStart
);
518 temp
= jStr
.substr(indexStart
, indexBreak
-indexStart
);
520 if(temp
.length()>0) dest
.push_back(temp
);
522 /* Return if either indexBreak or index == string::npos.
524 This happens either if indexStart == string::npos, or if
525 parsing parentheses and we can't find a closing parenthesis.
526 indexBreak==string::npos:
527 This happens if the substring continues to the end of the source
529 if(index
==string::npos
530 || indexBreak
==string::npos
) {
534 /* Iterate relevant vars for next iteration */
535 indexStart
= indexBreak
+1;
540 string
Edict::GetEdictString(int i
) const { return edictData
[i
]; }