Const conversions for edict and kanjidic objects. Removal of obsolete dictionary...
[jben.git] / edict.cpp
blobe28f5ebac828471a7d74327074bbfee9cabff39d
1 /*
2 Project: J-Ben
3 Author: Paul Goins
4 Website: http://www.vultaire.net/software/jben/
5 License: GNU General Public License (GPL) version 2
6 (http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt)
8 File: edict.cpp
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>
24 #include "edict.h"
25 #include "file_utils.h"
26 #include "wx/tokenzr.h"
27 #include "jutils.h"
28 #include "string_utils.h"
29 #include <set>
30 #include <list>
31 #include <algorithm>
32 #include <cstring>
33 #include <fstream>
34 #include <string>
35 using namespace std;
/* SEARCH_MAX caps how many matches a single search may collect. The value
   must stay high enough that ordinary "single page" lookups never reach it,
   yet low enough that an accidental query (for example, the single letter
   "e") does not keep the user waiting for a minute or so.

   The most commonly used kanji in EDICT2 appears to be 人, with a count of
   1889, so the cutoff sits just above that figure, at 2000. */
#define SEARCH_MAX 2000
46 Edict *Edict::LoadEdict(const char *filename, int& returnCode) {
47 Edict *e=NULL;
48 char *rawData = NULL;
49 unsigned int size;
51 ifstream ifile(filename, ios::ate); /* "at end" to get our file size */
52 if(ifile) {
53 size = ifile.tellg();
54 ifile.seekg(0);
55 rawData = new char[size+1];
56 rawData[size] = '\0';
57 ifile.read(rawData, size);
58 #ifdef DEBUG
59 if(strlen(rawData)!=size)
60 fprintf(stderr,
61 "WARNING: edict file size: %d, read-in string: %d\n",
62 strlen(rawData),
63 size);
64 #endif
66 /* Create the kanjidic object with our string data. */
67 e = new Edict(rawData);
69 returnCode = ED_SUCCESS;
71 else
72 returnCode = ED_FAILURE;
74 if(rawData) delete[] rawData;
75 return e;
78 /* Default constructor for Edict. Takes a wxString containing the contents of
79 an EDICT- or EDICT2-formatted dictionary, and adds its contents to an
80 internal data struct. This function also indexes the data, although ideally
81 the indexing functionality should be externalized so it may be called later,
82 like if another dictionary is added into the same Edict object at a later
83 point. */
84 Edict::Edict(char *edictRawData) {
85 char *token;
86 wxString wxToken;
88 int vIndex = -1; /* edict vector index */
89 wxString sTemp;
91 /* Store raw EDICT data, plus store references by kanji/reading into ordered
92 set */
93 token = strtok(edictRawData, "\n");
94 while(token) {
95 if(strlen(token)>0) {
96 /* 0. Make wxString copy of the token */
97 UTF8ToWx(token, wxToken);
98 /* 1. Store full string in vector */
99 edictData.push_back(token);
100 vIndex++;
103 token = strtok(NULL, "\n");
104 } /* while has more tokens */
107 Edict::~Edict() {
108 /* Currently, nothing needs to be done here. */
111 /* This function walks through the string, watching the parentheses, and copying
112 only the portions which are outside parentheses. Nested parentheses are
113 handled. */
114 wxString Edict::StripParenFields(const wxString& src) {
115 wxString result;
116 int parenCount = 0;
117 size_t length, index, lastIndex, startValid;
119 startValid = 0;
120 length = src.length();
121 index = src.find_first_of(_T("()"));
122 while(index != wxString::npos) {
123 if(src[index]==_T('(')) {
124 parenCount++;
125 if(parenCount==1) {
126 /* Append the valid string up until parens were entered. */
127 result.append(src.substr(startValid, index - startValid));
129 } else {
130 parenCount--;
131 if(parenCount==0) {
132 /* Parens have been exited. Reset our valid index. */
133 startValid = index+1;
135 if(parenCount<0) parenCount=0; /* We'll skip extra )'s */
137 lastIndex = index;
138 index = src.find_first_of(_T("()"), lastIndex+1);
140 if(parenCount>0) {
141 #ifdef DEBUG
142 fprintf(stderr, "WARNING: %s:%d, StripParenFields: Unclosed '(' detected.\n\tString: %ls\n",
143 __FILE__, __LINE__, src.c_str());
144 #endif
145 } else {
146 /* Append any remainder of the original string */
147 if(startValid!=wxString::npos && startValid < length) {
148 result.append(src.substr(startValid));
152 return result;
155 bool Edict::Search(const wxString& query, list<int>& results,
156 unsigned int searchType) const {
157 list<int> priorityResults[4];
158 bool englishSearch;
159 bool isFurigana; /* Not sure this is necessary - currently set but not
160 used. May be a performance accelerator at cost of
161 code complexity. */
162 int priorityExact, priorityBeginsWith, priorityEndsWith, priorityOther;
163 vector<string>::const_iterator vIt;
165 if(query.length()==0) {
166 #ifdef DEBUG
167 printf("[%s:%d] Empty string passed into Edict::Search. (Not a problem!)\n", __FILE__, __LINE__);
168 #endif
169 return false;
172 /* Get our search priorities set up */
173 int i;
174 unsigned int uTemp;
176 /* Default priority is -1, "not used" */
177 /* Lowest priority is 0, and highest will be 3. */
178 /* HOWEVER, searchType is sorted as 0:7=high priority and 24:31=low. */
179 priorityExact = priorityBeginsWith = priorityEndsWith = priorityOther = -1;
180 for(i=0;i<4;i++) {
181 uTemp = (searchType >> ((3-i)*8)) & 0xFF;
182 if(uTemp == EDS_EXACT) priorityExact = i;
183 else if(uTemp == EDS_BEGIN) priorityBeginsWith = i;
184 else if(uTemp == EDS_END) priorityEndsWith = i;
185 else if(uTemp == EDS_ANY) priorityOther = i;
186 else if(uTemp == 0) { /* Do nothing; no preferred search method for
187 this level */ }
188 else {
189 #ifdef DEBUG
190 fprintf(stderr, "Unknown search type for priority level %d: %X\n", i+1, uTemp);
191 #endif
195 /* Store first char. This determines whether we're doing an E-J or J-E
196 search. */
197 wxChar firstChar = query[0];
198 /* Using a very, very simple check: is it just a 7-bit char? */
199 englishSearch = ( ((unsigned)firstChar) <= 0x7F );
200 if(!englishSearch) {
201 isFurigana=true;
202 for(wxString::const_iterator stringIt = query.begin();
203 stringIt!=query.end(); stringIt++) {
204 isFurigana = (IsFurigana(*stringIt));
205 if(!isFurigana) break;
209 /* Main search code begins below */
210 /* NOTE: I think this can be cleaned up. I don't think the vector for
211 entryData below is needed; a simple string should suffice and a loop can
212 be removed. I'll look at this later since I'm busy with something else
213 at the moment. */
215 vector<string> entryData; /* Stores the English/Japanese components of
216 an EDICT string. */
217 string utfQuery, lwrQuery, lwrData;
218 vector<string>::iterator vSubIt;
219 size_t indexSubstr, indexDataStart, indexDataEnd;
220 int priorityLevel;
221 char c;
223 WxToUTF8(query, utfQuery);
224 lwrQuery = StrToLower(utfQuery); /* For English searching, store a
225 lowercase query */
226 i = 0;
228 for(vIt=edictData.begin(); vIt!=edictData.end(); vIt++) {
229 priorityLevel = -1; /* -1 == not a match*/
230 if(englishSearch) {
231 GetEnglish(*vIt, entryData);
232 } else {
233 GetJapanese(*vIt, entryData);
236 for(vSubIt=entryData.begin(); vSubIt!=entryData.end(); vSubIt++) {
237 if(englishSearch) {
238 /* English searching requires 2 special conditions:
239 1. Case-insensitive searching (maybe optional, later)
240 2. Recognition of word bounds (so we don't match character
241 sequences inside of a word.) */
243 /* Convert target string to lower case */
244 lwrData = StrToLower(*vSubIt);
246 /* Find the first match that is bounded by non-alpha characters
247 or beginning/end of string. */
248 indexSubstr = lwrData.find(lwrQuery, 0);
249 while(indexSubstr!=string::npos) {
250 #ifdef DEBUG
251 printf("Checking possible match:\n"
252 "Query: [%s]\n"
253 "Data string: [%s]\n"
254 "Index of match: %d\n",
255 lwrQuery.c_str(), lwrData.c_str(), indexSubstr);
256 #endif
258 /* Check for beginning of data string or preceding
259 non-alpha char */
260 (indexSubstr==0 || !isalpha(lwrData[indexSubstr-1])) &&
261 /* Check for end of data string or following non-alpha
262 char */
263 (indexSubstr+lwrQuery.length() == lwrData.length() ||
264 !isalpha(lwrData[indexSubstr+lwrQuery.length()]))
265 ) break;
266 /* If the match didn't meet all the above criteria, try to
267 find the next one. */
268 #ifdef DEBUG
269 printf("Match not good. Displaying verbose data:\n");
270 if(indexSubstr==0)
271 printf("* Beginning of query matches beginning of data. (OK)\n");
272 else if(!isalpha(lwrData[indexSubstr-1]))
273 printf("* Preceding character '%c' is non-alpha. (OK)\n",
274 lwrData[indexSubstr-1]);
275 else
276 printf("* Start match is invalid. (FAIL)\n");
277 if(indexSubstr+lwrQuery.length() == lwrData.length())
278 printf("* End of query matches end of data. (OK)\n");
279 else if(!isalpha(lwrData[indexSubstr+lwrQuery.length()]))
280 printf("* Following character '%c' is non-alpha. (OK)\n",
281 lwrData[indexSubstr+lwrQuery.length()]);
282 else
283 printf("* End match is invalid. (FAIL)\n");
285 #endif
286 indexSubstr = lwrData.find(lwrQuery, indexSubstr+1);
288 } else {
289 indexSubstr = vSubIt->find(utfQuery, 0);
291 if(indexSubstr!=string::npos) {
292 /* A match was found.
293 Sort by type of match (exact, begin, end, other)
294 LOGIC:
295 - Search for a "begins with".
296 - If it matches, check for "exact" (string length will work).
297 - Check for an "ends with" (parens may be a prob??)
298 - Dump all others into "other" */
300 /* FIRST: We need to get our dictionary bounds. Check for an
301 opening parenthesis, and if present, skip past it. */
302 indexDataStart = 0;
303 c = (*vSubIt)[indexDataStart];
304 if(c == '(' || c == '{') {
305 /* Parens found. Loop until we reach the beginning of the
306 real data. */
307 while(1) {
308 /* Get first non-space char past the end parenthesis. */
309 if(c=='(') {
310 if(GetIndexAfterParens(*vSubIt, indexDataStart,
311 indexDataStart, '(', ')')) {
312 while(isspace((*vSubIt)[indexDataStart]))
313 indexDataStart++;
315 } else if(c=='{') {
316 if(GetIndexAfterParens(*vSubIt, indexDataStart,
317 indexDataStart, '{', '}')) {
318 while(isspace((*vSubIt)[indexDataStart]))
319 indexDataStart++;
321 } else break;
322 c = (*vSubIt)[indexDataStart];
326 /* Get the ending bound.
327 NOTE: Currently this is only done for Japanese entries.
328 English entries ending with ()'s will not omit them. This is
329 deliberate. */
330 indexDataEnd = vSubIt->length()-1;
331 if(!englishSearch) {
332 c = (*vSubIt)[indexDataEnd];
333 if(c == ')' || c == '}') {
334 /* Parens found. Loop until we reach the beginning of
335 the real data. */
336 while(1) {
337 /* Get first non-space char past the end
338 parenthesis. */
339 if(c==')') {
340 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
341 indexDataEnd, '(', ')')) {
342 while(isspace((*vSubIt)[indexDataEnd]))
343 indexDataEnd--;
345 } else if(c=='}') {
346 if(GetIndexBeforeParens(*vSubIt, indexDataEnd,
347 indexDataEnd, '{', '}')) {
348 while(isspace((*vSubIt)[indexDataEnd]))
349 indexDataEnd--;
351 } else break;
352 c = (*vSubIt)[indexDataEnd];
357 /* Now, we apply the logic we specified at the beginning of this
358 block. */
359 if(indexSubstr==indexDataStart) {
360 priorityLevel = max(priorityLevel, priorityBeginsWith);
361 if(utfQuery.length()==indexDataEnd+1 - indexDataStart) {
362 priorityLevel = max(priorityLevel, priorityExact);
363 } else {
365 } else if(indexSubstr == indexDataEnd+1 - utfQuery.length()) {
366 priorityLevel = max(priorityLevel, priorityEndsWith);
367 } else {
368 priorityLevel = max(priorityLevel, priorityOther);
372 /* Add to appropriate list */
373 if(priorityLevel>=0) {
374 if(priorityResults[0].size()
375 +priorityResults[1].size()
376 +priorityResults[2].size()
377 +priorityResults[3].size()< SEARCH_MAX) {
378 priorityResults[priorityLevel].push_back(i);
379 } else {
380 #ifdef DEBUG
381 printf("PANIC: SEARCH_MAX results reached!\n");
382 #endif
383 wxMessageBox(wxString::Format(_T("Over %d results were found. The search has been stopped."), SEARCH_MAX),
384 _T("Excessive search results"),
385 wxOK | wxICON_INFORMATION, NULL);
386 break;
390 entryData.clear();
391 i++;
394 /* Combine results into one list, based upon priority. */
395 list<int>::iterator lIt;
396 for(i=3;i>=0;i--) {
397 for(lIt=priorityResults[i].begin();
398 lIt!=priorityResults[i].end(); lIt++) {
399 results.push_back(*lIt);
403 #ifdef DEBUG
404 printf("Search result count: %d\n", results.size());
405 #endif
406 if(results.size()>0) return true;
407 return false;
410 wxString Edict::ResultToHTML(const wxString& rawResult) {
411 wxString token, subToken, jStr, eStr, htmlStr;
412 wxStringTokenizer tk(rawResult, _T("\n"));
413 size_t indexSlash, indexNextSlash, indexBreak;
414 while(tk.HasMoreTokens()) {
415 token = tk.GetNextToken();
416 htmlStr.append(_T("<p>"));
418 indexSlash = token.find_first_of(_T('/'));
419 if(indexSlash==wxString::npos) {
420 /* Fail-safe: just display the raw string */
421 htmlStr.append(token);
422 } else {
423 htmlStr.append(_T("<b>Japanese:</b> <font size=\"6\">"));
424 /*htmlStr.append(token.substr(0,indexSlash));*/
425 jStr = token.substr(0,indexSlash);
427 indexBreak = jStr.find_first_of(_T(';'));
428 while(indexBreak!=wxString::npos) {
429 /*jStr[indexBreak]=_T(", ");*/
430 jStr.replace(indexBreak,1,_T(", "),0,2);
431 indexBreak = jStr.find_first_of(_T(';'));
434 htmlStr.append(jStr);
435 htmlStr.append(_T("</font><br>"));
437 htmlStr.append(_T("<b>English:</b> "));
438 eStr.clear();
439 while(indexSlash!=wxString::npos) {
440 indexNextSlash = token.find_first_of(_T('/'), indexSlash+1);
441 if(indexNextSlash==wxString::npos)
442 subToken = token.substr(indexSlash+1);
443 else
444 subToken = token.substr(indexSlash+1,
445 indexNextSlash-1 - indexSlash);
446 if(subToken.length()>0) {
447 if(eStr.length()>0)
448 eStr.append(_T("; "));
449 eStr.append(subToken);
451 indexSlash = indexNextSlash;
453 htmlStr.append(eStr);
455 htmlStr.append(_T("</p>"));
458 return htmlStr;
461 void Edict::GetEnglish(const string& edictStr, vector<string>& dest) {
462 char *tokenizedString = new char[edictStr.length()+1];
463 char *token;
465 strcpy(tokenizedString, edictStr.c_str());
466 token = strtok(tokenizedString, "/");
467 /* Skip to the second token, since the first is just the Japanese readings */
468 if(token) {
469 token = strtok(NULL, "/");
472 while(token) {
473 if(strlen(token)>0) dest.push_back(token);
474 token = strtok(NULL, "/");
477 delete[] tokenizedString;
480 void Edict::GetJapanese(const string& edictStr, vector<string>& dest) {
481 /* Grab the portion of the string relevant for Japanese readings */
482 size_t indexFinal = edictStr.find_first_of('/');
483 if(indexFinal==string::npos) indexFinal = edictStr.length();
484 string jStr = edictStr.substr(0, indexFinal);
485 string temp;
487 /* The data is too complex for a simple tokenization because strings within
488 parentheses may contain characters normally used for breaking up the
489 tokens. So, the logic here is a little more complex. */
490 size_t index, indexBreak, indexParen, indexStart=0;
491 size_t len=jStr.length();
493 index = indexStart;
494 while(indexStart<len) {
495 while(true) {
496 indexBreak = jStr.find_first_of(";[] ", index);
497 indexParen = jStr.find_first_of('(', index);
499 /* Valid String Breaks */
500 /* If no parentheses are found, then indexBreak indicates our
501 bounds properly. */
502 if(indexParen==string::npos) break;
503 /* If parentheses ARE found, then we want to process them...
504 UNLESS a break char is found before the parenthesis. */
505 if(indexBreak!=string::npos && indexBreak<indexParen) break;
507 /* Skip the parentheses and set index equal to the index following
508 the ')' character. */
509 if(!GetIndexAfterParens(jStr, indexParen, index)) {
510 indexBreak = string::npos;
511 break;
515 if(indexBreak==string::npos) {
516 temp = jStr.substr(indexStart);
517 } else {
518 temp = jStr.substr(indexStart, indexBreak-indexStart);
520 if(temp.length()>0) dest.push_back(temp);
522 /* Return if either indexBreak or index == string::npos.
523 index==string::npos:
524 This happens either if indexStart == string::npos, or if
525 parsing parentheses and we can't find a closing parenthesis.
526 indexBreak==string::npos:
527 This happens if the substring continues to the end of the source
528 string. */
529 if(index==string::npos
530 || indexBreak==string::npos) {
531 return;
534 /* Iterate relevant vars for next iteration */
535 indexStart = indexBreak+1;
536 index = indexStart;
540 string Edict::GetEdictString(int i) const { return edictData[i]; }