moved kdeaccessibility kdeaddons kdeadmin kdeartwork kdebindings kdeedu kdegames...
[kdeedu.git] / kiten / dict.cpp
blob6be8452685397f7df9d23ae094c113bcddc440e0
1 /**
2 This file is part of Kiten, a KDE Japanese Reference Tool...
3 Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 USA
19 **/
21 #include <kdebug.h>
22 #include <klocale.h>
23 #include <kmessagebox.h>
24 #include <kprocess.h>
25 #include <kstandarddirs.h>
27 #include <qfileinfo.h>
28 #include <qregexp.h>
29 #include <qtextcodec.h>
31 #include "dict.h"
33 #include <iostream>
34 #include <cassert>
35 #include <sys/mman.h>
36 #include <stdio.h>
38 namespace
40 void msgerr(const QString &msg, const QString &dict = QString::null)
42 QString output = msg;
43 if (!dict.isNull())
44 output = msg.arg(dict);
45 KMessageBox::error(0, output);
49 using namespace Dict;
51 TextType Dict::textType(const QString &text)
53 ushort first = text.at(0).unicode();
55 if (first < 0x3000)
56 return Text_Latin;
57 // else if (first < 0x3040) // CJK Symbols and Punctuation
58 // return Text_Kana;
59 // else if (first < 0x30A0) // Hiragana
60 // return Text_Kana;
61 else if (first < 0x3100) // Katakana
62 return Text_Kana;
64 else /*if (first >= 0x3400 && first < 0x4DC0)*/ // CJK Unified Ideographs Extension A
65 return Text_Kanji;
68 File::File(QString path, QString n)
69 : myName(n)
70 , dictFile(path)
71 , dictPtr((const unsigned char *)MAP_FAILED)
72 , indexFile(KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + QFileInfo(path).baseName() + ".xjdx")
73 , indexPtr((const uint32_t *)MAP_FAILED)
74 , valid(false)
76 bool forceUpdate = false;
78 bool indexFileExists = indexFile.exists();
79 if (indexFileExists)
81 // ### change this if need be!!
82 const int indexFileVersion = 14;
84 // this up-to-date code from xjdservcomm.c
85 // we need to check if the index needs to
86 // remade
88 int dictionaryLength;
89 QFile dictionary(path);
90 dictionaryLength = dictionary.size();
91 dictionaryLength++;
92 //kdDebug() << "dictionaryLength = " << dictionaryLength << endl;
94 int32_t testWord[1];
95 fread(&testWord[0], sizeof(int32_t), 1, fopen(indexFile.name().latin1(), "rb"));
97 //kdDebug() << "testWord[0] = " << testWord[0] << endl;
99 if (testWord[0] != (dictionaryLength + indexFileVersion))
100 forceUpdate = true;
103 if (!indexFileExists || forceUpdate)
105 //kdDebug() << "creating " << indexFile.name() << endl;
106 // find the index generator executable
107 KProcess proc;
108 proc << KStandardDirs::findExe("kitengen") << path << indexFile.name();
109 // TODO: put up a status dialog and event loop instead of blocking
110 proc.start(KProcess::Block, KProcess::NoCommunication);
113 if (!dictFile.open(IO_ReadOnly))
115 msgerr(i18n("Could not open dictionary %1."), path);
116 return;
119 dictPtr = (const unsigned char *)mmap(0, dictFile.size(), PROT_READ, MAP_SHARED, dictFile.handle(), 0);
120 if (dictPtr == (unsigned char*) MAP_FAILED)
122 msgerr(i18n("Memory error when loading dictionary %1."), path);
123 return;
126 if (!indexFile.open(IO_ReadOnly))
128 msgerr(i18n("Could not open index for dictionary %1."), path);
129 return;
132 indexPtr = (const uint32_t*)mmap(0, indexFile.size(), PROT_READ, MAP_SHARED, indexFile.handle(), 0);
133 if (indexPtr == (uint32_t*) MAP_FAILED)
135 msgerr(i18n("Memory error when loading dictionary %1's index file."), path);
136 return;
139 valid = true;
142 File::~File(void)
144 if (dictPtr != (unsigned char*) MAP_FAILED)
145 munmap((char *)dictPtr, dictFile.size());
146 dictFile.close();
148 if (indexPtr != (uint32_t*) MAP_FAILED)
149 munmap((char *)indexPtr, indexFile.size());
150 indexFile.close();
153 QString File::name(void)
155 return myName;
158 Array<const unsigned char> File::dict(void)
160 assert(valid);
161 return Array<const unsigned char>(dictPtr, dictFile.size());
164 Array<const uint32_t> File::index(void)
166 assert(valid);
167 return Array<const uint32_t>(indexPtr, indexFile.size());
170 int File::dictLength(void)
172 return dictFile.size();
175 int File::indexLength(void)
177 return indexFile.size();
180 bool File::isValid(void)
182 return valid;
185 // returns specified character from a dictionary
186 unsigned char File::lookup(unsigned i, int offset)
188 uint32_t pos = indexPtr[i] + offset - 1;
189 if (pos > dictFile.size()) return 10;
190 return dictPtr[pos];
193 QCString File::lookup(unsigned i)
195 uint32_t start = indexPtr[i] - 1;
196 uint32_t pos = start;
197 const unsigned size = dictFile.size();
198 // get the whole word
199 while(pos <= size && dictPtr[pos] != 0 && dictPtr[pos] != 0x0a)
200 ++pos;
201 // put the word in the QCString
202 QCString retval((const char *)(dictPtr + start), pos - start);
203 // tack on a null
204 char null = 0;
205 retval.append(&null);
206 // and away we go
207 return retval;
210 // And last, Index itself is the API presented to the rest of Kiten
211 Index::Index()
212 : QObject()
214 dictFiles.setAutoDelete(true);
215 kanjiDictFiles.setAutoDelete(true);
218 Index::~Index()
222 void Index::setDictList(const QStringList &list, const QStringList &names)
224 loadDictList(dictFiles, list, names);
227 void Index::setKanjiDictList(const QStringList &list, const QStringList &names)
229 loadDictList(kanjiDictFiles, list, names);
232 void Index::loadDictList(QPtrList<File> &fileList, const QStringList &dictList, const QStringList &dictNameList)
234 fileList.clear();
236 // check if we have a dict
237 if (dictList.size() < 1)
239 msgerr(i18n("No dictionaries in list!"));
240 return;
243 QStringList::ConstIterator it;
244 QStringList::ConstIterator dictIt;
245 for (it = dictList.begin(), dictIt = dictNameList.begin(); it != dictList.end(); ++it, ++dictIt)
247 File *f = new File(*it, *dictIt);
248 // our ugly substitute for exceptions
249 if (f->isValid())
250 fileList.append(f);
251 else
252 delete f;
256 QStringList Index::doSearch(File &file, const QString &text)
258 // Do a binary search to find an entry that matches text
259 QTextCodec &codec = *QTextCodec::codecForName("eucJP");
260 QCString eucString = codec.fromUnicode(text);
262 QString prevResult;
264 Array<const uint32_t> index = file.index();
265 Array<const unsigned char> dict = file.dict();
266 int lo = 0;
267 int hi = index.size() - 1;
268 unsigned cur;
269 int comp = 0;
273 cur = (hi + lo) / 2;
274 comp = stringCompare(file, cur, eucString);
276 if (comp < 0)
277 hi = cur - 1;
278 else if (comp > 0)
279 lo = cur + 1;
281 while(hi >= lo && comp != 0 && !(hi == 0 && lo == 0));
282 QStringList results;
283 // A match?
284 if (comp == 0)
286 // wheel back to make sure we get the first matching entry
287 while(cur - 1 && 0 == stringCompare(file, cur - 1, eucString))
288 --cur;
290 // output every matching entry
291 while(cur < index.size() && 0 == stringCompare(file, cur, eucString))
293 // because the index doesn't point
294 // to the start of the line, find the
295 // start of the line:
296 int i = 0;
297 while(file.lookup(cur, i - 1) != 0x0a) --i;
299 QByteArray bytes(0);
300 while(file.lookup(cur, i) != 0x0a) // get to end of our line
302 const char eucchar = file.lookup(cur, i);
303 bytes.resize(bytes.size() + 1);
304 bytes[bytes.size() - 1] = eucchar;
305 ++i;
308 QString result = codec.toUnicode(bytes) + QString("\n");
309 if (prevResult != result)
311 results.append(result);
312 prevResult = result;
315 ++cur;
319 // return all the entries found, or null if no match
320 return results;
323 SearchResult Index::scanResults(QRegExp regexp, QStringList results, bool common)
325 unsigned int num = 0;
326 unsigned int fullNum = 0;
328 SearchResult ret;
330 //ret.results = results; //not here..
332 for (QStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
334 if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
336 ret.list.append(parse(*itr));
337 continue;
340 int found = regexp.search(*itr);
342 if (found >= 0)
344 ++fullNum;
345 if ((*itr).find(QString("(P)")) >= 0 || !common)
347 // we append HERE, so we get the exact
348 // results we have in ret.list
350 ret.results.append(*itr);
351 ret.list.append(parse(*itr));
352 ++num;
357 ret.count = num;
358 ret.outOf = fullNum;
359 ret.common = common;
360 return ret;
363 SearchResult Index::search(QRegExp regexp, const QString &text, bool common)
365 QStringList results;
366 for (QPtrListIterator<File> file(dictFiles); *file; ++file)
368 results.append(QString("DICT ") + (*file)->name());
370 results += doSearch(**file, text);
373 SearchResult res = scanResults(regexp, results, common);
374 res.text = text;
375 return res;
378 SearchResult Index::scanKanjiResults(QRegExp regexp, QStringList results, bool common)
380 unsigned int num = 0;
381 unsigned int fullNum = 0;
382 const bool jmyCount = false; // don't count JinMeiYou as common
383 SearchResult ret;
384 ret.results = results;
386 for (QStringList::Iterator itr = results.begin(); itr != results.end(); ++itr)
388 if ((*itr).left(5) == "DICT " || (*itr).left(8) == "HEADING ")
390 ret.list.append(kanjiParse(*itr));
391 continue;
394 int found = regexp.search(*itr);
396 if (found >= 0)
398 ++fullNum;
399 // common entries have G[1-8] (jouyou)
400 QRegExp comregexp(jmyCount ? "G[1-9]" : "G[1-8]");
401 if ((*itr).find(comregexp) >= 0 || !common)
403 ret.list.append(kanjiParse(*itr));
404 ++num;
409 ret.count = num;
410 ret.outOf = fullNum;
411 ret.common = common;
412 return ret;
415 SearchResult Index::searchKanji(QRegExp regexp, const QString &text, bool common)
417 QStringList results;
418 for (QPtrListIterator<File> file(kanjiDictFiles); *file; ++file)
420 results.append(QString("DICT ") + (*file)->name());
422 results += doSearch(**file, text);
425 SearchResult res = scanKanjiResults(regexp, results, common);
426 res.text = text;
427 return res;
430 SearchResult Index::searchPrevious(QRegExp regexp, const QString &text, SearchResult list, bool common)
432 SearchResult res;
434 if (firstEntry(list).extendedKanjiInfo())
435 res = scanKanjiResults(regexp, list.results, common);
436 else
437 res = scanResults(regexp, list.results, common);
439 res.text = text;
440 return res;
443 QRegExp Dict::Index::createRegExp(SearchType type, const QString &text, DictionaryType dictionaryType, bool caseSensitive)
445 QString regExp;
446 switch (type)
448 case Search_Beginning:
449 switch (textType(text))
451 case Dict::Text_Latin:
452 regExp = "\\W%1";
453 break;
455 case Dict::Text_Kana:
456 if (dictionaryType == Kanjidict)
457 regExp = "\\W%1";
458 else // edict
459 regExp = "\\[%1";
460 break;
462 case Dict::Text_Kanji:
463 regExp = "^%1";
465 break;
467 case Search_FullWord:
468 switch (textType(text))
470 case Dict::Text_Latin:
471 regExp = "\\W%1\\W";
472 break;
474 case Dict::Text_Kana:
475 if (dictionaryType == Kanjidict)
476 regExp = " %1 ";
477 else // edict
478 regExp = "\\[%1\\]";
479 break;
481 case Dict::Text_Kanji:
482 regExp = "^%1\\W";
484 break;
486 case Search_Anywhere:
487 regExp = "%1";
490 return QRegExp(regExp.arg(text), caseSensitive);
493 int Index::stringCompare(File &file, int index, QCString str)
495 return eucStringCompare(file.lookup(index), str);
/**
 * Compare two byte strings over the length of the shorter one (a
 * strnicmp-like prefix comparison).
 *
 * Before comparing, each byte pair is normalised:
 *   - at even byte positions 0xA5 is treated as 0xA4, which makes EUC
 *     katakana and hiragana lead bytes compare equal;
 *   - ASCII 'A'..'Z' is folded to lower case.
 *
 * NOTE(review): the kana folding is keyed on byte position parity, not
 * on real character boundaries, so it assumes two-byte characters
 * starting at offset 0.
 *
 * @return 0 when one string ends before a difference is found,
 *         otherwise (byte of @p str2) - (byte of @p str) at the first
 *         differing position, after normalisation
 */
int Dict::eucStringCompare(const char *str, const char *str2)
{
	const unsigned char *lhs = reinterpret_cast<const unsigned char *>(str);
	const unsigned char *rhs = reinterpret_cast<const unsigned char *>(str2);

	for (unsigned pos = 0; lhs[pos] != '\0' && rhs[pos] != '\0'; ++pos)
	{
		unsigned char a = lhs[pos];
		unsigned char b = rhs[pos];

		const bool evenByte = (pos % 2 == 0);
		if (evenByte)
		{
			// katakana row -> hiragana row
			if (a == 0xA5)
				a = 0xA4;
			if (b == 0xA5)
				b = 0xA4;
		}

		// fold upper-case ASCII
		if (a >= 'A' && a <= 'Z')
			a += 'a' - 'A';
		if (b >= 'A' && b <= 'Z')
			b += 'a' - 'A';

		if (a != b)
			return static_cast<int>(b) - static_cast<int>(a);
	}

	// ran off the end of the shorter string without a difference
	return 0;
}
/** @return true when the top bit of @p c is set, i.e. @p c is not 7-bit ASCII. */
bool Dict::isEUC(unsigned char c)
{
	return (c & 0x80) != 0;
}
533 Entry Dict::parse(const QString &raw)
535 unsigned int length = raw.length();
536 if (raw.left(5) == "DICT ")
537 return Entry(raw.right(length - 5));
538 if (raw.left(8) == "HEADING ")
539 return Entry(raw.right(length - 8), true);
541 QString reading;
542 QString kanji;
543 QStringList meanings;
544 QString curmeaning;
545 bool firstmeaning = true;
546 QCString parsemode("kanji");
548 unsigned int i;
549 for (i = 0; i < length; i++)
551 QChar ichar(raw.at(i));
553 if (ichar == '[')
555 parsemode = "reading";
557 else if (ichar == ']')
559 // do nothing
561 else if (ichar == '/')
563 if (!firstmeaning)
565 meanings.append(curmeaning);
566 curmeaning = "";
568 else
570 firstmeaning = false;
571 parsemode = "meaning";
574 else if (ichar == ' ')
576 if (parsemode == "meaning") // only one that needs the space
577 curmeaning += ' ';
579 else if (parsemode == "kanji")
581 kanji += ichar;
583 else if (parsemode == "meaning")
585 curmeaning += ichar;
587 else if (parsemode == "reading")
589 reading += ichar;
593 return (Entry(kanji, reading, meanings));
596 Entry Dict::kanjiParse(const QString &raw)
598 unsigned int length = raw.length();
599 if (raw.left(5) == "DICT ")
600 return Entry(raw.right(length - 5));
601 if (raw.left(8) == "HEADING ")
602 return Entry(raw.right(length - 8), true);
604 QStringList readings;
605 QString kanji;
606 QStringList meanings;
607 QString curmeaning;
608 QString curreading;
610 QString strfreq;
611 QString strgrade;
612 QString strstrokes;
613 QString strmiscount = "";
615 bool prevwasspace = true;
616 QChar detailname;
617 QCString parsemode("kanji");
619 // if there are two S entries, second is common miscount
620 bool strokesset = false;
622 unsigned int i;
623 QChar ichar;
624 for (i = 0; i < length; i++)
626 ichar = raw.at(i);
628 if (ichar == ' ')
630 if (parsemode == "reading")
632 readings.append(curreading);
633 curreading = "";
635 else if (parsemode == "kanji")
637 parsemode = "misc";
639 else if (parsemode == "detail")
641 if (detailname == 'S')
642 strokesset = true;
644 parsemode = "misc";
646 else if (parsemode == "meaning")
648 curmeaning += ichar;
650 prevwasspace = true;
652 else if (ichar == '{')
654 parsemode = "meaning";
656 else if (ichar == '}')
658 meanings.append(curmeaning);
659 curmeaning = "";
661 else if (parsemode == "detail")
663 if (detailname == 'G')
665 strgrade += ichar;
667 else if (detailname == 'F')
669 strfreq += ichar;
671 else if (detailname == 'S')
673 if (strokesset)
674 strmiscount += ichar;
675 else
676 strstrokes += ichar;
678 prevwasspace = false;
680 else if (parsemode == "kanji")
682 kanji += ichar;
684 else if (parsemode == "meaning")
686 curmeaning += ichar;
688 else if (parsemode == "reading")
690 curreading += ichar;
692 else if (parsemode == "misc" && prevwasspace)
694 if (QRegExp("[A-Za-z0-9]").search(QString(ichar)) >= 0)
695 // is non-japanese?
697 detailname = ichar;
698 parsemode = "detail";
700 else
702 curreading = QString(ichar);
703 parsemode = "reading";
708 return (Entry(kanji, readings, meanings, strgrade.toUInt(), strfreq.toUInt(), strstrokes.toUInt(), strmiscount.toUInt()));
711 QString Dict::prettyMeaning(QStringList Meanings)
713 QString meanings;
714 QStringList::Iterator it;
715 for (it = Meanings.begin(); it != Meanings.end(); ++it)
716 meanings.append((*it).stripWhiteSpace()).append("; ");
718 meanings.truncate(meanings.length() - 2);
719 return meanings;
722 QString Dict::prettyKanjiReading(QStringList Readings)
724 QStringList::Iterator it;
725 QString html;
727 for (it = Readings.begin(); it != Readings.end(); ++it)
729 if ((*it) == "T1")
730 html += i18n("In names: ");
731 else
733 if ((*it) == "T2")
734 html += i18n("As radical: ");
735 else
737 html += (*it).stripWhiteSpace();
738 html += ", ";
742 html.truncate(html.length() - 2); // get rid of last ,
744 return html;
747 Dict::Entry Dict::firstEntry(Dict::SearchResult result)
749 for (QValueListIterator<Dict::Entry> it = result.list.begin(); it != result.list.end(); ++it)
751 if ((*it).dictName() == "__NOTSET" && (*it).header() == "__NOTSET")
752 return (*it);
755 return Dict::Entry("__NOTHING");
758 QString Dict::firstEntryText(Dict::SearchResult result)
760 for (QStringList::Iterator it = result.results.begin(); it != result.results.end(); ++it)
762 if ((*it).left(5) != "DICT " && (*it).left(7) != "HEADER ")
763 return (*it);
766 return QString("NONE ");
769 ///////////////////////////////////////////////////////////////
771 Entry::Entry(const QString & kanji, const QString & reading, const QStringList &meanings)
772 : DictName(QString::fromLatin1("__NOTSET"))
773 , Header(QString::fromLatin1("__NOTSET"))
774 , Meanings(meanings)
775 , Kanji(kanji)
776 , KanaOnly(reading.isEmpty())
777 , Readings(KanaOnly ? kanji : reading)
778 , ExtendedKanjiInfo(false)
779 , Grade(0)
780 , Strokes(0)
781 , Miscount(0)
782 , Freq(0)
786 Entry::Entry(const QString &kanji, QStringList &readings, QStringList &meanings, unsigned int grade, unsigned int freq, unsigned int strokes, unsigned int miscount)
787 : DictName(QString::fromLatin1("__NOTSET"))
788 , Header(QString::fromLatin1("__NOTSET"))
789 , Meanings(meanings)
790 , Kanji(kanji)
791 , KanaOnly(false)
792 , Readings(readings)
793 , ExtendedKanjiInfo(true)
794 , Grade(grade)
795 , Strokes(strokes)
796 , Miscount(miscount)
797 , Freq(freq)
801 Entry::Entry(const QString &dictname)
802 : KanaOnly(true)
803 , ExtendedKanjiInfo(false)
805 DictName = dictname;
808 Entry::Entry(const QString &headername, bool)
809 : DictName(QString::fromLatin1("__NOTSET"))
810 , Header(headername)
811 , KanaOnly(true)
812 , ExtendedKanjiInfo(false)
816 QString Entry::dictName()
818 return DictName;
821 QString Entry::header()
823 return Header;
826 bool Entry::kanaOnly()
828 return KanaOnly;
831 QString Entry::kanji()
833 return Kanji;
836 QStringList Entry::readings()
838 return Readings;
841 QString Entry::firstReading()
843 return *Readings.at(0);
846 QStringList Entry::meanings()
848 return Meanings;
851 unsigned int Entry::grade()
853 return Grade;
856 unsigned int Entry::freq()
858 return Freq;
861 unsigned int Entry::miscount()
863 return Miscount;
866 unsigned int Entry::strokes()
868 return Strokes;
871 bool Entry::extendedKanjiInfo()
873 return ExtendedKanjiInfo;
876 #include "dict.moc"