2 This file is part of Kiten, a KDE Japanese Reference Tool...
3 Copyright (C) 2001 Jason Katz-Brown <jason@katzbrown.com>
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 #include <kmessagebox.h>
25 #include <kstandarddirs.h>
27 #include <qfileinfo.h>
29 #include <qtextcodec.h>
// Reports an error to the user in a message box.
// `msg` is a message template; `dict` is substituted into it with
// QString::arg() and the result is handed to KMessageBox::error() with 0 as
// its first argument. `dict` defaults to QString::null.
40 void msgerr(const QString
&msg
, const QString
&dict
= QString::null
)
44 output
= msg
.arg(dict
);
45 KMessageBox::error(0, output
);
// Classifies `text` by script, looking only at the code unit of its first
// character (text.at(0).unicode()).
// NOTE(review): nothing guards at(0) against an empty string in the code
// shown — confirm callers never pass empty text.
51 TextType
Dict::textType(const QString
&text
)
53 ushort first
= text
.at(0).unicode();
57 // else if (first < 0x3040) // CJK Symbols and Punctuation
59 // else if (first < 0x30A0) // Hiragana
// The two finer-grained range tests above are commented out, so any value
// that gets this far and is below 0x3100 takes this branch.
61 else if (first
< 0x3100) // Katakana
// Catch-all: the Extension A range test is commented out, so every remaining
// value lands here.
64 else /*if (first >= 0x3400 && first < 0x4DC0)*/ // CJK Unified Ideographs Extension A
// Opens the dictionary at `path` and its companion index file, and maps both
// into memory read-only.
//
// The index file is named <basename of path>.xjdx and lives under the
// location KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) returns.
// Both mapping pointers start out as MAP_FAILED. Failures are reported to the
// user through msgerr().
// NOTE(review): `path` and `n` are taken by value — confirm whether const
// references would do.
68 File::File(QString path
, QString n
)
71 , dictPtr((const unsigned char *)MAP_FAILED
)
72 , indexFile(KGlobal::dirs()->saveLocation("data", "kiten/xjdx/", true) + QFileInfo(path
).baseName() + ".xjdx")
73 , indexPtr((const uint32_t *)MAP_FAILED
)
76 bool forceUpdate
= false;
78 bool indexFileExists
= indexFile
.exists();
81 // ### change this if need be!!
// Added to the dictionary length below to form the value the index file's
// first word is compared against.
82 const int indexFileVersion
= 14;
84 // this up-to-date code from xjdservcomm.c
85 // we need to check if the index needs to
89 QFile
dictionary(path
);
90 dictionaryLength
= dictionary
.size();
92 //kdDebug() << "dictionaryLength = " << dictionaryLength << endl;
// Read the first 32-bit word of the index file.
// NOTE(review): the FILE* returned by fopen() is passed straight to fread();
// on this line it is neither checked for NULL nor ever fclose()d, and the
// fread() return value is ignored — confirm and fix the leak.
95 fread(&testWord
[0], sizeof(int32_t), 1, fopen(indexFile
.name().latin1(), "rb"));
97 //kdDebug() << "testWord[0] = " << testWord[0] << endl;
// The index is considered current only when its first word equals
// dictionaryLength + indexFileVersion.
99 if (testWord
[0] != (dictionaryLength
+ indexFileVersion
))
// (Re)generate the index when it is missing or an update was requested.
103 if (!indexFileExists
|| forceUpdate
)
105 //kdDebug() << "creating " << indexFile.name() << endl;
106 // find the index generator executable
// Command line: <kitengen> <dictionary path> <index file name>.
// NOTE(review): the result of findExe() is used unchecked here — confirm
// behaviour when "kitengen" is not installed.
108 proc
<< KStandardDirs::findExe("kitengen") << path
<< indexFile
.name();
109 // TODO: put up a status dialog and event loop instead of blocking
110 proc
.start(KProcess::Block
, KProcess::NoCommunication
);
// Open and map the dictionary itself.
113 if (!dictFile
.open(IO_ReadOnly
))
115 msgerr(i18n("Could not open dictionary %1."), path
);
119 dictPtr
= (const unsigned char *)mmap(0, dictFile
.size(), PROT_READ
, MAP_SHARED
, dictFile
.handle(), 0);
120 if (dictPtr
== (unsigned char*) MAP_FAILED
)
122 msgerr(i18n("Memory error when loading dictionary %1."), path
);
// Open and map the index file the same way.
126 if (!indexFile
.open(IO_ReadOnly
))
128 msgerr(i18n("Could not open index for dictionary %1."), path
);
132 indexPtr
= (const uint32_t*)mmap(0, indexFile
.size(), PROT_READ
, MAP_SHARED
, indexFile
.handle(), 0);
133 if (indexPtr
== (uint32_t*) MAP_FAILED
)
135 msgerr(i18n("Memory error when loading dictionary %1's index file."), path
);
144 if (dictPtr
!= (unsigned char*) MAP_FAILED
)
145 munmap((char *)dictPtr
, dictFile
.size());
148 if (indexPtr
!= (uint32_t*) MAP_FAILED
)
149 munmap((char *)indexPtr
, indexFile
.size());
153 QString
File::name(void)
// Wraps the mapped dictionary in an Array<const unsigned char>, built from
// dictPtr and dictFile.size().
158 Array
<const unsigned char> File::dict(void)
161 return Array
<const unsigned char>(dictPtr
, dictFile
.size());
// Wraps the mapped index in an Array<const uint32_t>, built from indexPtr and
// indexFile.size().
// NOTE(review): indexFile.size() is the size of the file, while the elements
// are uint32_t — confirm whether Array expects a byte count or an element
// count here.
164 Array
<const uint32_t> File::index(void)
167 return Array
<const uint32_t>(indexPtr
, indexFile
.size());
// Returns dictFile.size(), converted to int.
170 int File::dictLength(void)
172 return dictFile
.size();
// Returns indexFile.size(), converted to int.
175 int File::indexLength(void)
177 return indexFile
.size();
180 bool File::isValid(void)
185 // returns specified character from a dictionary
// `i` selects an index entry; `offset` is added to that entry's value.
// The dictionary position is indexPtr[i] + offset - 1 (the "- 1" suggests the
// index stores 1-based offsets — TODO confirm). `offset` is signed, so callers
// can look backwards from the indexed position.
// Positions beyond the dictionary yield 10 (0x0a, a newline).
// NOTE(review): `i` is not range-checked against the index here.
186 unsigned char File::lookup(unsigned i
, int offset
)
188 uint32_t pos
= indexPtr
[i
] + offset
- 1;
// NOTE(review): the guard is `>`, so pos == dictFile.size() is not rejected
// here — confirm whether that position is one past the mapped data.
189 if (pos
> dictFile
.size()) return 10;
// Returns the text that index entry `i` points at, as a QCString.
// Scanning starts at indexPtr[i] - 1 and stops at the first NUL byte, the
// first newline (0x0a), or once the position runs past the dictionary size.
// NOTE(review): `i` is not range-checked against the index here.
193 QCString
File::lookup(unsigned i
)
195 uint32_t start
= indexPtr
[i
] - 1;
196 uint32_t pos
= start
;
197 const unsigned size
= dictFile
.size();
198 // get the whole word
// NOTE(review): the bound is `pos <= size`, so dictPtr[size] can be read —
// confirm whether that is one byte past the mapping.
199 while(pos
<= size
&& dictPtr
[pos
] != 0 && dictPtr
[pos
] != 0x0a)
201 // put the word in the QCString
// Built from the address dictPtr + start with pos - start as the length
// argument.
202 QCString
retval((const char *)(dictPtr
+ start
), pos
- start
);
205 retval
.append(&null
);
210 // And last, Index itself is the API presented to the rest of Kiten
214 dictFiles
.setAutoDelete(true);
215 kanjiDictFiles
.setAutoDelete(true);
// Sets the word dictionaries: forwards the dictionary list `list` and the
// matching `names` to loadDictList(), with dictFiles as the list to fill.
222 void Index::setDictList(const QStringList
&list
, const QStringList
&names
)
224 loadDictList(dictFiles
, list
, names
);
// Sets the kanji dictionaries: forwards the dictionary list `list` and the
// matching `names` to loadDictList(), with kanjiDictFiles as the list to fill.
227 void Index::setKanjiDictList(const QStringList
&list
, const QStringList
&names
)
229 loadDictList(kanjiDictFiles
, list
, names
);
// Creates a File for each (path, name) pair taken from `dictList` and
// `dictNameList`; `fileList` is the list being loaded.
// An empty `dictList` is reported to the user through msgerr().
232 void Index::loadDictList(QPtrList
<File
> &fileList
, const QStringList
&dictList
, const QStringList
&dictNameList
)
236 // check if we have a dict
237 if (dictList
.size() < 1)
239 msgerr(i18n("No dictionaries in list!"));
243 QStringList::ConstIterator it
;
244 QStringList::ConstIterator dictIt
;
// Both lists are walked in lockstep.
// NOTE(review): only `it` is compared against its end(); `dictIt` is
// dereferenced without a bounds check, so this assumes dictNameList has at
// least as many items as dictList — confirm.
245 for (it
= dictList
.begin(), dictIt
= dictNameList
.begin(); it
!= dictList
.end(); ++it
, ++dictIt
)
247 File
*f
= new File(*it
, *dictIt
);
248 // our ugly substitute for exceptions
// Looks `text` up in `file` and returns the matching dictionary lines.
// The query is converted to EUC-JP, index entries are compared against it
// with stringCompare(), and every matching entry's full line is decoded back
// to Unicode with a trailing "\n".
256 QStringList
Index::doSearch(File
&file
, const QString
&text
)
258 // Do a binary search to find an entry that matches text
// NOTE(review): codecForName() returns a pointer that is dereferenced
// immediately — no null check is visible if the "eucJP" codec is unavailable.
259 QTextCodec
&codec
= *QTextCodec::codecForName("eucJP");
260 QCString eucString
= codec
.fromUnicode(text
);
264 Array
<const uint32_t> index
= file
.index();
265 Array
<const unsigned char> dict
= file
.dict();
267 int hi
= index
.size() - 1;
274 comp
= stringCompare(file
, cur
, eucString
);
// Loop ends when the bounds cross, an equal entry is found (comp == 0), or
// both bounds have reached 0.
281 while(hi
>= lo
&& comp
!= 0 && !(hi
== 0 && lo
== 0));
286 // wheel back to make sure we get the first matching entry
// NOTE(review): the first operand is `cur - 1`, which is zero when cur == 1,
// so entry 0 is never tested by this rewind — confirm that is intended.
287 while(cur
- 1 && 0 == stringCompare(file
, cur
- 1, eucString
))
290 // output every matching entry
291 while(cur
< index
.size() && 0 == stringCompare(file
, cur
, eucString
))
293 // because the index doesn't point
294 // to the start of the line, find the
295 // start of the line:
// Step `i` backwards until the byte before it is a newline.
297 while(file
.lookup(cur
, i
- 1) != 0x0a) --i
;
300 while(file
.lookup(cur
, i
) != 0x0a) // get to end of our line
302 const char eucchar
= file
.lookup(cur
, i
);
// Append one byte: grow `bytes` by one and store into the new last slot.
303 bytes
.resize(bytes
.size() + 1);
304 bytes
[bytes
.size() - 1] = eucchar
;
308 QString result
= codec
.toUnicode(bytes
) + QString("\n");
// Only append a line that differs from prevResult, so a line identical to
// the previous one is not added twice.
309 if (prevResult
!= result
)
311 results
.append(result
);
319 // return all the entries found, or null if no match
// Filters raw result lines into a SearchResult.
// `results` holds raw lines, including "DICT " / "HEADING " marker lines.
// `regexp` is searched in each non-marker line. When `common` is true, only
// lines containing "(P)" are kept.
// NOTE(review): `regexp` and `results` are taken by value — confirm whether
// the copies are needed.
323 SearchResult
Index::scanResults(QRegExp regexp
, QStringList results
, bool common
)
325 unsigned int num
= 0;
326 unsigned int fullNum
= 0;
330 //ret.results = results; //not here..
332 for (QStringList::Iterator itr
= results
.begin(); itr
!= results
.end(); ++itr
)
// Marker lines are parsed and added to ret.list.
334 if ((*itr
).left(5) == "DICT " || (*itr
).left(8) == "HEADING ")
336 ret
.list
.append(parse(*itr
));
340 int found
= regexp
.search(*itr
);
345 if ((*itr
).find(QString("(P)")) >= 0 || !common
)
347 // we append HERE, so we get the exact
348 // results we have in ret.list
350 ret
.results
.append(*itr
);
351 ret
.list
.append(parse(*itr
));
// Searches every file in dictFiles for `text`.
// For each file a "DICT <name>" marker line is added, followed by that
// file's doSearch() hits. The combined list then goes through scanResults()
// with `regexp` and `common`.
363 SearchResult
Index::search(QRegExp regexp
, const QString
&text
, bool common
)
366 for (QPtrListIterator
<File
> file(dictFiles
); *file
; ++file
)
368 results
.append(QString("DICT ") + (*file
)->name());
370 results
+= doSearch(**file
, text
);
373 SearchResult res
= scanResults(regexp
, results
, common
);
// Kanji-dictionary counterpart of result filtering: entries are built with
// kanjiParse().
// `results` holds raw lines, including "DICT " / "HEADING " marker lines.
// `regexp` is searched in each non-marker line. When `common` is true, only
// lines matching the grade pattern below are kept.
378 SearchResult
Index::scanKanjiResults(QRegExp regexp
, QStringList results
, bool common
)
380 unsigned int num
= 0;
381 unsigned int fullNum
= 0;
382 const bool jmyCount
= false; // don't count JinMeiYou as common
// The raw list is stored unfiltered in ret.results.
384 ret
.results
= results
;
386 for (QStringList::Iterator itr
= results
.begin(); itr
!= results
.end(); ++itr
)
388 if ((*itr
).left(5) == "DICT " || (*itr
).left(8) == "HEADING ")
390 ret
.list
.append(kanjiParse(*itr
));
394 int found
= regexp
.search(*itr
);
399 // common entries have G[1-8] (jouyou)
// jmyCount is a constant false, so the pattern used is always "G[1-8]".
// NOTE(review): this QRegExp is constructed for each line it is reached on;
// it does not depend on the line and could be built once.
400 QRegExp
comregexp(jmyCount
? "G[1-9]" : "G[1-8]");
401 if ((*itr
).find(comregexp
) >= 0 || !common
)
403 ret
.list
.append(kanjiParse(*itr
));
// Searches every file in kanjiDictFiles for `text`.
// For each file a "DICT <name>" marker line is added, followed by that
// file's doSearch() hits. The combined list then goes through
// scanKanjiResults() with `regexp` and `common`.
415 SearchResult
Index::searchKanji(QRegExp regexp
, const QString
&text
, bool common
)
418 for (QPtrListIterator
<File
> file(kanjiDictFiles
); *file
; ++file
)
420 results
.append(QString("DICT ") + (*file
)->name());
422 results
+= doSearch(**file
, text
);
425 SearchResult res
= scanKanjiResults(regexp
, results
, common
);
// Re-filters the raw lines of an earlier result (`list.results`) with a new
// `regexp` / `common` setting instead of searching the dictionaries again.
// The scanner is chosen from the first entry of `list`: scanKanjiResults()
// when it carries extended kanji info, scanResults() for the other case.
// NOTE(review): `text` is not referenced in the statements below — confirm
// whether it is used at all.
430 SearchResult
Index::searchPrevious(QRegExp regexp
, const QString
&text
, SearchResult list
, bool common
)
434 if (firstEntry(list
).extendedKanjiInfo())
435 res
= scanKanjiResults(regexp
, list
.results
, common
);
437 res
= scanResults(regexp
, list
.results
, common
);
// Builds the QRegExp used to filter search results.
// The pattern template is selected by `type` (Search_Beginning,
// Search_FullWord, Search_Anywhere) and, for the first two, by the script of
// `text` as reported by textType(). Kana queries additionally distinguish
// the Kanjidict dictionary type.
// `caseSensitive` is passed to the QRegExp constructor.
443 QRegExp
Dict::Index::createRegExp(SearchType type
, const QString
&text
, DictionaryType dictionaryType
, bool caseSensitive
)
448 case Search_Beginning
:
449 switch (textType(text
))
451 case Dict::Text_Latin
:
455 case Dict::Text_Kana
:
456 if (dictionaryType
== Kanjidict
)
462 case Dict::Text_Kanji
:
467 case Search_FullWord
:
468 switch (textType(text
))
470 case Dict::Text_Latin
:
474 case Dict::Text_Kana
:
475 if (dictionaryType
== Kanjidict
)
481 case Dict::Text_Kanji
:
486 case Search_Anywhere
:
// NOTE(review): `text` is substituted into the pattern with arg(); no
// escaping of regexp metacharacters is visible on this line — confirm
// whether user input containing e.g. '(' or '*' is handled elsewhere.
490 return QRegExp(regExp
.arg(text
), caseSensitive
);
// Compares the text of index entry `index` in `file` (file.lookup(index))
// against `str` using eucStringCompare(), and returns that result.
// NOTE(review): `str` is a QCString taken by value — confirm whether the
// copy is needed.
493 int Index::stringCompare(File
&file
, int index
, QCString str
)
495 return eucStringCompare(file
.lookup(index
), str
);
498 // effectively does a strnicmp on two "strings"
499 // except it will make katakana and hiragana match (EUC A4 & A5)
// Both inputs are walked byte by byte as unsigned char. The result is the
// difference c2 - c, i.e. the byte from `str2` minus the byte from `str`.
500 int Dict::eucStringCompare(const char *str
, const char *str2
)
502 for (unsigned i
= 0; ; ++i
)
504 unsigned char c
= static_cast<unsigned char>(str
[i
]);
505 unsigned char c2
= static_cast<unsigned char>(str2
[i
]);
// Reaching the terminator of either string is handled as a special case.
506 if ((c2
== '\0') || (c
== '\0'))
// ASCII upper-case letters are folded to lower case by setting bit 0x20.
518 if ((c2
>= 'A') && (c2
<= 'Z')) c2
|= 0x20; /*fix ucase*/
519 if ((c
>= 'A') && (c
<= 'Z')) c
|= 0x20;
522 return (int)c2
- (int)c
;
528 bool Dict::isEUC(unsigned char c
)
// Turns one raw word-dictionary result line into an Entry.
// Marker lines are handled first: "DICT <name>" and "HEADING <text>" become
// entries carrying only the text after the prefix.
// Any other line is scanned character by character; `parsemode` ("kanji",
// "reading" or "meaning") records which field is currently being read.
533 Entry
Dict::parse(const QString
&raw
)
535 unsigned int length
= raw
.length();
536 if (raw
.left(5) == "DICT ")
537 return Entry(raw
.right(length
- 5));
538 if (raw
.left(8) == "HEADING ")
539 return Entry(raw
.right(length
- 8), true);
543 QStringList meanings
;
545 bool firstmeaning
= true;
// Parsing starts in the "kanji" field.
546 QCString
parsemode("kanji");
549 for (i
= 0; i
< length
; i
++)
551 QChar
ichar(raw
.at(i
));
555 parsemode
= "reading";
557 else if (ichar
== ']')
// '/' handling: a collected meaning is appended to `meanings`, and the mode
// switches to "meaning".
561 else if (ichar
== '/')
565 meanings
.append(curmeaning
);
570 firstmeaning
= false;
571 parsemode
= "meaning";
574 else if (ichar
== ' ')
576 if (parsemode
== "meaning") // only one that needs the space
// Any other character is dispatched on the current mode.
579 else if (parsemode
== "kanji")
583 else if (parsemode
== "meaning")
587 else if (parsemode
== "reading")
593 return (Entry(kanji
, reading
, meanings
));
// Turns one raw kanji-dictionary result line into an Entry.
// Marker lines are handled first: "DICT <name>" and "HEADING <text>" become
// entries carrying only the text after the prefix.
// Any other line is scanned character by character; `parsemode` ("kanji",
// "reading", "meaning", "detail" or "misc") records what is being read.
// The returned Entry is built from the kanji, the readings and meanings
// lists, and the grade, frequency, stroke-count and miscount strings
// converted with toUInt().
596 Entry
Dict::kanjiParse(const QString
&raw
)
598 unsigned int length
= raw
.length();
599 if (raw
.left(5) == "DICT ")
600 return Entry(raw
.right(length
- 5));
601 if (raw
.left(8) == "HEADING ")
602 return Entry(raw
.right(length
- 8), true);
604 QStringList readings
;
606 QStringList meanings
;
613 QString strmiscount
= "";
615 bool prevwasspace
= true;
617 QCString
parsemode("kanji");
619 // if there are two S entries, second is common miscount
620 bool strokesset
= false;
624 for (i
= 0; i
< length
; i
++)
// First dispatch on the current mode (closing the field being read).
630 if (parsemode
== "reading")
632 readings
.append(curreading
);
635 else if (parsemode
== "kanji")
639 else if (parsemode
== "detail")
641 if (detailname
== 'S')
646 else if (parsemode
== "meaning")
// Meanings are delimited by '{' and '}'.
652 else if (ichar
== '{')
654 parsemode
= "meaning";
656 else if (ichar
== '}')
658 meanings
.append(curmeaning
);
// Inside a detail field the character goes to the string chosen by the
// detail's letter: 'G', 'F' or 'S'.
661 else if (parsemode
== "detail")
663 if (detailname
== 'G')
667 else if (detailname
== 'F')
671 else if (detailname
== 'S')
674 strmiscount
+= ichar
;
678 prevwasspace
= false;
680 else if (parsemode
== "kanji")
684 else if (parsemode
== "meaning")
688 else if (parsemode
== "reading")
692 else if (parsemode
== "misc" && prevwasspace
)
// An ASCII letter or digit here starts a detail field; otherwise the
// character starts a reading.
// NOTE(review): a fresh QRegExp is constructed each time this test runs —
// the pattern is constant and could be built once.
694 if (QRegExp("[A-Za-z0-9]").search(QString(ichar
)) >= 0)
698 parsemode
= "detail";
702 curreading
= QString(ichar
);
703 parsemode
= "reading";
708 return (Entry(kanji
, readings
, meanings
, strgrade
.toUInt(), strfreq
.toUInt(), strstrokes
.toUInt(), strmiscount
.toUInt()));
// Joins the meanings in `Meanings` into one display string: each item is
// stripped of surrounding whitespace and followed by "; ", then the final
// two characters (the last separator) are cut off.
// NOTE(review): `Meanings` is taken by value — confirm whether a const
// reference would do.
711 QString
Dict::prettyMeaning(QStringList Meanings
)
714 QStringList::Iterator it
;
715 for (it
= Meanings
.begin(); it
!= Meanings
.end(); ++it
)
716 meanings
.append((*it
).stripWhiteSpace()).append("; ");
// NOTE(review): with an empty list nothing was appended above, so
// length() - 2 is computed on whatever `meanings` started as — confirm the
// empty-input behaviour.
718 meanings
.truncate(meanings
.length() - 2);
// Builds a display string (`html`) from the kanji readings in `Readings`.
// Each reading is appended with surrounding whitespace stripped; the
// translated labels "In names: " and "As radical: " are inserted for some
// items. The final two characters are removed at the end.
// NOTE(review): `Readings` is taken by value — confirm whether a const
// reference would do.
722 QString
Dict::prettyKanjiReading(QStringList Readings
)
724 QStringList::Iterator it
;
727 for (it
= Readings
.begin(); it
!= Readings
.end(); ++it
)
730 html
+= i18n("In names: ");
734 html
+= i18n("As radical: ");
737 html
+= (*it
).stripWhiteSpace();
742 html
.truncate(html
.length() - 2); // get rid of last ,
// Scans result.list for an entry whose dictName() and header() are both
// "__NOTSET", i.e. one that is neither a dictionary-name marker nor a
// heading. Falls back to an Entry built from "__NOTHING".
// NOTE(review): `result` is taken by value, copying the whole SearchResult —
// confirm whether a const reference would do.
747 Dict::Entry
Dict::firstEntry(Dict::SearchResult result
)
749 for (QValueListIterator
<Dict::Entry
> it
= result
.list
.begin(); it
!= result
.list
.end(); ++it
)
751 if ((*it
).dictName() == "__NOTSET" && (*it
).header() == "__NOTSET")
755 return Dict::Entry("__NOTHING");
// Scans result.results for a raw line that is not a marker line.
// Falls back to the string "NONE ".
// NOTE(review): `result` is taken by value, copying the whole SearchResult —
// confirm whether a const reference would do.
758 QString
Dict::firstEntryText(Dict::SearchResult result
)
760 for (QStringList::Iterator it
= result
.results
.begin(); it
!= result
.results
.end(); ++it
)
// NOTE(review): heading lines are recognised elsewhere in this file by the
// 8-character prefix "HEADING " (see Dict::parse). The 7-character test
// against "HEADER " below cannot match such a line, so headings are not
// skipped here — confirm and align the prefix.
762 if ((*it
).left(5) != "DICT " && (*it
).left(7) != "HEADER ")
766 return QString("NONE ");
769 ///////////////////////////////////////////////////////////////
// Constructs a word-dictionary entry from `kanji`, `reading` and `meanings`.
// DictName and Header are set to the "__NOTSET" placeholder, and
// ExtendedKanjiInfo is false.
// KanaOnly is true when `reading` is empty; in that case `kanji` is used to
// initialise Readings instead of `reading`.
// NOTE(review): the Readings initialiser reads KanaOnly, so KanaOnly must be
// declared before Readings in the class for this to be well-defined —
// confirm the member order in the header.
771 Entry::Entry(const QString
& kanji
, const QString
& reading
, const QStringList
&meanings
)
772 : DictName(QString::fromLatin1("__NOTSET"))
773 , Header(QString::fromLatin1("__NOTSET"))
776 , KanaOnly(reading
.isEmpty())
777 , Readings(KanaOnly
? kanji
: reading
)
778 , ExtendedKanjiInfo(false)
// Constructs a kanji-dictionary entry from `kanji`, `readings`, `meanings`
// and the numeric details `grade`, `freq`, `strokes` and `miscount`.
// DictName and Header are set to the "__NOTSET" placeholder, and
// ExtendedKanjiInfo is true.
// NOTE(review): `readings` and `meanings` are non-const references, unlike
// the word-entry constructor's const references — confirm whether they are
// modified, otherwise const would be consistent.
786 Entry::Entry(const QString
&kanji
, QStringList
&readings
, QStringList
&meanings
, unsigned int grade
, unsigned int freq
, unsigned int strokes
, unsigned int miscount
)
787 : DictName(QString::fromLatin1("__NOTSET"))
788 , Header(QString::fromLatin1("__NOTSET"))
793 , ExtendedKanjiInfo(true)
// Constructs a dictionary-name marker entry; ExtendedKanjiInfo is false.
// NOTE(review): presumably `dictname` initialises DictName — confirm.
801 Entry::Entry(const QString
&dictname
)
803 , ExtendedKanjiInfo(false)
// Constructs a heading marker entry. The unnamed bool parameter is never
// read; it only distinguishes this overload from the dictionary-name
// constructor. DictName is set to the "__NOTSET" placeholder and
// ExtendedKanjiInfo is false.
// NOTE(review): presumably `headername` initialises Header — confirm.
808 Entry::Entry(const QString
&headername
, bool)
809 : DictName(QString::fromLatin1("__NOTSET"))
812 , ExtendedKanjiInfo(false)
816 QString
Entry::dictName()
821 QString
Entry::header()
826 bool Entry::kanaOnly()
831 QString
Entry::kanji()
836 QStringList
Entry::readings()
// Returns the first item of Readings.
// NOTE(review): the result of Readings.at(0) is dereferenced without any
// visible check that Readings is non-empty — confirm.
841 QString
Entry::firstReading()
843 return *Readings
.at(0);
846 QStringList
Entry::meanings()
851 unsigned int Entry::grade()
856 unsigned int Entry::freq()
861 unsigned int Entry::miscount()
866 unsigned int Entry::strokes()
// Returns the ExtendedKanjiInfo flag: true for entries built by the kanji
// constructor, false for the others.
871 bool Entry::extendedKanjiInfo()
873 return ExtendedKanjiInfo
;