parsers/kanjidic.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 # Copyright (c) 2009, Paul Goins
   5 # All rights reserved.
   6 #
   7 # Redistribution and use in source and binary forms, with or without
   8 # modification, are permitted provided that the following conditions
   9 # are met:
  10 #
  11 #     * Redistributions of source code must retain the above copyright
  12 #       notice, this list of conditions and the following disclaimer.
  13 #     * Redistributions in binary form must reproduce the above
  14 #       copyright notice, this list of conditions and the following
  15 #       disclaimer in the documentation and/or other materials provided
  16 #       with the distribution.
  17 #
  18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29 # POSSIBILITY OF SUCH DAMAGE.
  30
  31 """A parser for KANJIDIC.
  32
  33 This parser is dependent on a small amount of code kept in the
  34 kanjidic2 parser, so be sure to grab both if you are using these
  35 modules in your own programs.
  36
  37 """
  38
  39 import re, gzip, gettext
  40 gettext.install('pyjben', unicode=True)
  41
  42 from parsers.kanjidic_common \
  43      import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
  44
  45
  46 alpha_regex = re.compile(u"(^[^0-9]+)(.*)")
  47
  48 # Copied from J-Ben 1.x and modified using Gnome Character Map's
  49 # "Unicode Block" information.
  50 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
  51
  52 def is_hiragana(uc):
  53     # 3040..309F; Hiragana
  54     o = ord(uc)
  55     return o >= 0x3040 and o <= 0x309F
  56
  57 def is_katakana(uc):
  58     # 30A0..30FF; Katakana
  59     # 31F0..31FF; Katakana Phonetic Extensions (Not currently used in J-Ben)
  60     o = ord(uc)
  61     return o >= 0x30A0 and o <= 0x30FF
  62
  63 def is_furigana(uc):
  64     return is_hiragana(uc) or is_katakana(uc)
  65
  66 def jis_hex_to_kuten(hex_code):
  67     """KANJIDIC2-style kuten string"""
  68     return u"%s-%s" % (
  69         (((hex_code >> 8) & 0xFF) - 0x20),
  70         ((hex_code & 0xFF) - 0x20))
  71
  72 def kanjidic_key_to_kanjidic2(dkey):
  73     """Converts KANJIDIC dictionary keys to KANJIDIC2.
  74
  75     If unable to find a KANJIDIC2 key, returns the original key.
  76
  77     """
  78     d = {
  79         "H": "halpern_njecd",
  80         "N": "nelson_c",
  81         "V": "nelson_n",
  82         "IN": "sh_kk",
  83         "MN": "moro",
  84         "E": "henshall",
  85         "K": "gakken",
  86         "L": "heisig",
  87         "O": "oneill_names",
  88         "DB": "busy_people",
  89         "DC": "crowley",
  90         "DF": "jf_cards",
  91         "DG": "kodansha_compact",
  92         "DH": "henshall3",
  93         "DJ": "kanji_in_context",
  94         "DK": "halpern_kkld",
  95         "DO": "oneill_kk",
  96         "DS": "sakade",
  97         "DT": "tutt_cards",
  98         "DM": "maniette"
  99         }
 100     return d.get(dkey, dkey)
 101
 102 class KanjidicEntry(object):
 103
 104     def __init__(self, raw_entry):
 105         # Key info
 106         self.literal = None
 107         self.meanings = []
 108         self.kunyomi = []
 109         self.onyomi = []
 110         self.nanori = []
 111         # Secondary info
 112         self.strokes = None
 113         self.strokes_miss = []
 114         self.freq = None
 115         self.grade = None
 116         self.jlpt = None
 117         # Info of low importance for most target users
 118         self.jis = None
 119         self.radical = None
 120         self.radical_c = None  # "Classic" KangXi Zidian radical
 121         self.radname = None
 122         self.pinyin = []
 123         self.korean = []
 124         # "Query codes": Pattern-based lookup
 125         # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
 126         self.qcodes = {}
 127         # Dictionary codes
 128         self.dcodes = {}
 129         # Dictionary-related metadata
 130         self.xref = []
 131         self.misclass = []
 132         self.unparsed = []
 133
 134         self.parse_entry(raw_entry)
 135
 136     def parse_entry(self, raw_entry):
 137         if not raw_entry:
 138             return None
 139
 140         state = ParserState()  # Holds "t class"
 141
 142         # First 2 fields are always the same
 143         pieces = raw_entry.split(None, 2)
 144         misc = pieces.pop()
 145         self.jis = int(pieces.pop(), 16)
 146         self.literal = pieces.pop()
 147
 148         # Parse the remainder
 149         si = ei = 0
 150         while si < len(misc):
 151             c = misc[si]
 152             i = ord(c)
 153             if c == u' ':
 154                 si += 1
 155                 continue
 156             if i > 0xFF or c in (u'-', u'.'):
 157                 # Parse Japanese
 158                 ei = misc.find(u' ', si+1)
 159                 if ei == -1:
 160                     ei = len(misc) + 1
 161                 sub = misc[si:ei]
 162
 163                 self._parse_japanese(state, sub)
 164             elif c == u'{':
 165                 # Parse Translation
 166                 si += 1  # Move si inside of {
 167                 ei = misc.find(u'}', si+1)
 168                 if ei == -1:
 169                     ei = len(misc) + 1
 170                 sub = misc[si:ei]
 171                 ei += 1  # Move ei past }
 172
 173                 self.meanings.append(sub)
 174             else:
 175                 # Parse info field
 176                 ei = misc.find(u' ', si+1)
 177                 if ei == -1:
 178                     ei = len(misc) + 1
 179                 sub = misc[si:ei]
 180
 181                 self._parse_info(state, sub)
 182
 183             si = ei + 1
 184
 185     def _parse_japanese(self, state, data):
 186         if not state.t_class:
 187             # Check hiragana/katakana
 188             for c in data:
 189                 if is_hiragana(c):
 190                     self.kunyomi.append(data)
 191                     break
 192                 elif is_katakana(c):
 193                     self.onyomi.append(data)
 194                     break
 195         elif state.t_class == 1:
 196             self.nanori.append(data)
 197         elif state.t_class == 2:
 198             self.radname = data
 199
 200     def _parse_info(self, state, data):
 201         onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
 202         strval_dicts = set(('DB',))
 203         intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
 204                             'DK', 'DO', 'DS', 'DT', 'DM'))
 205         try:
 206             c = data[0]
 207             if c == 'U':
 208                 # Unicode value - we alread store the literal as unicode, so let's
 209                 # use this as our encoding sanity check!
 210                 assert ord(self.literal) == int(data[1:], 16), \
 211                     "Encoding error detected"
 212             elif c == 'B':
 213                 self.radical = int(data[1:])
 214             elif c == 'C':
 215                 self.radical_c = int(data[1:])
 216             elif c == 'F':
 217                 self.freq = int(data[1:])
 218             elif c == 'G':
 219                 self.grade = int(data[1:])
 220             elif c == 'J':
 221                 self.jlpt = int(data[1:])
 222             elif c == 'S':
 223                 i = int(data[1:])
 224                 if not self.strokes:
 225                     self.strokes = i
 226                 else:
 227                     self.strokes_miss.append(i)
 228             elif c == 'W':
 229                 self.korean.append(data[1:])
 230             elif c == 'Y':
 231                 self.pinyin.append(data[1:])
 232             elif c == 'X':
 233                 self.xref.append(data[1:])
 234             elif c == 'Z':
 235                 self.misclass.append(data[1:])
 236             elif c == 'T':
 237                 state.t_class = int(data[1:])
 238             # Below this point is dictionary/query codes.
 239             elif c in onechar_dicts:
 240                 self.dcodes[c] = data[1:]
 241             elif c == 'P':
 242                 # SKIP codes.
 243                 # Thanks to changes in permissible SKIP code usage (change to
 244                 # Creative Commons licensing in January 2008), we can now use
 245                 # this without problems.  Jack Halpern, thank you!
 246                 if self.qcodes.get('skip'):
 247                     print "ALERT!  ALERT!  self.skip already set!"
 248                     exit(1)
 249                 self.qcodes['skip'] = data[1:];
 250             elif c == 'Q':
 251                 # Four Corner code
 252                 self.qcodes['four_corner'] = data[1:]
 253             elif c == 'I':  # Spahn/Hadamitzky dictionaries
 254                 if data[1] =='N':
 255                     # IN = Kanji & Kana (Spahn, Hadamitzky)
 256                     self.dcodes[data[:2]] = data[2:]
 257                 else:
 258                     # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
 259                     self.qcodes['sh_desc'] = data[1:]
 260             elif c == 'M':
 261                 # Morohashi Daikanwajiten
 262                 self.dcodes[data[:2]] = data[2:]
 263             elif c == 'D':
 264                 key = data[:2]
 265                 if key in intval_dicts:
 266                     self.dcodes[key] = int(data[2:])
 267                 elif key in strval_dicts:
 268                     self.dcodes[key] = data[2:]
 269                 elif key == 'DR':
 270                     # Query Code: 2001 Kanji (De Roo)
 271                     self.qcodes['deroo'] = int(data[2:])
 272                 else:
 273                     self.unparsed.append(data)
 274             else:
 275                 self.unparsed.append(data)
 276         except:
 277             self.unparsed.append(data)
 278
 279     def to_string(self, **kwargs):
 280         """A default "to-string" dump of a KanjidicEntry."""
 281         lines = []
 282         lines.append(_(u"Literal: %s") % self.literal)
 283         if self.onyomi:
 284             lines.append(_(u"Onyomi: %s")
 285                          % u"、".join(
 286                              [jstring_convert(us) for us in self.onyomi]))
 287         if self.kunyomi:
 288             lines.append(_(u"Kunyomi: %s")
 289                          % u"、".join(
 290                              [jstring_convert(us) for us in self.kunyomi]))
 291         if self.nanori:
 292             lines.append(_(u"Nanori: %s")
 293                          % u"、".join(
 294                              [jstring_convert(us) for us in self.nanori]))
 295         if self.meanings:
 296             lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))
 297
 298         if self.strokes:
 299             lines.append(_(u"Stroke count: %d") % self.strokes)
 300         if self.strokes_miss:
 301             lines.append(_(u"Common miscounts: %s")
 302                          % _(u", ").join(self.strokes_miss))
 303         if self.freq:
 304             lines.append(_(u"Newspaper Frequency: %d") % self.freq)
 305         if self.grade:
 306             if self.grade in range(1, 7):
 307                 grade_str = unicode(self.grade)
 308             elif self.grade == 8:
 309                 grade_str = _(u"General usage")
 310             elif self.grade == 9:
 311                 grade_str = _(u"Jinmeiyou (Characters for names)")
 312             elif self.grade == None:
 313                 grade_str = _(u"Unspecified")
 314             else:
 315                 grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
 316             lines.append(_(u"Jouyou Grade: %s") % grade_str)
 317         if self.jlpt:
 318             lines.append(_(u"JLPT Level: %d") % self.jlpt)
 319
 320         # Query codes
 321         if self.qcodes:
 322             for k, v in self.qcodes.iteritems():
 323                 desc = qcode_to_desc(k)
 324                 lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
 325
 326                 if k == 'skip' and self.misclass:
 327                     miscodes = []
 328                     for code in self.misclass:
 329                         code_type = code[:2]
 330                         code_val = code[2:]
 331                         if code_type == u'SP':   # "stroke_count"
 332                             miscodes.append(_(u"%s (stroke count)") % code_val)
 333                         elif code_type == u'PP': # "posn"
 334                             miscodes.append(_(u"%s (position)") % code_val)
 335                         elif code_type == u'BP': # "stroke_and_posn"
 336                             miscodes.append(_(u"%s (stroke and position)") % code_val)
 337                         elif code_type == u'RP': # "stroke_diff"
 338                             miscodes.append(_(u"%s (debatable count)") % code_val)
 339                         else:
 340                             lines.append(_(u"Unrecognized misclassification code: %s")
 341                                          % unicode(code))
 342                     if miscodes:
 343                         lines.append(_(u"SKIP miscodes: %s")
 344                                      % _(u", ").join(miscodes))
 345
 346         if self.dcodes:
 347             # Probably we should sort these in some way... but for
 348             # now, just display.
 349             for k, v in self.dcodes.iteritems():
 350                 if k == "MP": continue
 351                 k = kanjidic2_key_to_str(
 352                     kanjidic_key_to_kanjidic2(k))
 353                 if k == "MN":
 354                     lines.append(_(u"%s: %s") % (k, v))
 355                 else:
 356                     vp = self.dcodes.get("MP")
 357                     if vp:
 358                         vol, page = vp.split('.', 1)
 359                         lines.append(_(u"%s: Index %s, Volume %s, Page %s")
 360                                      % (k, v, vol, page))
 361                     else:
 362                         lines.append(_(u"%s: %s") % (k, v))
 363
 364         if self.radname:
 365             lines.append(_(u"Radical name: %s") % self.radname)
 366         if self.radical:
 367             lines.append(_(u"Nelson Radical: %d") % self.radical)
 368         if self.radical_c:
 369             lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)
 370
 371         if self.korean:
 372             lines.append(_(u"Korean romanization: %s")
 373                          % _(u", ").join(self.korean))
 374         if self.pinyin:
 375             lines.append(_(u"Pinyin romanization: %s")
 376                          % _(u", ").join(self.pinyin))
 377
 378         # "self.unicode" is always present. ;)
 379         lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
 380         if self.jis:
 381             kuten = jis_hex_to_kuten(self.jis)
 382             jis_set = u"208"  # For now, hard-code it.
 383             lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
 384                          % (jis_set, kuten, self.jis))
 385
 386         if self.xref:
 387             for ref in self.xref:
 388                 if ref[0] == 'J':
 389                     # JIS crossrefs
 390                     jis_id = ref[1]
 391                     hexcode = int(ref[2:], 16)
 392                     kuten = jis_hex_to_kuten(hexcode)
 393                     if jis_id == '0':
 394                         lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
 395                                        u"Hex = 0x%04X") % (kuten, hexcode))
 396                     elif jis_id == '1':
 397                         lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
 398                                        u"Hex = 0x%04X") % (kuten, hexcode))
 399                     else:
 400                         s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
 401                               u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
 402                         lines.append(s)
 403                         # Not really "unparsed", but it is unhandled...
 404                         unparsed.append(s)
 405                     pass
 406                 else:
 407                     m = alpha_regex.match(ref)
 408                     k = kanjidic2_key_to_str(
 409                         kanjidic_key_to_kanjidic2(m.group(1)))
 410
 411                     v = ref[m.span()[1]:]
 412                     lines.append(_(u"Crossref: %s: %s")
 413                                  % (k, m.group(2)))
 414
 415         if self.unparsed:
 416             lines.append(_(u"Unrecognized codes: %s")
 417                          % (u", ").join(self.unparsed))
 418             pass
 419
 420         return u"\n".join(lines)
 421
 422     def __unicode__(self):
 423         """Dummy string dumper"""
 424         strs = [self.literal]
 425         for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
 426             strs.extend(l)
 427         if self.radname:
 428             strs.insert(3, self.radname)
 429
 430         return _(u", ").join(strs)
 431
 432 class ParserState(object):
 433     def __init__(self):
 434         self.t_class = 0
 435
 436 class KanjidicParser(object):
 437
 438     def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
 439         self.filename = filename
 440         self.encoding = encoding
 441         self.use_cache = use_cache
 442         self.cache = {}
 443
 444     def search(self, query):
 445         """Returns a list of kanji entries matching kanji in the query.
 446
 447         Note: Previous versions implemented this as a generator.
 448         While I liked that solution, it did not maintain the order of
 449         kanji in the query.  Since the KANJIDIC2 parser does this,
 450         I've done it here as well for consistency.
 451
 452         """
 453         results = []
 454
 455         data = None
 456         if self.use_cache: data = self.cache
 457
 458         if not data:
 459             if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
 460                 f = gzip.open(self.filename)
 461             else:
 462                 f = open(self.filename, "rb")
 463             fdata = f.read()
 464             f.close()
 465             fdata = fdata.decode(self.encoding)
 466             lines = fdata.splitlines()
 467             lines = [line for line in lines if line and (line[0] != u"#")]
 468
 469             data = {}
 470             for line in lines:
 471                 entry = KanjidicEntry(line)
 472                 if self.use_cache:
 473                     self.cache[entry.literal] = entry
 474                 if entry.literal in query: data[entry.literal] = entry
 475
 476         for char in query:
 477             kanji = data.get(char)
 478             if kanji: results.append(kanji)
 479
 480         return results
 481
 482 if __name__ == "__main__":
 483     import sys, os
 484
 485     if len(sys.argv) < 2:
 486         print _(u"Please specify a dictionary file.")
 487         exit(-1)
 488     try:
 489         kp = KanjidicParser(sys.argv[1])
 490     except Exception, e:
 491         print _(u"Could not create KanjidicParser: %s") % unicode(e)
 492         exit(-1)
 493
 494     if len(sys.argv) < 3:
 495         print _(u"Please specify a kanji.  "
 496                 u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
 497         exit(-1)
 498
 499     if os.name == "nt":
 500         charset = "cp932"
 501     else:
 502         charset = "utf-8"
 503
 504     for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
 505         print _(u"Entry %d:\n%s\n") % (i+1, entry.to_string())