Fix for KanjidicEntry.to_string: repaired bad Morohashi handler.
[jben2_gui.git] / jbparse / jbparse / kanjidic.py
blobd2a2fff9f48386fd4f820c8e03367e3e4daa723b
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC.
33 This parser is dependent on a small amount of code kept in the
34 kanjidic2 parser, so be sure to grab both if you are using these
35 modules in your own programs.
37 """
39 from __future__ import absolute_import
41 import os, re, gzip, gettext
42 gettext.install('pyjben', unicode=True)
44 from .kanjidic_common \
45 import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
# Splits a field into a leading run of non-digit characters (group 1)
# and the remainder (group 2); used in to_string to separate a
# dictionary-key prefix from its index value, e.g. u"DR1794" -> "DR", "1794".
alpha_regex = re.compile(u"(^[^0-9]+)(.*)")
50 # Copied from J-Ben 1.x and modified using Gnome Character Map's
51 # "Unicode Block" information.
52 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if character *uc* lies in the Hiragana block.

    Block range per Unicode: 3040..309F.
    """
    return 0x3040 <= ord(uc) <= 0x309F
def is_katakana(uc):
    """Return True if character *uc* lies in the main Katakana block.

    Block range per Unicode: 30A0..30FF.  The Katakana Phonetic
    Extensions block (31F0..31FF) is deliberately not included, as it
    is not currently used in J-Ben.
    """
    return 0x30A0 <= ord(uc) <= 0x30FF
def is_furigana(uc):
    """Return True if character *uc* is kana (hiragana or katakana)."""
    return is_katakana(uc) or is_hiragana(uc)
def jis_hex_to_kuten(hex_code):
    """Convert a JIS code point (int) to a KANJIDIC2-style kuten string.

    The high byte minus 0x20 gives the "ku" (row), the low byte minus
    0x20 the "ten" (cell); the result is formatted as u"ku-ten".
    """
    ku = ((hex_code >> 8) & 0xFF) - 0x20
    ten = (hex_code & 0xFF) - 0x20
    return u"%s-%s" % (ku, ten)
def kanjidic_key_to_kanjidic2(dkey):
    """Converts KANJIDIC dictionary keys to KANJIDIC2.

    If unable to find a KANJIDIC2 key, returns the original key.

    """
    # Mapping of KANJIDIC one/two-letter dictionary codes to the
    # corresponding KANJIDIC2 dr_type attribute values.
    d = {
        "H": "halpern_njecd",
        "N": "nelson_c",
        "V": "nelson_n",
        "IN": "sh_kk",
        "MN": "moro",
        "E": "henshall",
        "K": "gakken",
        "L": "heisig",
        "O": "oneill_names",
        "DB": "busy_people",
        "DC": "crowley",
        "DF": "jf_cards",
        "DG": "kodansha_compact",
        "DH": "henshall3",
        "DJ": "kanji_in_context",
        "DK": "halpern_kkld",
        "DO": "oneill_kk",
        "DS": "sakade",
        "DT": "tutt_cards",
        "DM": "maniette",
        }
    return d.get(dkey, dkey)
104 class KanjidicEntry(object):
106 def __init__(self, raw_entry):
107 # Key info
108 self.literal = None
109 self.meanings = []
110 self.kunyomi = []
111 self.onyomi = []
112 self.nanori = []
113 # Secondary info
114 self.strokes = None
115 self.strokes_miss = []
116 self.freq = None
117 self.grade = None
118 self.jlpt = None
119 # Info of low importance for most target users
120 self.jis = None
121 self.radical = None
122 self.radical_c = None # "Classic" KangXi Zidian radical
123 self.radname = None
124 self.pinyin = []
125 self.korean = []
126 # "Query codes": Pattern-based lookup
127 # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
128 self.qcodes = {}
129 # Dictionary codes
130 self.dcodes = {}
131 # Dictionary-related metadata
132 self.xref = []
133 self.misclass = []
134 self.unparsed = []
136 self.parse_entry(raw_entry)
138 def parse_entry(self, raw_entry):
139 if not raw_entry:
140 return None
142 state = ParserState() # Holds "t class"
144 # First 2 fields are always the same
145 pieces = raw_entry.split(None, 2)
146 misc = pieces.pop()
147 self.jis = int(pieces.pop(), 16)
148 self.literal = pieces.pop()
150 # Parse the remainder
151 si = ei = 0
152 while si < len(misc):
153 c = misc[si]
154 i = ord(c)
155 if c == u' ':
156 si += 1
157 continue
158 if i > 0xFF or c in (u'-', u'.'):
159 # Parse Japanese
160 ei = misc.find(u' ', si+1)
161 if ei == -1:
162 ei = len(misc) + 1
163 sub = misc[si:ei]
165 self._parse_japanese(state, sub)
166 elif c == u'{':
167 # Parse Translation
168 si += 1 # Move si inside of {
169 ei = misc.find(u'}', si+1)
170 if ei == -1:
171 ei = len(misc) + 1
172 sub = misc[si:ei]
173 ei += 1 # Move ei past }
175 self.meanings.append(sub)
176 else:
177 # Parse info field
178 ei = misc.find(u' ', si+1)
179 if ei == -1:
180 ei = len(misc) + 1
181 sub = misc[si:ei]
183 self._parse_info(state, sub)
185 si = ei + 1
187 def _parse_japanese(self, state, data):
188 if not state.t_class:
189 # Check hiragana/katakana
190 for c in data:
191 if is_hiragana(c):
192 self.kunyomi.append(data)
193 break
194 elif is_katakana(c):
195 self.onyomi.append(data)
196 break
197 elif state.t_class == 1:
198 self.nanori.append(data)
199 elif state.t_class == 2:
200 self.radname = data
202 def _parse_info(self, state, data):
203 onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
204 strval_dicts = set(('DB',))
205 intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
206 'DK', 'DO', 'DS', 'DT', 'DM'))
207 try:
208 c = data[0]
209 if c == 'U':
210 # Unicode value - we alread store the literal as unicode, so let's
211 # use this as our encoding sanity check!
212 assert ord(self.literal) == int(data[1:], 16), \
213 "Encoding error detected"
214 elif c == 'B':
215 self.radical = int(data[1:])
216 elif c == 'C':
217 self.radical_c = int(data[1:])
218 elif c == 'F':
219 self.freq = int(data[1:])
220 elif c == 'G':
221 self.grade = int(data[1:])
222 elif c == 'J':
223 self.jlpt = int(data[1:])
224 elif c == 'S':
225 i = int(data[1:])
226 if not self.strokes:
227 self.strokes = i
228 else:
229 self.strokes_miss.append(i)
230 elif c == 'W':
231 self.korean.append(data[1:])
232 elif c == 'Y':
233 self.pinyin.append(data[1:])
234 elif c == 'X':
235 self.xref.append(data[1:])
236 elif c == 'Z':
237 self.misclass.append(data[1:])
238 elif c == 'T':
239 state.t_class = int(data[1:])
240 # Below this point is dictionary/query codes.
241 elif c in onechar_dicts:
242 self.dcodes[c] = data[1:]
243 elif c == 'P':
244 # SKIP codes.
245 # Thanks to changes in permissible SKIP code usage (change to
246 # Creative Commons licensing in January 2008), we can now use
247 # this without problems. Jack Halpern, thank you!
248 if self.qcodes.get('skip'):
249 print "ALERT! ALERT! self.skip already set!"
250 exit(1)
251 self.qcodes['skip'] = data[1:];
252 elif c == 'Q':
253 # Four Corner code
254 self.qcodes['four_corner'] = data[1:]
255 elif c == 'I': # Spahn/Hadamitzky dictionaries
256 if data[1] =='N':
257 # IN = Kanji & Kana (Spahn, Hadamitzky)
258 self.dcodes[data[:2]] = data[2:]
259 else:
260 # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
261 self.qcodes['sh_desc'] = data[1:]
262 elif c == 'M':
263 # Morohashi Daikanwajiten
264 self.dcodes[data[:2]] = data[2:]
265 elif c == 'D':
266 key = data[:2]
267 if key in intval_dicts:
268 self.dcodes[key] = int(data[2:])
269 elif key in strval_dicts:
270 self.dcodes[key] = data[2:]
271 elif key == 'DR':
272 # Query Code: 2001 Kanji (De Roo)
273 self.qcodes['deroo'] = int(data[2:])
274 else:
275 self.unparsed.append(data)
276 else:
277 self.unparsed.append(data)
278 except:
279 self.unparsed.append(data)
281 def to_string(self, **kwargs):
282 """A default "to-string" dump of a KanjidicEntry."""
283 lines = []
284 lines.append(_(u"Literal: %s") % self.literal)
285 if self.onyomi:
286 lines.append(_(u"Onyomi: %s")
287 % u"、".join(
288 [jstring_convert(us) for us in self.onyomi]))
289 if self.kunyomi:
290 lines.append(_(u"Kunyomi: %s")
291 % u"、".join(
292 [jstring_convert(us) for us in self.kunyomi]))
293 if self.nanori:
294 lines.append(_(u"Nanori: %s")
295 % u"、".join(
296 [jstring_convert(us) for us in self.nanori]))
297 if self.meanings:
298 lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))
300 if self.strokes:
301 lines.append(_(u"Stroke count: %d") % self.strokes)
302 if self.strokes_miss:
303 lines.append(_(u"Common miscounts: %s")
304 % _(u", ").join(self.strokes_miss))
305 if self.freq:
306 lines.append(_(u"Newspaper Frequency: %d") % self.freq)
307 if self.grade:
308 if self.grade in range(1, 7):
309 grade_str = unicode(self.grade)
310 elif self.grade == 8:
311 grade_str = _(u"General usage")
312 elif self.grade == 9:
313 grade_str = _(u"Jinmeiyou (Characters for names)")
314 elif self.grade == None:
315 grade_str = _(u"Unspecified")
316 else:
317 grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
318 lines.append(_(u"Jouyou Grade: %s") % grade_str)
319 if self.jlpt:
320 lines.append(_(u"JLPT Level: %d") % self.jlpt)
322 # Query codes
323 if self.qcodes:
324 for k, v in self.qcodes.iteritems():
325 desc = qcode_to_desc(k)
326 lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
328 if k == 'skip' and self.misclass:
329 miscodes = []
330 for code in self.misclass:
331 code_type = code[:2]
332 code_val = code[2:]
333 if code_type == u'SP': # "stroke_count"
334 miscodes.append(_(u"%s (stroke count)") % code_val)
335 elif code_type == u'PP': # "posn"
336 miscodes.append(_(u"%s (position)") % code_val)
337 elif code_type == u'BP': # "stroke_and_posn"
338 miscodes.append(_(u"%s (stroke and position)") % code_val)
339 elif code_type == u'RP': # "stroke_diff"
340 miscodes.append(_(u"%s (debatable count)") % code_val)
341 else:
342 lines.append(_(u"Unrecognized misclassification code: %s")
343 % unicode(code))
344 if miscodes:
345 lines.append(_(u"SKIP miscodes: %s")
346 % _(u", ").join(miscodes))
348 if self.dcodes:
349 # Probably we should sort these in some way... but for
350 # now, just display.
351 for k, v in self.dcodes.iteritems():
352 if k == "MP": continue
353 dictname = kanjidic2_key_to_str(
354 kanjidic_key_to_kanjidic2(k))
355 if k == "MN":
356 vp = self.dcodes.get("MP")
357 if vp:
358 vol, page = vp.split('.', 1)
359 lines.append(_(u"%s: Index %s, Volume %s, Page %s")
360 % (dictname, v, vol, page))
361 else:
362 lines.append(_(u"%s: %s") % (dictname, v))
363 else:
364 lines.append(_(u"%s: %s") % (dictname, v))
366 if self.radname:
367 lines.append(_(u"Radical name: %s") % self.radname)
368 if self.radical:
369 lines.append(_(u"Nelson Radical: %d") % self.radical)
370 if self.radical_c:
371 lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)
373 if self.korean:
374 lines.append(_(u"Korean romanization: %s")
375 % _(u", ").join(self.korean))
376 if self.pinyin:
377 lines.append(_(u"Pinyin romanization: %s")
378 % _(u", ").join(self.pinyin))
380 # "self.unicode" is always present. ;)
381 lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
382 if self.jis:
383 kuten = jis_hex_to_kuten(self.jis)
384 jis_set = u"208" # For now, hard-code it.
385 lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
386 % (jis_set, kuten, self.jis))
388 if self.xref:
389 for ref in self.xref:
390 if ref[0] == 'J':
391 # JIS crossrefs
392 jis_id = ref[1]
393 hexcode = int(ref[2:], 16)
394 kuten = jis_hex_to_kuten(hexcode)
395 if jis_id == '0':
396 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
397 u"Hex = 0x%04X") % (kuten, hexcode))
398 elif jis_id == '1':
399 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
400 u"Hex = 0x%04X") % (kuten, hexcode))
401 else:
402 s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
403 u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
404 lines.append(s)
405 # Not really "unparsed", but it is unhandled...
406 unparsed.append(s)
407 pass
408 else:
409 m = alpha_regex.match(ref)
410 k = kanjidic2_key_to_str(
411 kanjidic_key_to_kanjidic2(m.group(1)))
413 v = ref[m.span()[1]:]
414 lines.append(_(u"Crossref: %s: %s")
415 % (k, m.group(2)))
417 if self.unparsed:
418 lines.append(_(u"Unrecognized codes: %s")
419 % (u", ").join(self.unparsed))
420 pass
422 return u"\n".join(lines)
424 def __unicode__(self):
425 """Dummy string dumper"""
426 strs = [self.literal]
427 for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
428 strs.extend(l)
429 if self.radname:
430 strs.insert(3, self.radname)
432 return _(u", ").join(strs)
class ParserState(object):

    """Mutable state shared between the per-field parse handlers."""

    def __init__(self):
        # "T class" of subsequent Japanese fields, set by "T" info
        # fields: 0 = normal readings, 1 = nanori, 2 = radical name.
        self.t_class = 0
class Parser(object):

    """Parser for the original (pre-XML) KANJIDIC dictionary file."""

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        """Creates a parser bound to a KANJIDIC file.

        filename: path to the dictionary (optionally gzipped, by ".gz"
        extension).  use_cache: keep all parsed entries in memory after
        the first search.  encoding: file encoding, EUC-JP by default.

        Raises an Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}

    def search(self, query):
        """Returns a list of kanji entries matching kanji in the query.

        Note: Previous versions implemented this as a generator.
        While I liked that solution, it did not maintain the order of
        kanji in the query.  Since the KANJIDIC2 parser does this,
        I've done it here as well for consistency.

        """
        results = []

        data = None
        if self.use_cache: data = self.cache

        if not data:
            # Read and decode the whole file, dropping comment lines.
            if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            lines = [line for line in lines if line and (line[0] != u"#")]

            data = {}
            for line in lines:
                entry = KanjidicEntry(line)
                if self.use_cache:
                    self.cache[entry.literal] = entry
                if entry.literal in query: data[entry.literal] = entry

        # Build results in query order, skipping characters not found.
        for char in query:
            kanji = data.get(char)
            if kanji: results.append(kanji)

        return results
if __name__ == "__main__":
    # Command-line driver: argv[1] = dictionary file, argv[2] = kanji
    # string to look up (copy/pasted in the console's encoding).
    import sys

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        exit(-1)
    try:
        kp = Parser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create KanjidicParser: %s") % unicode(e)
        exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
        exit(-1)

    # Console arguments arrive as bytes (Python 2); decode using the
    # platform's likely console code page.
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    # Dump each matching entry in query order.
    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d:\n%s\n") % (i+1, entry.to_string())