Forgot the -a on git commit.
[jben2_gui.git] / parsers / kanjidic2.py
blob 5017635d56e8fda07272073c6f931dfec810f1ff
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""A parser for KANJIDIC2.

This module is incomplete and currently just holds helper code for the
KANJIDIC parser.

"""

import gzip, xml.sax, gettext
gettext.install('pyjben', unicode=True)

from parsers.kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
class Kanjidic2Entry(object):

    def __init__(self):
        # Key info
        self.literal = None
        self.jis = None
        self.meanings = {}
        self.ja_kun = []
        self.ja_on = []
        self.nanori = []

        # Secondary info
        self.strokes = None
        self.strokes_miss = []
        self.freq = None
        self.grade = None
        self.jlpt = None

        # Info of low importance for most target users
        self.radical = None
        self.radical_c = None  # "Classic" KangXi Zidian radical
        self.radname = None
        self.pinyin = []
        self.korean_h = []
        self.korean_r = []

        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
        self.qcodes = {}
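        # Illustrative only: after parsing, qcodes is keyed by the KANJIDIC2
        # qc_type attribute, e.g. {u'skip': u'2-3-3', u'four_corner': u'5011.0'}
        # (hypothetical values).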
        # Dictionary codes
        # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn
        # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT
        self.dcodes = {}
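        # Illustrative only: dcodes is keyed by the KANJIDIC2 dr_type
        # attribute, e.g. {u'nelson_c': 1234, u'halpern_njecd': 56}
        # (hypothetical values).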
        # Dictionary-related metadata
        self.xref = []
        self.misclass = []

        self.unparsed = []
    def to_string(self, **kwargs):
        """A default "to-string" dump of a Kanjidic2Entry."""
        lines = []
        lines.append(_(u"Literal: %s") % self.literal)
        if self.ja_on:
            lines.append(_(u"Onyomi: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.ja_on]))
        if self.ja_kun:
            lines.append(_(u"Kunyomi: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.ja_kun]))
        if self.nanori:
            lines.append(_(u"Nanori: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.nanori]))
        if self.meanings:
            for k, v in self.meanings.iteritems():
                lines.append(_(u"Meaning (%s): %s") % (k, _(u"; ").join(v)))

        if self.strokes:
            lines.append(_(u"Stroke count: %d") % self.strokes)
        if self.strokes_miss:
            lines.append(_(u"Common miscounts: %s")
                         % _(u", ").join(self.strokes_miss))
        if self.freq:
            lines.append(_(u"Newspaper Frequency: %d") % self.freq)
        if self.grade:
            if self.grade in range(1, 7):
                grade_str = unicode(self.grade)
            elif self.grade == 8:
                grade_str = _(u"General usage")
            elif self.grade == 9:
                grade_str = _(u"Jinmeiyou (Characters for names)")
            elif self.grade is None:
                grade_str = _(u"Unspecified")
            else:
                grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
            lines.append(_(u"Jouyou Grade: %s") % grade_str)
        if self.jlpt:
            lines.append(_(u"JLPT Level: %d") % self.jlpt)
        # Query codes
        if self.qcodes:
            for k, v in self.qcodes.iteritems():
                desc = qcode_to_desc(k)
                lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))

                if k == 'skip' and self.misclass:
                    miscodes = []
                    for code in self.misclass:
                        code_type = code[:2]
                        code_val = code[2:]
                        if code_type == u'SP':    # "stroke_count"
                            miscodes.append(_(u"%s (stroke count)") % code_val)
                        elif code_type == u'PP':  # "posn"
                            miscodes.append(_(u"%s (position)") % code_val)
                        elif code_type == u'BP':  # "stroke_and_posn"
                            miscodes.append(_(u"%s (stroke and position)") % code_val)
                        elif code_type == u'RP':  # "stroke_diff"
                            miscodes.append(_(u"%s (debatable count)") % code_val)
                        else:
                            lines.append(_(u"Unrecognized misclassification code: %s")
                                         % unicode(code))
                    if miscodes:
                        lines.append(_(u"SKIP miscodes: %s")
                                     % _(u", ").join(miscodes))
        if self.dcodes:
            # Probably we should sort these in some way... but for
            # now, just display.
            for k, v in self.dcodes.iteritems():
                k = kanjidic2_key_to_str(k)
                lines.append(_(u"%s: %s") % (k, v))

        if self.radname:
            lines.append(_(u"Radical name: %s") % self.radname)
        if self.radical:
            lines.append(_(u"Nelson Radical: %d") % self.radical)
        if self.radical_c:
            lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)

        if self.korean_h:
            lines.append(_(u"Korean: %s")
                         % _(u", ").join(self.korean_h))
        if self.korean_r:
            lines.append(_(u"Korean romanization: %s")
                         % _(u", ").join(self.korean_r))
        if self.pinyin:
            lines.append(_(u"Pinyin romanization: %s")
                         % _(u", ").join(self.pinyin))
        # The Unicode code point is always available: it is just the literal.
        lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
        if self.jis:
            def jis_hex_to_kuten(hex_code):
                """Convert a JIS hex code to a KANJIDIC2-style kuten string."""
                return u"%s-%s" % (
                    (((hex_code >> 8) & 0xFF) - 0x20),
                    ((hex_code & 0xFF) - 0x20))
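            # Worked example (illustrative): for 0x3021 (JIS X 0208 code of 亜),
            # (0x30 - 0x20) = 16 and (0x21 - 0x20) = 1, so the kuten string is
            # u"16-1".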
            kuten = jis_hex_to_kuten(self.jis)
            lines.append(_(u"JIS code: Kuten = %s, Hex = 0x%04X")
                         % (kuten, self.jis))
        #self.xref = []
        if self.xref:
            # From KANJIDIC documentation:
            #
            #   Xxxxxxx -- a cross-reference code. An entry of, say,
            #   XN1234 will mean that the user is referred to the kanji
            #   with the (unique) Nelson index of 1234. XJ0xxxx and
            #   XJ1xxxx are cross-references to the kanji with the JIS
            #   hexadecimal code of xxxx. The `0' means the reference is
            #   to a JIS X 0208 kanji, and the `1' references a JIS X
            #   0212 kanji.
            #
            # For now, just dump to the console.
            lines.append(_(u"Crossref codes: %s") % ", ".join(self.xref))

        # From J-Ben 1:
        #/* Crossref codes */
        #if(!k.var_j208.empty())
        #    result << "<li>JIS-208: " << k.var_j208 << "</li>";
        #if(!k.var_j212.empty())
        #    result << "<li>JIS-212: " << k.var_j212 << "</li>";
        #if(!k.var_j213.empty())
        #    result << "<li>JIS-213: " << k.var_j213 << "</li>";
        #if(!k.var_ucs.empty())
        #    result << "<li>Unicode: " << k.var_ucs << "</li>";
        #if(!k.var_deroo.empty())
        #    result << "<li>De Roo code: " << k.var_deroo << "</li>";
        #if(!k.var_nelson_c.empty())
        #    result << "<li>Modern Reader's Japanese-English Character "
        #              "Dictionary (Nelson): " << k.var_nelson_c << "</li>";
        #if(!k.var_njecd.empty())
        #    result << "<li>New Japanese-English Character Dictionary "
        #              "(Halpern): " << k.var_njecd << "</li>";
        #if(!k.var_oneill.empty())
        #    result << "<li>Japanese Names (O'Neill): " << k.var_oneill
        #           << "</li>";
        #if(!k.var_s_h.empty())
        #    result << "<li>Spahn/Hadamitzky Kanji Dictionary code: "
        #           << k.var_s_h << "</li>";

        if self.unparsed:
            lines.append(_(u"Unrecognized codes: %s")
                         % (u", ").join(self.unparsed))

        return u"\n".join(lines)
class KD2SAXHandler(xml.sax.handler.ContentHandler):

    """SAX handler for KANJIDIC2."""

    def __init__(self, *args, **kwargs):
        #self.limit = 1
        xml.sax.handler.ContentHandler.__init__(self, *args, **kwargs)
        self.parsing = False
        self.kanji = None
        self.path = []
        self.full_keys = set()
        self.data = {}
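        # self.data maps each parsed literal (the kanji character itself) to
        # its Kanjidic2Entry; entries are added as they close in endElement.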
    def get_path(self):
        return u"/".join([i[0] for i in self.path])

    def get_attr_str(self):
        return u", ".join([u"%s: %s" % (k, v)
                           for k, v in self.path[-1][1].items()])
    def startElement(self, name, attrs):
        if name == "character":
            self.parsing = True
            #print "startElement called:", name, attrs
            #print "Beginning of character entry found"
            self.kanji = Kanjidic2Entry()
        elif self.parsing:
            self.path.append((name, attrs))
            #print u"Current path: %s, attributes: %s" % \
            #    (self.get_path(), str(attrs.items()))
    def endElement(self, name):
        if self.parsing:
            if self.path:
                if name != self.path[-1][0]:
                    # Shouldn't ever happen, but mistakes *can* slip in...
                    print u"Mismatch detected, path is %s, element name is %s" \
                          % (self.get_path(), name)
                else:
                    self.path.pop()
            if name == "character":
                #print "endElement called:", name
                #print "End of character entry reached"
                self.data[self.kanji.literal] = self.kanji
                self.kanji = None
                self.parsing = False
                #self.limit -= 1
                #if self.limit <= 0: exit(0)
    def characters(self, content):
        content = content.strip()
        if content and self.parsing:
            # Sanity check: see if the current node type is already
            # included under a different full path.
            #path = self.get_path()
            #self.full_keys.add(path)
            #keys = [k for k in self.full_keys if k[-(len(node)):] == node]
            #if len(keys) != 1:
            #    print "CHECKME: Node: %s, Keys: %s" % (node, str(keys))

            node, attrs = self.path[-1]

            # I am exploiting the fact that any given element type can
            # only belong to one type of parent. For example,
            # "reading" objects are always fully pathed-out to
            # reading_meaning/rmgroup/reading.
            #
            # In case this changes in the future, I've attached
            # comments of the full paths below.

            if node == u"literal":        # literal
                self.kanji.literal = content
            elif node == u"reading":      # reading_meaning/rmgroup/reading
                # These will do stuff in the future...
                #on_type = attrs.get(u"on_type")
                #r_status = attrs.get(u"r_status")
                # Store reading
                getattr(self.kanji, attrs[u'r_type']).append(content)
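                # Note: r_type values (e.g. u'ja_on', u'ja_kun') are expected
                # to match attribute names on Kanjidic2Entry; an unexpected
                # r_type would raise AttributeError here.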
            elif node == u"meaning":      # reading_meaning/rmgroup/meaning
                m_lang = attrs.get(u'm_lang', u'en')
                self.kanji.meanings.setdefault(m_lang, []).append(content)
            elif node == u"nanori":       # reading_meaning/nanori
                self.kanji.nanori.append(content)
            elif node == u"grade":        # misc/grade
                self.kanji.grade = int(content)
            elif node == u"freq":         # misc/freq
                self.kanji.freq = int(content)
            elif node == u"jlpt":         # misc/jlpt
                self.kanji.jlpt = int(content)
            elif node == u"stroke_count": # misc/stroke_count
                if not self.kanji.strokes:
                    self.kanji.strokes = int(content)
                else:
                    self.kanji.strokes_miss.append(int(content))
            elif node == u"q_code":       # query_code/q_code
                qc_type = attrs[u'qc_type']
                if qc_type == 'skip':
                    misclass = attrs.get(u'skip_misclass')
                    if misclass:
                        # HANDLE LATER, TODO
                        pass
                    else:
                        self.kanji.qcodes[qc_type] = content
                else:
                    self.kanji.qcodes[qc_type] = content
            elif node == u"dic_ref":      # dic_number/dic_ref
                attr = attrs[u'dr_type']
                if attr == u'moro':
                    m_vol = attrs.get(u'm_vol')
                    m_page = attrs.get(u'm_page')
                    # Do something with this... TODO
                else:
                    try:
                        self.kanji.dcodes[attr] = int(content)
                    except ValueError:
                        self.kanji.dcodes[attr] = content
            elif node == u"cp_value":     # codepoint/cp_value
                pass
            elif node == u"rad_value":    # radical/rad_value
                pass
            elif node == u"variant":      # misc/variant
                pass
            elif node == u"rad_name":     # misc/rad_name
                pass
            else:
                try:
                    path = self.get_path()
                    print u"Characters found: path=%s, attrs=(%s), content: %s" \
                          % (path,
                             self.get_attr_str(),
                             content)
                    # Do some stuff based upon the current path and content
                except UnicodeEncodeError:
                    pass  # Can't display code on console; just squelch the output.
                except Exception, e:
                    print u"EXCEPTION occurred:", unicode(e.__class__.__name__), unicode(e)
class Kanjidic2Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        self.filename = filename
        self.encoding = encoding
        self.cache = None

    def load_via_sax(self):
        if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")

        sh = KD2SAXHandler()
        isource = xml.sax.xmlreader.InputSource()
        isource.setEncoding("utf-8")
        isource.setByteStream(f)
        xml.sax.parse(isource, sh)
        f.close()
        self.cache = sh.data
    def search(self, search_str, use_cache=True):
        # Caching has 2 meanings in J-Ben:
        # 1. Storing the results of a previous read locally.
        # 2. Reading in pre-parsed data from a file on disk.
        #
        # KANJIDIC2 is a huge file; keeping it all in memory is costly, but
        # re-reading and re-parsing the whole file from disk on every search
        # would be even worse.
        if (not use_cache) or (not self.cache):
            # Pick a loader.
            # Opt 1: sax... very powerful, but too much code with my impl?
            # Opt 2: elementtree... more memory required, loads
            #        everything at once...
            # Opt 3: sax... redo to store all vars as lists, or similar.
            self.load_via_sax()  # First attempt of a SAX-style loader.

        for char in search_str:
            kanji = self.cache.get(char)
            if kanji: yield kanji
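    # Minimal usage sketch (mirrors the __main__ block below; the file name is
    # hypothetical):
    #
    #   kp = Kanjidic2Parser("kanjidic2.xml.gz")
    #   for entry in kp.search(u"日本"):
    #       print entry.to_string()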
if __name__ == "__main__":
    import sys, os

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        exit(-1)
    try:
        kp = Kanjidic2Parser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
        exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    for i, kanji in enumerate(kp.search(sys.argv[2].decode(charset))):
        lines = kanji.to_string().split(u'\n')

        def encode_or_else(s):
            # Keep only lines which can be encoded for a cp932 console;
            # return None (filtered out below) for those which cannot.
            try:
                val = s.encode("cp932")
                val = s
            except:
                val = None
            return val

        xlines = map(encode_or_else, lines)
        xlines = [l for l in xlines if l]
        xlines = u"\n".join(xlines)
        print _(u"Entry %d:\n%s\n") % (i+1, xlines)