#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""A parser for KANJIDIC2."""

from __future__ import absolute_import

import os, gzip, gettext, warnings
from xml.etree.cElementTree import ElementTree
gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc

def jis_kuten_to_hex(kuten):
    """Kuten string to hex conversion"""
    pieces = map(int, kuten.split(u'-'))
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
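
# Worked example for the formula above (value computed by hand, not from
# the original source): kuten u"16-01" -> ((16 + 0x20) << 8) + (1 + 0x20)
# == 0x3021, the JIS X 0208 code for 亜.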

def xml2text(o):
    return o.text

def mapdict(fn, d):
    result = {}
    for k, v in d.iteritems():
        result[k] = map(fn, v)
    return result
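
# Illustrative use (hypothetical nodes): mapdict(xml2text,
# {u"ja_on": [node_a, node_b]}) returns {u"ja_on": [node_a.text,
# node_b.text]}; fn is applied element-wise to each value list.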


class Kanjidic2Node(object):

    def __init__(self, xml_node):
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, _(u"Literal has more than one character!")
        return literal

    def get_grade(self):
        o = self.xml.find("misc/grade")
        # Explicit None check: ElementTree elements with no children
        # evaluate as False, so a bare "if o" would misbehave here.
        return int(o.text) if o is not None else None

    def get_freq(self):
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, _(
            u"Character %s: Expected 1 freq entry, found %d") % \
            (self._get_literal(), len(o))
        return int(o[0].text)

    def get_jlpt(self):
        o = self.xml.find("misc/jlpt")
        # Same explicit None check rationale as get_grade.
        return int(o.text) if o is not None else None

    def get_strokes(self):
        """Gets stroke count.

        Returns a tuple of (stroke_count, miscounts), where miscounts
        is either None or a list of common miscounts for the
        character.

        """
        nodes = self.xml.findall("misc/stroke_count")
        scnode, misnodes = nodes[0], nodes[1:]
        sc = int(scnode.text)
        if misnodes:
            miss = [int(o.text) for o in misnodes]
        else:
            miss = None
        return (sc, miss)
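
    # Illustrative return values (hypothetical data): a character counted
    # as 13 strokes with a common 14-stroke miscount yields (13, [14]);
    # with no recorded miscounts, (13, None).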

    def _get_nanori_nodes(self):
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        for o in nodes:
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d
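
    # Illustrative result (hypothetical data):
    #   self._get_attrdict("dic_number/dic_ref", "dr_type")
    # might return {u"moro": [<Element>], u"nelson_c": [<Element>]},
    # i.e. one list of XML nodes per distinct attribute value.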

    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        # NEEDS AN UPDATE: Just noticed, rmgroup allows
        # readings/meanings to be meaningfully grouped together.  We
        # -can- dump everything together, but we -should- handle the
        # groups.
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        # NEEDS AN UPDATE: See _get_reading_nodes.
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def _get_dictcodes(self):
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        return self._get_attrdict("query_code/q_code", "qc_type")

    def get_nanori(self):
        nanori = map(xml2text, self._get_nanori_nodes() or [])
        if nanori:
            return _(u"%s: %s") % (_(u"Nanori"), u"、".join(nanori))

    def get_readings(self, rtypes):
        """Gets readings as text strings.

        Takes in any number of reading keys, and returns a list
        containing user-friendly output strings.

        Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
        and nanori.

        Note: Nanori is also handled independently, as it is stored
        differently than the other readings.

        """
        d = {
            "ja_on": _(u"On-yomi"),
            "ja_kun": _(u"Kun-yomi"),
            "korean_h": _(u"Korean (Hangul)"),
            "korean_r": _(u"Korean (Romanized)"),
            "pinyin": _(u"Pinyin"),
        }
        romanized = ("korean_r", "pinyin")
        readings = mapdict(xml2text, self._get_reading_nodes())
        pieces = []
        for rt in rtypes:
            if rt == "nanori":
                s = self.get_nanori()
                if s:
                    pieces.append(s)
            elif rt in d:
                if rt not in readings:
                    continue
                separator = u", " if rt in romanized else u"、"
                reading_str = separator.join(readings[rt])
                pieces.append(_(u"%s: %s") % (d[rt], reading_str))
        return pieces
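
    # Illustrative call (data shown is for 水): get_readings(("ja_on",
    # "ja_kun")) would return strings such as
    # [u"On-yomi: スイ", u"Kun-yomi: みず"].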

    def get_meanings(self):
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        pieces = []
        for lang in sorted(meanings):
            pieces.append(_(u"Meanings (%s): %s") %
                          (lang, u"; ".join(meanings[lang])))
        return pieces

    def get_dict_codes(self, keys, all=False):
        """Gets dictionary codes as strings for display to the user.

        Accepts a list of dictionary keys. To get all keys, set the
        all keyword to true. (The keys parameter will be ignored in
        this case.)

        """
        pieces = []
        dicts = self._get_dictcodes()
        for dcode in sorted(dicts):
            if (not all) and dcode not in keys:
                continue
            nodes = dicts[dcode]
            assert len(nodes) == 1, _(
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s") % \
                (self._get_literal(), len(nodes), dcode)
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                s = _(u"Index %s, volume %s, page %s") % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(_(u"%s: %s") % (dname, s))
        return pieces
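
    # Illustrative call: get_dict_codes(["moro"]) returns one string per
    # matching code, e.g. u"<name>: Index 5, volume 1, page 525", where
    # <name> comes from kanjidic2_key_to_str and the index/volume/page
    # values here are hypothetical.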

    def get_query_codes(self, keys, all=False):
        pieces = []
        qcodes = self._get_querycodes()
        for qcode in sorted(qcodes):
            if (not all) and qcode not in keys:
                continue
            nodes = qcodes[qcode]
            qname = qcode_to_desc(qcode)
            if qcode == "skip":
                d = {}
                for o in nodes:
                    d.setdefault(o.attrib.get("skip_misclass"), []).append(o)
                for misclass in sorted(d):
                    if misclass:
                        outname = _(u"%s miscode (%s)") % (qname, misclass)
                    else:
                        outname = qname
                    s = u", ".join(o.text for o in d[misclass])
                    pieces.append(_(u"%s: %s") % (outname, s))
            else:
                s = u", ".join(o.text for o in nodes)
                pieces.append(_(u"%s: %s") % (qname, s))
        return pieces
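
    # Illustrative SKIP output (hypothetical values): a character with
    # pattern 2-4-3 plus a stroke_count misclassification of 2-3-3 yields
    # two lines, e.g. u"<desc>: 2-4-3" and u"<desc> miscode (stroke_count):
    # 2-3-3", where <desc> comes from qcode_to_desc("skip").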

    def __unicode__(self):

        def indent_strs(strs):
            return [u" %s" % s for s in strs]

        pieces = []

        pieces.append(u"=" * 70)
        pieces.append(_(u"Literal: %s") % self.literal)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Readings:"))
        r_strs = indent_strs(self.get_readings(
            ("ja_on", "ja_kun", "nanori", "korean_h", "korean_r", "pinyin")))
        pieces.extend(r_strs)
        pieces.append(u"-" * 70)

        m_strs = indent_strs(self.get_meanings())
        pieces.extend(m_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Miscellaneous:"))
        # Cache each getter's result; the getters re-walk the XML, so
        # calling them once apiece is both correct and cheaper.
        jlpt = self.get_jlpt()
        if jlpt is not None:
            pieces.append(_(u" JLPT grade level: %d") % jlpt)
        grade = self.get_grade()
        if grade is not None:
            pieces.append(_(u" Jouyou grade level: %d") % grade)
        freq = self.get_freq()
        if freq is not None:
            pieces.append(_(u" Newspaper frequency: %d") % freq)
        strokes, misstrokes = self.get_strokes()
        pieces.append(_(u" Stroke count: %d") % strokes)
        if misstrokes:
            pieces.append(_(u" Common stroke miscounts: %s") %
                          ", ".join(map(str, misstrokes)))
        pieces.append(u"-" * 70)

        pieces.append(_(u"Dictionary codes:"))
        d_strs = indent_strs(self.get_dict_codes([], all=True))
        pieces.extend(d_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Query codes:"))
        qc_strs = indent_strs(self.get_query_codes([], all=True))
        pieces.extend(qc_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Other information:"))
        # RADICAL node info
        #rad_strs = self.get_rad_info()
        # CODEPOINT node info
        #cp_strs = indent_strs(self.get_codepoints())
        pieces.append(_(u" Unicode value: %04X") % ord(self.literal))
        # MISC node children
        #variant_strs = self.get_variants()  # AKA cross refs
        #radname_strs = self.get_radical_name()  # "T2" KANJIDIC code

        pieces.append(u"=" * 70)

        return u"\n".join(pieces)


class Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        KANJIDIC2 is a large, heavy-to-parse file.  Although keeping
        the parsed data takes a large amount of memory, it is better
        to retain it in memory to increase the speed of subsequent
        searches.

        """
        if not os.path.exists(filename):
            raise Exception(_(u"Dictionary file does not exist."))
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()
        self._check_version()

    def _check_version(self):
        version = int(self.header.find('file_version').text)
        assert version >= 4, _(
            u"This parser won't work with versions of KANJIDIC2 "
            u"older than version 4.")
        if version > 4:
            s = _(u"Parser version is for version 4, detected version is %d"
                  ) % version
            warnings.warn(s)

    def load_via_etree(self):
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        et = ElementTree(file=f)
        f.close()
        nodes = et.getroot().getchildren()
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        d = {}
        for o in self.header.getchildren():
            # text/tail may be None on some nodes; treat missing as empty.
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            # Each node already extracted its literal in __init__.
            self.by_kanji[char.literal] = char


def encode_or_else(s):
    """Return s minus any lines the output charset cannot encode."""
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            line.encode(charset)  # test encodability only; result unused
            out.append(line)
        except UnicodeEncodeError:
            pass
    return u"\n".join(out)
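
# Illustrative behavior: on a Windows ("nt") console, encode_or_else drops
# any output line containing characters outside cp932 instead of letting a
# later print raise UnicodeEncodeError; elsewhere the utf-8 pass keeps all
# lines.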


if __name__ == "__main__":
    import sys

    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)