Added stroke counts/miscounts; fixed bugs when no JLPT level or Jouyou grade is found.
[jben2_gui.git] / jbparse / jbparse / kanjidic2.py
blob 1d33ec273304bb0780c207b05a7fd3ab7ba70754
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""A parser for KANJIDIC2."""

from __future__ import absolute_import

import os, gzip, gettext, warnings
from xml.etree.cElementTree import ElementTree
gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc

def jis_kuten_to_hex(kuten):
    """Kuten string to hex conversion"""
    pieces = map(int, kuten.split(u'-'))
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
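
# A hedged worked example (not part of the original source): for kuten
# "16-01", the computation is ((16 + 0x20) << 8) + (1 + 0x20) == 0x3021,
# i.e. the first cell of row 16 in JIS X 0208.
#
#     >>> hex(jis_kuten_to_hex(u"16-01"))
#     '0x3021'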

def xml2text(o):
    return o.text

def mapdict(fn, d):
    result = {}
    for k, v in d.iteritems():
        result[k] = map(fn, v)
    return result
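
# Illustrative sketch (hypothetical data): mapdict maps fn over each
# value list while keeping the keys, so
#
#     mapdict(lambda x: x + 1, {"a": [1, 2], "b": [3]})
#
# returns {"a": [2, 3], "b": [4]}.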

class Kanjidic2Node(object):

    def __init__(self, xml_node):
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, _(u"Literal has more than one character!")
        return literal
    def get_grade(self):
        # ElementTree elements without children are falsy, so compare
        # against None explicitly; "if o" would discard a found element.
        o = self.xml.find("misc/grade")
        return int(o.text) if o is not None else None
    def get_freq(self):
        # The spec appears to allow multiple freq elements, so fetch
        # all entries and assert that exactly one is present.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, _(
            u"Character %s: Expected 1 freq entry, found %d") % \
            (self._get_literal(), len(o))
        return int(o[0].text)
    def get_jlpt(self):
        # Same None comparison as get_grade: found elements may be falsy.
        o = self.xml.find("misc/jlpt")
        return int(o.text) if o is not None else None
    def get_strokes(self):
        """Gets stroke count.

        Returns a tuple of (stroke_count, miscounts), where miscounts
        is either None or a list of common miscounts for the
        character.

        """
        nodes = self.xml.findall("misc/stroke_count")
        scnode, misnodes = nodes[0], nodes[1:]
        sc = int(scnode.text)
        if misnodes:
            miss = map(int, [o.text for o in misnodes])
        else:
            miss = None
        return (sc, miss)
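
    # Illustrative sketch of get_strokes output (hypothetical data): a
    # character whose misc element lists stroke_count values 9, 8, 10
    # yields (9, [8, 10]); one with a single stroke_count of 9 yields
    # (9, None). The first stroke_count is the accepted count; the rest
    # are common miscounts.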
    def _get_nanori_nodes(self):
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None
    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        for o in nodes:
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d
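
    # Illustrative sketch of _get_attrdict's return shape (hypothetical
    # data): for path "reading_meaning/rmgroup/reading" and attr_name
    # "r_type", it groups the found elements by attribute value, e.g.
    #
    #     {"ja_on": [<Element>, ...], "ja_kun": [<Element>, ...]}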
    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d
    def _get_dictcodes(self):
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        return self._get_attrdict("query_code/q_code", "qc_type")
    def get_nanori(self):
        nanori = map(xml2text, self._get_nanori_nodes() or [])
        if nanori:
            return _(u"%s: %s") % (_(u"Nanori"), u"、".join(nanori))
    def get_readings(self, rtypes):
        """Gets readings as text strings.

        Takes in any number of reading keys, and returns a list
        containing user-friendly output strings.

        Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
        and nanori.

        Note: Nanori is also handled independently, as it is stored
        differently than the other readings.

        """
        d = {
            "ja_on": _(u"On-yomi"),
            "ja_kun": _(u"Kun-yomi"),
            "korean_h": _(u"Korean (Hangul)"),
            "korean_r": _(u"Korean (Romanized)"),
            "pinyin": _(u"Pinyin"),
        }
        romanized = ("korean_r", "pinyin")
        readings = mapdict(xml2text, self._get_reading_nodes())
        pieces = []
        for rt in rtypes:
            if rt == "nanori":
                s = self.get_nanori()
                if s:
                    pieces.append(s)
            elif rt in d:
                if rt not in readings:
                    continue
                separator = u", " if rt in romanized else u"、"
                reading_str = separator.join(readings[rt])
                pieces.append(_(u"%s: %s") % (d[rt], reading_str))
        return pieces
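
    # Hedged usage sketch (node is a Kanjidic2Node; actual output
    # depends on the loaded dictionary data):
    #
    #     for s in node.get_readings(("ja_on", "ja_kun", "nanori")):
    #         print s    # e.g. u"On-yomi: ...", u"Kun-yomi: ..."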
    def get_meanings(self):
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        pieces = []
        for lang in sorted(meanings):
            pieces.append(_(u"Meanings (%s): %s") %
                          (lang, u"; ".join(meanings[lang])))
        return pieces
    def get_dict_codes(self, keys, all=False):
        """Gets dictionary codes as strings for display to the user.

        Accepts a list of dictionary keys. To get all keys, set the
        all keyword to true. (The keys parameter will be ignored in
        this case.)

        """
        pieces = []
        dicts = self._get_dictcodes()
        for dcode in sorted(dicts):
            if (not all) and dcode not in keys:
                continue
            nodes = dicts[dcode]
            assert len(nodes) == 1, _(
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s") % \
                (self._get_literal(), len(nodes), dcode)
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                s = _(u"Index %s, volume %s, page %s") % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(_(u"%s: %s") % (dname, s))
        return pieces
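
    # Hedged output sketch (hypothetical values): a "moro" entry renders
    # as u"<name>: Index 12345, volume 1, page 100", where <name> comes
    # from kanjidic2_key_to_str; every other code renders as
    # u"<name>: <index text>".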
    def get_query_codes(self, keys, all=False):
        pieces = []
        qcodes = self._get_querycodes()
        for qcode in sorted(qcodes):
            if (not all) and qcode not in keys:
                continue
            nodes = qcodes[qcode]
            qname = qcode_to_desc(qcode)
            if qcode == "skip":
                d = {}
                for o in nodes:
                    d.setdefault(o.attrib.get("skip_misclass"), []).append(o)
                for misclass in sorted(d):
                    if misclass:
                        outname = _(u"%s miscode (%s)") % (qname, misclass)
                    else:
                        outname = qname
                    s = u", ".join(o.text for o in d[misclass])
                    pieces.append(_(u"%s: %s") % (outname, s))
            else:
                s = u", ".join(o.text for o in nodes)
                pieces.append(_(u"%s: %s") % (qname, s))
        return pieces
    def __unicode__(self):
        pieces = []

        pieces.append(u"=" * 70)
        pieces.append(_(u"Literal: %s") % self.literal)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Readings:"))
        r_strs = [u" %s" % s for s in
                  self.get_readings(
                      ("ja_on", "ja_kun", "nanori",
                       "korean_h", "korean_r", "pinyin"))]
        pieces.extend(r_strs)
        pieces.append(u"-" * 70)

        m_strs = [u" %s" % s for s in self.get_meanings()]
        pieces.extend(m_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Miscellaneous:"))
        jlpt = self.get_jlpt()
        if jlpt is not None:
            pieces.append(_(u" JLPT grade level: %d") % jlpt)
        grade = self.get_grade()
        if grade is not None:
            pieces.append(_(u" Jouyou grade level: %d") % grade)
        freq = self.get_freq()
        if freq is not None:
            pieces.append(_(u" Newspaper frequency: %d") % freq)
        strokes, misstrokes = self.get_strokes()
        pieces.append(_(u" Stroke count: %d") % strokes)
        if misstrokes:
            pieces.append(_(u" Common stroke miscounts: %s") %
                          ", ".join(map(str, misstrokes)))
        pieces.append(u"-" * 70)

        pieces.append(_(u"Dictionary codes:"))
        d_strs = [u" %s" % s for s in self.get_dict_codes([], all=True)]
        pieces.extend(d_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Query codes:"))
        qc_strs = [u" %s" % s for s in self.get_query_codes([], all=True)]
        pieces.extend(qc_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Other information:"))
        #cp_strs = self.get_codepoints()
        #rad_strs = self.get_rad_info()
        #variant_strs = self.get_variants()
        pieces.append(_(u" Unicode value: %04X") % ord(self.literal))

        pieces.append(u"=" * 70)

        return u"\n".join(pieces)

class Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        KANJIDIC2 is a large, heavy-to-parse file.  Although it takes
        a large amount of memory, it is better to retain it in memory
        to increase the speed of subsequent searches.

        """
        if not os.path.exists(filename):
            raise Exception(_(u"Dictionary file does not exist."))
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()
        self._check_version()
    def _check_version(self):
        version = int(self.header.find('file_version').text)
        assert version >= 4, _(
            u"This parser won't work with versions of KANJIDIC2 "
            u"older than version 4.")
        # Only warn for versions newer than the one this parser targets.
        if version > 4:
            s = _(u"Parser version is for version 4, detected version is %d"
                  ) % version
            warnings.warn(s)
    def load_via_etree(self):
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        et = ElementTree(file=f)
        f.close()
        nodes = et.getroot().getchildren()
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters
    def get_header(self):
        d = {}
        for o in self.header.getchildren():
            # o.text or o.tail may be None; guard before joining.
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))
    def search(self, query):
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c
    def create_indices(self):
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            self.by_kanji[char.literal] = char
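
# Minimal usage sketch, assuming a local copy of KANJIDIC2 (the file
# name here is hypothetical):
#
#     p = Parser("kanjidic2.xml.gz")
#     for node in p.search(u"日本"):
#         print encode_or_else(unicode(node))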

def encode_or_else(s):
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    lines = s.split(u"\n")
    out = []
    for line in lines:
        try:
            line.encode(charset)
        except UnicodeError:
            # Drop lines which cannot be displayed in the console charset.
            continue
        out.append(line)
    return u"\n".join(out)
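
# Hedged behavior sketch: on a cp932 console (Windows), a line holding a
# character outside cp932 is dropped entirely, e.g.
#
#     encode_or_else(u"abc\n\u0100")
#
# keeps only u"abc" under cp932, but keeps both lines under UTF-8.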

if __name__ == "__main__":
    import sys

    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        sys.exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        sys.exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)