Update: added support for most dictionary and query codes.
[jben2_gui.git] / jbparse / jbparse / kanjidic2.py
blob66c88e17d1fe2e40badd9b9f3a1ea62b78ce3bd9
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
33 from __future__ import absolute_import
35 import os, gzip, gettext
36 from xml.etree.cElementTree import ElementTree
37 gettext.install('pyjben', unicode=True)
39 from .kanjidic_common \
40 import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a JIS kuten string to its JIS hex code point.

    kuten: a string of the form u"<ku>-<ten>" where both parts are
        decimal integers (e.g. u"36-41").

    Returns an int: each part offset by 0x20, with the "ku" part in
    the high byte and the "ten" part in the low byte.
    """
    # NOTE: a leftover DEBUG print was removed from this function.
    ku, ten = [int(piece) for piece in kuten.split(u'-')]
    return ((ku + 0x20) << 8) + (ten + 0x20)
class Kanjidic2Node(object):

    """Wrapper for a single KANJIDIC2 <character> XML node.

    Provides typed accessors for the common sub-elements and a
    human-readable report via __unicode__.
    """

    def __init__(self, xml_node):
        # xml_node: an ElementTree element for one <character>.
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        """Return the kanji itself (text of the <literal> element)."""
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, u"Literal has more than one character!"
        return literal

    def _get_grade(self):
        """Return the Jouyou grade as an int, or None if unspecified."""
        # Guard: not every character carries a <grade> element.
        o = self.xml.find("misc/grade")
        return int(o.text) if o is not None else None

    def _get_freq(self):
        """Return the newspaper frequency rank, or None if absent."""
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, (
            u"Character %s: Expected 1 freq entry, found %d" %
            (self._get_literal(), len(o)))
        return int(o[0].text)

    def _get_jlpt(self):
        """Return the JLPT level as an int, or None if unspecified."""
        # Guard: not every character carries a <jlpt> element.
        o = self.xml.find("misc/jlpt")
        return int(o.text) if o is not None else None

    def _get_nanori_nodes(self):
        """Return the list of <nanori> nodes, or None if there are none."""
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        for o in self.xml.findall(path):
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d

    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        # Glosses without an m_lang attribute are English by spec default.
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def _get_dictcodes(self):
        """Returns dictionary of dic_ref node lists, keyed by dr_type."""
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        """Returns dictionary of q_code node lists, keyed by qc_type."""
        return self._get_attrdict("query_code/q_code", "qc_type")

    def __unicode__(self):
        """Return a formatted multi-line report for this character."""

        def xml2text(o):
            return o.text

        def mapdict(fn, d):
            result = {}
            for k, v in d.items():
                result[k] = [fn(item) for item in v]
            return result

        readings = mapdict(xml2text, self._get_reading_nodes())
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        # Guard: _get_nanori_nodes returns None when there are no nanori.
        nanori = [o.text for o in (self._get_nanori_nodes() or [])]
        grade = self._get_grade()
        jlpt = self._get_jlpt()
        freq = self._get_freq()
        dicts = self._get_dictcodes()
        qcodes = self._get_querycodes()

        pieces = []
        pieces.append(u"=" * 70)
        pieces.append(u"Literal: %s" % self.literal)
        pieces.append(u"-" * 70)
        pieces.append(u"Readings:")
        # .get with a default: characters need not have every reading type.
        pieces.append(u" On-yomi: %s" % u"、".join(readings.get('ja_on', [])))
        pieces.append(u" Kun-yomi: %s" % u"、".join(readings.get('ja_kun', [])))
        pieces.append(u" Nanori: %s" % u"、".join(nanori))
        pieces.append(u" Korean (Hangul): %s" %
                      u", ".join(readings.get('korean_h', [])))
        pieces.append(u" Korean (Romanized): %s" %
                      u", ".join(readings.get('korean_r', [])))
        pieces.append(u" Pinyin: %s" % u", ".join(readings.get('pinyin', [])))
        pieces.append(u"-" * 70)
        for lang in sorted(meanings):
            pieces.append(u"Meanings (%s): %s" %
                          (lang, u"; ".join(meanings[lang])))
        pieces.append(u"-" * 70)
        pieces.append(u"Miscellaneous:")
        if jlpt:
            pieces.append(u" JLPT grade level: %d" % jlpt)
        if grade:
            pieces.append(u" Jouyou grade level: %d" % grade)
        if freq:
            pieces.append(u" Newspaper frequency: %d" % freq)
        pieces.append(u"-" * 70)
        pieces.append(u"Dictionary codes:")
        for dcode in sorted(dicts):
            nodes = dicts[dcode]
            assert len(nodes) == 1, (
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s" %
                (self._get_literal(), len(nodes), dcode))
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                # Morohashi entries carry volume/page attributes.
                s = u"Index %s, volume %s, page %s" % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(u" %s: %s" % (dname, s))
        pieces.append(u"-" * 70)
        pieces.append(u"Query codes:")
        for qcode in sorted(qcodes):
            nodes = qcodes[qcode]
            if qcode == "skip":
                # SKIP has miscodes; do later
                continue
            s = u", ".join(o.text for o in nodes)
            qname = qcode_to_desc(qcode)
            pieces.append(u" %s: %s" % (qname, s))
        pieces.append(u"-" * 70)
        pieces.append(u"Unicode value: %04X" % ord(self.literal))
        pieces.append(u"=" * 70)

        return u"\n".join(pieces)
class Parser(object):

    """Parser for KANJIDIC2 dictionary files (plain or gzipped XML)."""

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        filename: path to a KANJIDIC2 file; .gz files are transparently
            decompressed.
        encoding: encoding of the dictionary file (default "utf-8").

        Kanjidic2 is a large, heavy to parse file.  Although it takes a
        large amount of memory, it is better to retain it in memory to
        increase the speed of subsequent searches.

        Raises Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()

    def load_via_etree(self):
        """Parse the file; return (header_node, list of Kanjidic2Node)."""
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        # finally: make sure the handle is closed even if parsing fails.
        try:
            et = ElementTree(file=f)
        finally:
            f.close()
        # list(element) replaces the deprecated getchildren().
        nodes = list(et.getroot())
        # First child is the <header>; the rest are <character> nodes.
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        """Return the header's fields as a sorted "tag: value" string."""
        d = {}
        for o in list(self.header):
            # text/tail may be None on empty elements; treat as "".
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        """Yield the Kanjidic2Node for each character of query found."""
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        """Build the literal -> node index; no-op if already built."""
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            literal = char.xml.find("literal").text.strip()
            self.by_kanji[literal] = char
def encode_or_else(s):
    """Filter out lines of s that the console charset cannot encode.

    s: a unicode string, possibly multi-line.

    Returns the surviving lines rejoined with newlines.  The target
    charset is cp932 on Windows ("nt"), utf-8 elsewhere.
    """
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            # Trial encode only; we keep the original unicode line.
            line.encode(charset)
        except UnicodeEncodeError:
            # Narrowed from a bare except: only encoding failures
            # should cause a line to be dropped.
            continue
        out.append(line)
    return u"\n".join(out)
if __name__ == "__main__":
    # Command-line demo: look up each character argument in the given
    # KANJIDIC2 file and print a formatted report for each one found.
    import sys

    # Usage: <script> <dict_file> <character [...]>
    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        # _() is installed at module level by gettext.install().
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    # Console charset used to decode argv: cp932 on Windows, else UTF-8.
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        # Drop any report lines the console cannot encode.
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)