# Fleshed out cElementTree-based Kanjidic2 parser.
# [jben2_gui.git] / jbparse / jbparse / kanjidic2.py
# blob 3690b6003e9ba5b46229f9da3ac9f1d1c8d92ec1
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
from __future__ import absolute_import

import gettext
import gzip
import os

from xml.etree.cElementTree import ElementTree

gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a kuten string ("KU-TEN") to a JIS hex code point.

    Each part is offset by 0x20: KU becomes the high byte, TEN the
    low byte.  Any pieces beyond the first two are ignored.
    """
    pieces = [int(piece) for piece in kuten.split(u'-')]
    # NOTE: a leftover "DEBUG:" print statement was removed here.
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
class Kanjidic2Node(object):

    """Wrapper around a single <character> element of a KANJIDIC2 file.

    Provides convenience accessors for readings, meanings, grade,
    frequency, JLPT level and nanori, plus a human-readable summary
    via __unicode__.
    """

    def __init__(self, xml_node):
        """Stores the ElementTree <character> node and caches the literal."""
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        """Returns the kanji itself as a single-character string."""
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, "Literal has more than one character!"
        return literal

    def _get_grade(self):
        """Returns the Jouyou grade as an int, or None if absent."""
        o = self.xml.find("misc/grade")
        # Many characters have no grade; unguarded o.text raised
        # AttributeError here.
        return int(o.text) if o is not None else None

    def _get_freq(self):
        """Returns the newspaper frequency rank as an int, or None."""
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, (
            u"Character %s: Expected 1 freq entry, found %d" %
            (self._get_literal(), len(o)))
        return int(o[0].text)

    def _get_jlpt(self):
        """Returns the JLPT level as an int, or None if absent."""
        o = self.xml.find("misc/jlpt")
        # Same guard as _get_grade: most characters have no JLPT level.
        return int(o.text) if o is not None else None

    def _get_nanori(self):
        """Returns a list of nanori (name readings), or None if absent."""
        nodes = self.xml.findall("reading_meaning/nanori")
        if not nodes:
            return None
        nanori = [o.text for o in nodes]
        return nanori

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        attrs = set(o.attrib.get(attr_name) for o in nodes)
        for attr in attrs:
            d[attr] = [o.text for o in nodes
                       if o.attrib.get(attr_name) == attr]
        return d

    def _get_readings(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meanings(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        # Glosses with no m_lang attribute are English; re-key them.
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def __unicode__(self):
        """Renders a multi-line, human-readable summary of the entry."""
        readings = self._get_readings()
        meanings = self._get_meanings()
        nanori = self._get_nanori()
        grade = self._get_grade()
        jlpt = self._get_jlpt()
        freq = self._get_freq()

        pieces = []
        pieces.append(u"Literal: %s" % self.literal)

        # Every reading category is optional; unguarded dict access
        # raised KeyError for characters lacking one of them, and
        # joining a None nanori raised TypeError.
        if readings.get('ja_on'):
            pieces.append(u"On-yomi: %s" % u"、".join(readings['ja_on']))
        if readings.get('ja_kun'):
            pieces.append(u"Kun-yomi: %s" % u"、".join(readings['ja_kun']))
        if nanori:
            pieces.append(u"Nanori: %s" % u"、".join(nanori))

        if readings.get('korean_h'):
            pieces.append(u"Korean (Hangul): %s" %
                          u", ".join(readings['korean_h']))
        if readings.get('korean_r'):
            pieces.append(u"Korean (Romanized): %s" %
                          u", ".join(readings['korean_r']))
        if readings.get('pinyin'):
            pieces.append(u"Pinyin: %s" % u", ".join(readings['pinyin']))

        for lang in sorted(meanings):
            pieces.append(u"Meanings (%s): %s" %
                          (lang, "; ".join(meanings[lang])))

        if jlpt:
            pieces.append(u"JLPT grade level: %d" % jlpt)
        if grade:
            pieces.append(u"Jouyou grade level: %d" % grade)
        if freq:
            pieces.append(u"Newspaper frequency: %d" % freq)

        pieces.append(u"Unicode value: %04X" % ord(self.literal))

        return u"\n".join(pieces)
class Parser(object):

    """Loads a KANJIDIC2 file and supports per-character lookups."""

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        Kanjidic2 is a large, heavy to parse file.  Although it takes
        a large amount of memory, it is better to retain it in memory
        to increase the speed of subsequent searches.

        filename: path to kanjidic2.xml, optionally gzip-compressed.
        encoding: kept for interface compatibility; the XML parser
            detects the file's real encoding itself.

        Raises Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()

    def load_via_etree(self):
        """Parses the file; returns (header node, Kanjidic2Node list)."""
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        try:
            et = ElementTree(file=f)
        finally:
            # Close even if parsing fails.
            f.close()
        # list(elem) replaces getchildren(), which was deprecated and
        # then removed in Python 3.9.
        nodes = list(et.getroot())
        # First child is the <header> element; the rest are <character>s.
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        """Returns the header fields as "tag: value" lines, sorted."""
        d = {}
        for o in list(self.header):
            # text/tail may be None for empty elements; join safely.
            cdata = "".join((o.text or "", o.tail or "")).strip()
            d[o.tag] = cdata
        return "\n".join("%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        """Yields the Kanjidic2Node for each character of query found."""
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        """Builds the kanji -> node index (once; later calls are no-ops)."""
        if self.indexed:
            return
        self.indexed = True
        # NOTE: progress print statements were removed here; a library
        # method should not write to stdout.
        self.by_kanji = {}
        for char in self.characters:
            # Kanjidic2Node caches the literal at construction time.
            self.by_kanji[char.literal] = char
def encode_or_else(s):
    """Drops lines of s that cannot be encoded in the console charset.

    Uses cp932 on Windows and utf-8 elsewhere.  Returns the surviving
    lines rejoined with newlines.
    """
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            line.encode(charset)
        except UnicodeError:
            # The previous bare "except: pass" swallowed *all* errors;
            # only encoding failures should drop a line.
            continue
        out.append(line)
    return u"\n".join(out)
219 if __name__ == "__main__":
220 import sys, os
222 try:
223 dfname, args = sys.argv[1], sys.argv[2:]
224 assert args
225 except (IndexError, AssertionError):
226 print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
227 exit(-1)
229 try:
230 p = Parser(dfname)
231 except Exception, e:
232 print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
233 exit(-1)
235 if os.name == "nt":
236 charset = "cp932"
237 else:
238 charset = "utf-8"
240 print "HEADER"
241 print "======"
242 print p.get_header()
243 print
244 print "%d characters found" % len(p.characters)
246 for i, kanji in enumerate(p.search("".join(args).decode(charset))):
247 kstr = encode_or_else(unicode(kanji))
248 print _(u"Entry %d:\n%s\n") % (i+1, kstr)