# Fleshed out cElementTree-based Kanjidic2 parser.
# [jben2_gui.git] / jbparse / jbparse / kanjidic2.py
# blob 3690b6003e9ba5b46229f9da3ac9f1d1c8d92ec1
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
from __future__ import absolute_import

import gettext
import gzip
import os

from xml.etree.cElementTree import ElementTree

gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a kuten string ("KU-TEN") to a JIS hex code point.

    Each part is offset by 0x20: KU becomes the high byte, TEN the
    low byte.  Any pieces beyond the first two are ignored.
    """
    pieces = [int(piece) for piece in kuten.split(u'-')]
    # NOTE: a leftover "DEBUG:" print statement was removed here.
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
class Kanjidic2Node(object):

    """Wrapper around a single <character> element of a KANJIDIC2 file.

    Provides convenience accessors for readings, meanings, grade,
    frequency, JLPT level and nanori, plus a human-readable summary
    via __unicode__.
    """

    def __init__(self, xml_node):
        """Stores the ElementTree <character> node and caches the literal."""
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        """Returns the kanji itself as a single-character string."""
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, "Literal has more than one character!"
        return literal

    def _get_grade(self):
        """Returns the Jouyou grade as an int, or None if absent."""
        o = self.xml.find("misc/grade")
        # Many characters have no grade; unguarded o.text raised
        # AttributeError here.
        return int(o.text) if o is not None else None

    def _get_freq(self):
        """Returns the newspaper frequency rank as an int, or None."""
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, (
            u"Character %s: Expected 1 freq entry, found %d" %
            (self._get_literal(), len(o)))
        return int(o[0].text)

    def _get_jlpt(self):
        """Returns the JLPT level as an int, or None if absent."""
        o = self.xml.find("misc/jlpt")
        # Same guard as _get_grade: most characters have no JLPT level.
        return int(o.text) if o is not None else None

    def _get_nanori(self):
        """Returns a list of nanori (name readings), or None if absent."""
        nodes = self.xml.findall("reading_meaning/nanori")
        if not nodes:
            return None
        nanori = [o.text for o in nodes]
        return nanori

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        attrs = set(o.attrib.get(attr_name) for o in nodes)
        for attr in attrs:
            d[attr] = [o.text for o in nodes
                       if o.attrib.get(attr_name) == attr]
        return d

    def _get_readings(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meanings(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        # Glosses with no m_lang attribute are English; re-key them.
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def __unicode__(self):
        """Renders a multi-line, human-readable summary of the entry."""
        readings = self._get_readings()
        meanings = self._get_meanings()
        nanori = self._get_nanori()
        grade = self._get_grade()
        jlpt = self._get_jlpt()
        freq = self._get_freq()

        pieces = []
        pieces.append(u"Literal: %s" % self.literal)

        # Every reading category is optional; unguarded dict access
        # raised KeyError for characters lacking one of them, and
        # joining a None nanori raised TypeError.
        if readings.get('ja_on'):
            pieces.append(u"On-yomi: %s" % u"、".join(readings['ja_on']))
        if readings.get('ja_kun'):
            pieces.append(u"Kun-yomi: %s" % u"、".join(readings['ja_kun']))
        if nanori:
            pieces.append(u"Nanori: %s" % u"、".join(nanori))

        if readings.get('korean_h'):
            pieces.append(u"Korean (Hangul): %s" %
                          u", ".join(readings['korean_h']))
        if readings.get('korean_r'):
            pieces.append(u"Korean (Romanized): %s" %
                          u", ".join(readings['korean_r']))
        if readings.get('pinyin'):
            pieces.append(u"Pinyin: %s" % u", ".join(readings['pinyin']))

        for lang in sorted(meanings):
            pieces.append(u"Meanings (%s): %s" %
                          (lang, "; ".join(meanings[lang])))

        if jlpt:
            pieces.append(u"JLPT grade level: %d" % jlpt)
        if grade:
            pieces.append(u"Jouyou grade level: %d" % grade)
        if freq:
            pieces.append(u"Newspaper frequency: %d" % freq)

        pieces.append(u"Unicode value: %04X" % ord(self.literal))

        return u"\n".join(pieces)
class Parser(object):

    """Loads a KANJIDIC2 file and supports per-character lookups."""

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        Kanjidic2 is a large, heavy to parse file.  Although it takes
        a large amount of memory, it is better to retain it in memory
        to increase the speed of subsequent searches.

        filename: path to kanjidic2.xml, optionally gzip-compressed.
        encoding: kept for interface compatibility; the XML parser
            detects the file's real encoding itself.

        Raises Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()

    def load_via_etree(self):
        """Parses the file; returns (header node, Kanjidic2Node list)."""
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        try:
            et = ElementTree(file=f)
        finally:
            # Close even if parsing fails.
            f.close()
        # list(elem) replaces getchildren(), which was deprecated and
        # then removed in Python 3.9.
        nodes = list(et.getroot())
        # First child is the <header> element; the rest are <character>s.
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        """Returns the header fields as "tag: value" lines, sorted."""
        d = {}
        for o in list(self.header):
            # text/tail may be None for empty elements; join safely.
            cdata = "".join((o.text or "", o.tail or "")).strip()
            d[o.tag] = cdata
        return "\n".join("%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        """Yields the Kanjidic2Node for each character of query found."""
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        """Builds the kanji -> node index (once; later calls are no-ops)."""
        if self.indexed:
            return
        self.indexed = True
        # NOTE: progress print statements were removed here; a library
        # method should not write to stdout.
        self.by_kanji = {}
        for char in self.characters:
            # Kanjidic2Node caches the literal at construction time.
            self.by_kanji[char.literal] = char
def encode_or_else(s):
    """Drops lines of s that cannot be encoded in the console charset.

    Uses cp932 on Windows and utf-8 elsewhere.  Returns the surviving
    lines rejoined with newlines.
    """
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            line.encode(charset)
        except UnicodeError:
            # The previous bare "except: pass" swallowed *all* errors;
            # only encoding failures should drop a line.
            continue
        out.append(line)
    return u"\n".join(out)
219 if __name__ == "__main__":
220 import sys, os
222 try:
223 dfname, args = sys.argv[1], sys.argv[2:]
224 assert args
225 except (IndexError, AssertionError):
226 print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
227 exit(-1)
229 try:
230 p = Parser(dfname)
231 except Exception, e:
232 print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
233 exit(-1)
235 if os.name == "nt":
236 charset = "cp932"
237 else:
238 charset = "utf-8"
240 print "HEADER"
241 print "======"
242 print p.get_header()
243 print
244 print "%d characters found" % len(p.characters)
246 for i, kanji in enumerate(p.search("".join(args).decode(charset))):
247 kstr = encode_or_else(unicode(kanji))
248 print _(u"Entry %d:\n%s\n") % (i+1, kstr)