Update: added support for most dictionary and query codes.
[jben2_gui.git] / jbparse / jbparse / kanjidic2.py
blob66c88e17d1fe2e40badd9b9f3a1ea62b78ce3bd9
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
33 from __future__ import absolute_import
35 import os, gzip, gettext
36 from xml.etree.cElementTree import ElementTree
37 gettext.install('pyjben', unicode=True)
39 from .kanjidic_common \
40 import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a JIS kuten string to its JIS hex code point.

    kuten: a string of the form u"<ku>-<ten>" where both parts are
        decimal integers (e.g. u"36-41").

    Returns an int: each part offset by 0x20, with the "ku" part in
    the high byte and the "ten" part in the low byte.
    """
    # NOTE: a leftover DEBUG print was removed from this function.
    ku, ten = [int(piece) for piece in kuten.split(u'-')]
    return ((ku + 0x20) << 8) + (ten + 0x20)
class Kanjidic2Node(object):

    """Wrapper for a single KANJIDIC2 <character> XML node.

    Provides typed accessors for the common sub-elements and a
    human-readable report via __unicode__.
    """

    def __init__(self, xml_node):
        # xml_node: an ElementTree element for one <character>.
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        """Return the kanji itself (text of the <literal> element)."""
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, u"Literal has more than one character!"
        return literal

    def _get_grade(self):
        """Return the Jouyou grade as an int, or None if unspecified."""
        # Guard: not every character carries a <grade> element.
        o = self.xml.find("misc/grade")
        return int(o.text) if o is not None else None

    def _get_freq(self):
        """Return the newspaper frequency rank, or None if absent."""
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, (
            u"Character %s: Expected 1 freq entry, found %d" %
            (self._get_literal(), len(o)))
        return int(o[0].text)

    def _get_jlpt(self):
        """Return the JLPT level as an int, or None if unspecified."""
        # Guard: not every character carries a <jlpt> element.
        o = self.xml.find("misc/jlpt")
        return int(o.text) if o is not None else None

    def _get_nanori_nodes(self):
        """Return the list of <nanori> nodes, or None if there are none."""
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        for o in self.xml.findall(path):
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d

    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        # Glosses without an m_lang attribute are English by spec default.
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def _get_dictcodes(self):
        """Returns dictionary of dic_ref node lists, keyed by dr_type."""
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        """Returns dictionary of q_code node lists, keyed by qc_type."""
        return self._get_attrdict("query_code/q_code", "qc_type")

    def __unicode__(self):
        """Return a formatted multi-line report for this character."""

        def xml2text(o):
            return o.text

        def mapdict(fn, d):
            result = {}
            for k, v in d.items():
                result[k] = [fn(item) for item in v]
            return result

        readings = mapdict(xml2text, self._get_reading_nodes())
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        # Guard: _get_nanori_nodes returns None when there are no nanori.
        nanori = [o.text for o in (self._get_nanori_nodes() or [])]
        grade = self._get_grade()
        jlpt = self._get_jlpt()
        freq = self._get_freq()
        dicts = self._get_dictcodes()
        qcodes = self._get_querycodes()

        pieces = []
        pieces.append(u"=" * 70)
        pieces.append(u"Literal: %s" % self.literal)
        pieces.append(u"-" * 70)
        pieces.append(u"Readings:")
        # .get with a default: characters need not have every reading type.
        pieces.append(u" On-yomi: %s" % u"、".join(readings.get('ja_on', [])))
        pieces.append(u" Kun-yomi: %s" % u"、".join(readings.get('ja_kun', [])))
        pieces.append(u" Nanori: %s" % u"、".join(nanori))
        pieces.append(u" Korean (Hangul): %s" %
                      u", ".join(readings.get('korean_h', [])))
        pieces.append(u" Korean (Romanized): %s" %
                      u", ".join(readings.get('korean_r', [])))
        pieces.append(u" Pinyin: %s" % u", ".join(readings.get('pinyin', [])))
        pieces.append(u"-" * 70)
        for lang in sorted(meanings):
            pieces.append(u"Meanings (%s): %s" %
                          (lang, u"; ".join(meanings[lang])))
        pieces.append(u"-" * 70)
        pieces.append(u"Miscellaneous:")
        if jlpt:
            pieces.append(u" JLPT grade level: %d" % jlpt)
        if grade:
            pieces.append(u" Jouyou grade level: %d" % grade)
        if freq:
            pieces.append(u" Newspaper frequency: %d" % freq)
        pieces.append(u"-" * 70)
        pieces.append(u"Dictionary codes:")
        for dcode in sorted(dicts):
            nodes = dicts[dcode]
            assert len(nodes) == 1, (
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s" %
                (self._get_literal(), len(nodes), dcode))
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                # Morohashi entries carry volume/page attributes.
                s = u"Index %s, volume %s, page %s" % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(u" %s: %s" % (dname, s))
        pieces.append(u"-" * 70)
        pieces.append(u"Query codes:")
        for qcode in sorted(qcodes):
            nodes = qcodes[qcode]
            if qcode == "skip":
                # SKIP has miscodes; do later
                continue
            s = u", ".join(o.text for o in nodes)
            qname = qcode_to_desc(qcode)
            pieces.append(u" %s: %s" % (qname, s))
        pieces.append(u"-" * 70)
        pieces.append(u"Unicode value: %04X" % ord(self.literal))
        pieces.append(u"=" * 70)

        return u"\n".join(pieces)
class Parser(object):

    """Parser for KANJIDIC2 dictionary files (plain or gzipped XML)."""

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        filename: path to a KANJIDIC2 file; .gz files are transparently
            decompressed.
        encoding: encoding of the dictionary file (default "utf-8").

        Kanjidic2 is a large, heavy to parse file.  Although it takes a
        large amount of memory, it is better to retain it in memory to
        increase the speed of subsequent searches.

        Raises Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()

    def load_via_etree(self):
        """Parse the file; return (header_node, list of Kanjidic2Node)."""
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        # finally: make sure the handle is closed even if parsing fails.
        try:
            et = ElementTree(file=f)
        finally:
            f.close()
        # list(element) replaces the deprecated getchildren().
        nodes = list(et.getroot())
        # First child is the <header>; the rest are <character> nodes.
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        """Return the header's fields as a sorted "tag: value" string."""
        d = {}
        for o in list(self.header):
            # text/tail may be None on empty elements; treat as "".
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        """Yield the Kanjidic2Node for each character of query found."""
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        """Build the literal -> node index; no-op if already built."""
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            literal = char.xml.find("literal").text.strip()
            self.by_kanji[literal] = char
def encode_or_else(s):
    """Filter out lines of s that the console charset cannot encode.

    s: a unicode string, possibly multi-line.

    Returns the surviving lines rejoined with newlines.  The target
    charset is cp932 on Windows ("nt"), utf-8 elsewhere.
    """
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            # Trial encode only; we keep the original unicode line.
            line.encode(charset)
        except UnicodeEncodeError:
            # Narrowed from a bare except: only encoding failures
            # should cause a line to be dropped.
            continue
        out.append(line)
    return u"\n".join(out)
if __name__ == "__main__":
    # Command-line demo: look up each character argument in the given
    # KANJIDIC2 file and print a formatted report for each one found.
    import sys

    # Usage: <script> <dict_file> <character [...]>
    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        # _() is installed at module level by gettext.install().
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    # Console charset used to decode argv: cp932 on Windows, else UTF-8.
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        # Drop any report lines the console cannot encode.
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)