Added stroke counts/miscounts; fixed bugs when no JLPT level or Jouyou grade is found.
[jben2_gui.git] / jbparse / jbparse / kanjidic2.py
blob 1d33ec273304bb0780c207b05a7fd3ab7ba70754
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""A parser for KANJIDIC2."""

from __future__ import absolute_import

import os, gzip, gettext, warnings
from xml.etree.cElementTree import ElementTree
gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc

def jis_kuten_to_hex(kuten):
    """Kuten string to hex conversion"""
    pieces = map(int, kuten.split(u'-'))
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
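
# A hedged worked example (not part of the original source): for kuten
# "16-01", the computation is ((16 + 0x20) << 8) + (1 + 0x20) == 0x3021,
# i.e. the first cell of row 16 in JIS X 0208.
#
#     >>> hex(jis_kuten_to_hex(u"16-01"))
#     '0x3021'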

def xml2text(o):
    return o.text

def mapdict(fn, d):
    result = {}
    for k, v in d.iteritems():
        result[k] = map(fn, v)
    return result
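
# Illustrative sketch (hypothetical data): mapdict maps fn over each
# value list while keeping the keys, so
#
#     mapdict(lambda x: x + 1, {"a": [1, 2], "b": [3]})
#
# returns {"a": [2, 3], "b": [4]}.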

class Kanjidic2Node(object):

    def __init__(self, xml_node):
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, _(u"Literal has more than one character!")
        return literal
    def get_grade(self):
        # ElementTree elements without children are falsy, so compare
        # against None explicitly; "if o" would discard a found element.
        o = self.xml.find("misc/grade")
        return int(o.text) if o is not None else None
    def get_freq(self):
        # The spec appears to allow multiple freq elements, so fetch
        # all entries and assert that exactly one is present.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, _(
            u"Character %s: Expected 1 freq entry, found %d") % \
            (self._get_literal(), len(o))
        return int(o[0].text)
    def get_jlpt(self):
        # Same None comparison as get_grade: found elements may be falsy.
        o = self.xml.find("misc/jlpt")
        return int(o.text) if o is not None else None
    def get_strokes(self):
        """Gets stroke count.

        Returns a tuple of (stroke_count, miscounts), where miscounts
        is either None or a list of common miscounts for the
        character.

        """
        nodes = self.xml.findall("misc/stroke_count")
        scnode, misnodes = nodes[0], nodes[1:]
        sc = int(scnode.text)
        if misnodes:
            miss = map(int, [o.text for o in misnodes])
        else:
            miss = None
        return (sc, miss)
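
    # Illustrative sketch of get_strokes output (hypothetical data): a
    # character whose misc element lists stroke_count values 9, 8, 10
    # yields (9, [8, 10]); one with a single stroke_count of 9 yields
    # (9, None). The first stroke_count is the accepted count; the rest
    # are common miscounts.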
    def _get_nanori_nodes(self):
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None
    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        for o in nodes:
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d
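
    # Illustrative sketch of _get_attrdict's return shape (hypothetical
    # data): for path "reading_meaning/rmgroup/reading" and attr_name
    # "r_type", it groups the found elements by attribute value, e.g.
    #
    #     {"ja_on": [<Element>, ...], "ja_kun": [<Element>, ...]}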
    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d
    def _get_dictcodes(self):
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        return self._get_attrdict("query_code/q_code", "qc_type")
    def get_nanori(self):
        nanori = map(xml2text, self._get_nanori_nodes() or [])
        if nanori:
            return _(u"%s: %s") % (_(u"Nanori"), u"、".join(nanori))
    def get_readings(self, rtypes):
        """Gets readings as text strings.

        Takes in any number of reading keys, and returns a list
        containing user-friendly output strings.

        Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
        and nanori.

        Note: Nanori is also handled independently, as it is stored
        differently than the other readings.

        """
        d = {
            "ja_on": _(u"On-yomi"),
            "ja_kun": _(u"Kun-yomi"),
            "korean_h": _(u"Korean (Hangul)"),
            "korean_r": _(u"Korean (Romanized)"),
            "pinyin": _(u"Pinyin"),
        }
        romanized = ("korean_r", "pinyin")
        readings = mapdict(xml2text, self._get_reading_nodes())
        pieces = []
        for rt in rtypes:
            if rt == "nanori":
                s = self.get_nanori()
                if s:
                    pieces.append(s)
            elif rt in d:
                if rt not in readings:
                    continue
                separator = u", " if rt in romanized else u"、"
                reading_str = separator.join(readings[rt])
                pieces.append(_(u"%s: %s") % (d[rt], reading_str))
        return pieces
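
    # Hedged usage sketch (node is a Kanjidic2Node; actual output
    # depends on the loaded dictionary data):
    #
    #     for s in node.get_readings(("ja_on", "ja_kun", "nanori")):
    #         print s    # e.g. u"On-yomi: ...", u"Kun-yomi: ..."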
    def get_meanings(self):
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        pieces = []
        for lang in sorted(meanings):
            pieces.append(_(u"Meanings (%s): %s") %
                          (lang, u"; ".join(meanings[lang])))
        return pieces
    def get_dict_codes(self, keys, all=False):
        """Gets dictionary codes as strings for display to the user.

        Accepts a list of dictionary keys. To get all keys, set the
        all keyword to true. (The keys parameter will be ignored in
        this case.)

        """
        pieces = []
        dicts = self._get_dictcodes()
        for dcode in sorted(dicts):
            if (not all) and dcode not in keys:
                continue
            nodes = dicts[dcode]
            assert len(nodes) == 1, _(
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s") % \
                (self._get_literal(), len(nodes), dcode)
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                s = _(u"Index %s, volume %s, page %s") % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(_(u"%s: %s") % (dname, s))
        return pieces
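
    # Hedged output sketch (hypothetical values): a "moro" entry renders
    # as u"<name>: Index 12345, volume 1, page 100", where <name> comes
    # from kanjidic2_key_to_str; every other code renders as
    # u"<name>: <index text>".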
    def get_query_codes(self, keys, all=False):
        pieces = []
        qcodes = self._get_querycodes()
        for qcode in sorted(qcodes):
            if (not all) and qcode not in keys:
                continue
            nodes = qcodes[qcode]
            qname = qcode_to_desc(qcode)
            if qcode == "skip":
                d = {}
                for o in nodes:
                    d.setdefault(o.attrib.get("skip_misclass"), []).append(o)
                for misclass in sorted(d):
                    if misclass:
                        outname = _(u"%s miscode (%s)") % (qname, misclass)
                    else:
                        outname = qname
                    s = u", ".join(o.text for o in d[misclass])
                    pieces.append(_(u"%s: %s") % (outname, s))
            else:
                s = u", ".join(o.text for o in nodes)
                pieces.append(_(u"%s: %s") % (qname, s))
        return pieces
    def __unicode__(self):
        pieces = []

        pieces.append(u"=" * 70)
        pieces.append(_(u"Literal: %s") % self.literal)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Readings:"))
        r_strs = [u" %s" % s for s in
                  self.get_readings(
                      ("ja_on", "ja_kun", "nanori",
                       "korean_h", "korean_r", "pinyin"))]
        pieces.extend(r_strs)
        pieces.append(u"-" * 70)

        m_strs = [u" %s" % s for s in self.get_meanings()]
        pieces.extend(m_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Miscellaneous:"))
        jlpt = self.get_jlpt()
        if jlpt is not None:
            pieces.append(_(u" JLPT grade level: %d") % jlpt)
        grade = self.get_grade()
        if grade is not None:
            pieces.append(_(u" Jouyou grade level: %d") % grade)
        freq = self.get_freq()
        if freq is not None:
            pieces.append(_(u" Newspaper frequency: %d") % freq)
        strokes, misstrokes = self.get_strokes()
        pieces.append(_(u" Stroke count: %d") % strokes)
        if misstrokes:
            pieces.append(_(u" Common stroke miscounts: %s") %
                          ", ".join(map(str, misstrokes)))
        pieces.append(u"-" * 70)

        pieces.append(_(u"Dictionary codes:"))
        d_strs = [u" %s" % s for s in self.get_dict_codes([], all=True)]
        pieces.extend(d_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Query codes:"))
        qc_strs = [u" %s" % s for s in self.get_query_codes([], all=True)]
        pieces.extend(qc_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Other information:"))
        #cp_strs = self.get_codepoints()
        #rad_strs = self.get_rad_info()
        #variant_strs = self.get_variants()
        pieces.append(_(u" Unicode value: %04X") % ord(self.literal))

        pieces.append(u"=" * 70)

        return u"\n".join(pieces)

class Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        KANJIDIC2 is a large, heavy-to-parse file.  Although it takes
        a large amount of memory, it is better to retain it in memory
        to increase the speed of subsequent searches.

        """
        if not os.path.exists(filename):
            raise Exception(_(u"Dictionary file does not exist."))
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()
        self._check_version()
    def _check_version(self):
        version = int(self.header.find('file_version').text)
        assert version >= 4, _(
            u"This parser won't work with versions of KANJIDIC2 "
            u"older than version 4.")
        # Only warn for versions newer than the one this parser targets.
        if version > 4:
            s = _(u"Parser version is for version 4, detected version is %d"
                  ) % version
            warnings.warn(s)
    def load_via_etree(self):
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        et = ElementTree(file=f)
        f.close()
        nodes = et.getroot().getchildren()
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters
    def get_header(self):
        d = {}
        for o in self.header.getchildren():
            # o.text or o.tail may be None; guard before joining.
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))
    def search(self, query):
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c
    def create_indices(self):
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            self.by_kanji[char.literal] = char
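
# Minimal usage sketch, assuming a local copy of KANJIDIC2 (the file
# name here is hypothetical):
#
#     p = Parser("kanjidic2.xml.gz")
#     for node in p.search(u"日本"):
#         print encode_or_else(unicode(node))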

def encode_or_else(s):
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    lines = s.split(u"\n")
    out = []
    for line in lines:
        try:
            line.encode(charset)
        except UnicodeError:
            # Drop lines which cannot be displayed in the console charset.
            continue
        out.append(line)
    return u"\n".join(out)
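
# Hedged behavior sketch: on a cp932 console (Windows), a line holding a
# character outside cp932 is dropped entirely, e.g.
#
#     encode_or_else(u"abc\n\u0100")
#
# keeps only u"abc" under cp932, but keeps both lines under UTF-8.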

if __name__ == "__main__":
    import sys

    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        sys.exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        sys.exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)