#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""A parser for KANJIDIC2."""

from __future__ import absolute_import

import os, gzip, gettext, warnings
from xml.etree.cElementTree import ElementTree
gettext.install('pyjben', unicode=True)

from .kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc

def jis_kuten_to_hex(kuten):
    """Kuten string to hex conversion"""
    pieces = map(int, kuten.split(u'-'))
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
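
# Worked example for the formula above (value computed by hand, not from
# the original source): kuten u"16-01" -> ((16 + 0x20) << 8) + (1 + 0x20)
# == 0x3021, the JIS X 0208 code for 亜.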

def xml2text(o):
    return o.text

def mapdict(fn, d):
    result = {}
    for k, v in d.iteritems():
        result[k] = map(fn, v)
    return result
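
# Illustrative use (hypothetical nodes): mapdict(xml2text,
# {u"ja_on": [node_a, node_b]}) returns {u"ja_on": [node_a.text,
# node_b.text]}; fn is applied element-wise to each value list.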


class Kanjidic2Node(object):

    def __init__(self, xml_node):
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, _(u"Literal has more than one character!")
        return literal

    def get_grade(self):
        o = self.xml.find("misc/grade")
        # Explicit None check: ElementTree elements with no children
        # evaluate as False, so a bare "if o" would misbehave here.
        return int(o.text) if o is not None else None

    def get_freq(self):
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        o = self.xml.findall("misc/freq")
        if not o:
            return None
        assert len(o) == 1, _(
            u"Character %s: Expected 1 freq entry, found %d") % \
            (self._get_literal(), len(o))
        return int(o[0].text)

    def get_jlpt(self):
        o = self.xml.find("misc/jlpt")
        # Same explicit None check rationale as get_grade.
        return int(o.text) if o is not None else None

    def get_strokes(self):
        """Gets stroke count.

        Returns a tuple of (stroke_count, miscounts), where miscounts
        is either None or a list of common miscounts for the
        character.

        """
        nodes = self.xml.findall("misc/stroke_count")
        scnode, misnodes = nodes[0], nodes[1:]
        sc = int(scnode.text)
        if misnodes:
            miss = [int(o.text) for o in misnodes]
        else:
            miss = None
        return (sc, miss)
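
    # Illustrative return values (hypothetical data): a character counted
    # as 13 strokes with a common 14-stroke miscount yields (13, [14]);
    # with no recorded miscounts, (13, None).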

    def _get_nanori_nodes(self):
        nodes = self.xml.findall("reading_meaning/nanori")
        return nodes or None

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute."""
        d = {}
        nodes = self.xml.findall(path)
        for o in nodes:
            d.setdefault(o.attrib.get(attr_name), []).append(o)
        return d
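
    # Illustrative result (hypothetical data):
    #   self._get_attrdict("dic_number/dic_ref", "dr_type")
    # might return {u"moro": [<Element>], u"nelson_c": [<Element>]},
    # i.e. one list of XML nodes per distinct attribute value.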

    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        # NEEDS AN UPDATE: Just noticed, rmgroup allows
        # readings/meanings to be meaningfully grouped together.  We
        # -can- dump everything together, but we -should- handle the
        # groups.
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meaning_nodes(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        # NEEDS AN UPDATE: See _get_reading_nodes.
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        if None in meaning_d:
            meaning_d['en'] = meaning_d[None]
            del meaning_d[None]
        return meaning_d

    def _get_dictcodes(self):
        return self._get_attrdict("dic_number/dic_ref", "dr_type")

    def _get_querycodes(self):
        return self._get_attrdict("query_code/q_code", "qc_type")

    def get_nanori(self):
        nanori = map(xml2text, self._get_nanori_nodes() or [])
        if nanori:
            return _(u"%s: %s") % (_(u"Nanori"), u"、".join(nanori))

    def get_readings(self, rtypes):
        """Gets readings as text strings.

        Takes in any number of reading keys, and returns a list
        containing user-friendly output strings.

        Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
        and nanori.

        Note: Nanori is also handled independently, as it is stored
        differently than the other readings.

        """
        d = {
            "ja_on": _(u"On-yomi"),
            "ja_kun": _(u"Kun-yomi"),
            "korean_h": _(u"Korean (Hangul)"),
            "korean_r": _(u"Korean (Romanized)"),
            "pinyin": _(u"Pinyin"),
        }
        romanized = ("korean_r", "pinyin")
        readings = mapdict(xml2text, self._get_reading_nodes())
        pieces = []
        for rt in rtypes:
            if rt == "nanori":
                s = self.get_nanori()
                if s:
                    pieces.append(s)
            elif rt in d:
                if rt not in readings:
                    continue
                separator = u", " if rt in romanized else u"、"
                reading_str = separator.join(readings[rt])
                pieces.append(_(u"%s: %s") % (d[rt], reading_str))
        return pieces
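
    # Illustrative call (data shown is for 水): get_readings(("ja_on",
    # "ja_kun")) would return strings such as
    # [u"On-yomi: スイ", u"Kun-yomi: みず"].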

    def get_meanings(self):
        meanings = mapdict(xml2text, self._get_meaning_nodes())
        pieces = []
        for lang in sorted(meanings):
            pieces.append(_(u"Meanings (%s): %s") %
                          (lang, u"; ".join(meanings[lang])))
        return pieces

    def get_dict_codes(self, keys, all=False):
        """Gets dictionary codes as strings for display to the user.

        Accepts a list of dictionary keys. To get all keys, set the
        all keyword to true. (The keys parameter will be ignored in
        this case.)

        """
        pieces = []
        dicts = self._get_dictcodes()
        for dcode in sorted(dicts):
            if (not all) and dcode not in keys:
                continue
            nodes = dicts[dcode]
            assert len(nodes) == 1, _(
                u"Character %s: Multiple (%d) entries found for "
                u"dict code %s") % \
                (self._get_literal(), len(nodes), dcode)
            o = nodes[0]
            dname = kanjidic2_key_to_str(dcode)
            if dcode == "moro":
                s = _(u"Index %s, volume %s, page %s") % \
                    (o.text, o.attrib['m_vol'], o.attrib['m_page'])
            else:
                s = o.text
            pieces.append(_(u"%s: %s") % (dname, s))
        return pieces
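
    # Illustrative call: get_dict_codes(["moro"]) returns one string per
    # matching code, e.g. u"<name>: Index 5, volume 1, page 525", where
    # <name> comes from kanjidic2_key_to_str and the index/volume/page
    # values here are hypothetical.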

    def get_query_codes(self, keys, all=False):
        pieces = []
        qcodes = self._get_querycodes()
        for qcode in sorted(qcodes):
            if (not all) and qcode not in keys:
                continue
            nodes = qcodes[qcode]
            qname = qcode_to_desc(qcode)
            if qcode == "skip":
                d = {}
                for o in nodes:
                    d.setdefault(o.attrib.get("skip_misclass"), []).append(o)
                for misclass in sorted(d):
                    if misclass:
                        outname = _(u"%s miscode (%s)") % (qname, misclass)
                    else:
                        outname = qname
                    s = u", ".join(o.text for o in d[misclass])
                    pieces.append(_(u"%s: %s") % (outname, s))
            else:
                s = u", ".join(o.text for o in nodes)
                pieces.append(_(u"%s: %s") % (qname, s))
        return pieces
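
    # Illustrative SKIP output (hypothetical values): a character with
    # pattern 2-4-3 plus a stroke_count misclassification of 2-3-3 yields
    # two lines, e.g. u"<desc>: 2-4-3" and u"<desc> miscode (stroke_count):
    # 2-3-3", where <desc> comes from qcode_to_desc("skip").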

    def __unicode__(self):

        def indent_strs(strs):
            return [u" %s" % s for s in strs]

        pieces = []

        pieces.append(u"=" * 70)
        pieces.append(_(u"Literal: %s") % self.literal)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Readings:"))
        r_strs = indent_strs(self.get_readings(
            ("ja_on", "ja_kun", "nanori", "korean_h", "korean_r", "pinyin")))
        pieces.extend(r_strs)
        pieces.append(u"-" * 70)

        m_strs = indent_strs(self.get_meanings())
        pieces.extend(m_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Miscellaneous:"))
        # Cache each getter's result; the getters re-walk the XML, so
        # calling them once apiece is both correct and cheaper.
        jlpt = self.get_jlpt()
        if jlpt is not None:
            pieces.append(_(u" JLPT grade level: %d") % jlpt)
        grade = self.get_grade()
        if grade is not None:
            pieces.append(_(u" Jouyou grade level: %d") % grade)
        freq = self.get_freq()
        if freq is not None:
            pieces.append(_(u" Newspaper frequency: %d") % freq)
        strokes, misstrokes = self.get_strokes()
        pieces.append(_(u" Stroke count: %d") % strokes)
        if misstrokes:
            pieces.append(_(u" Common stroke miscounts: %s") %
                          ", ".join(map(str, misstrokes)))
        pieces.append(u"-" * 70)

        pieces.append(_(u"Dictionary codes:"))
        d_strs = indent_strs(self.get_dict_codes([], all=True))
        pieces.extend(d_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Query codes:"))
        qc_strs = indent_strs(self.get_query_codes([], all=True))
        pieces.extend(qc_strs)
        pieces.append(u"-" * 70)

        pieces.append(_(u"Other information:"))
        # RADICAL node info
        #rad_strs = self.get_rad_info()
        # CODEPOINT node info
        #cp_strs = indent_strs(self.get_codepoints())
        pieces.append(_(u" Unicode value: %04X") % ord(self.literal))
        # MISC node children
        #variant_strs = self.get_variants()  # AKA cross refs
        #radname_strs = self.get_radical_name()  # "T2" KANJIDIC code

        pieces.append(u"=" * 70)

        return u"\n".join(pieces)


class Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        KANJIDIC2 is a large, heavy-to-parse file.  Although keeping
        the parsed data takes a large amount of memory, it is better
        to retain it in memory to increase the speed of subsequent
        searches.

        """
        if not os.path.exists(filename):
            raise Exception(_(u"Dictionary file does not exist."))
        self.filename = filename
        self.encoding = encoding
        self.indexed = False
        self.header, self.characters = self.load_via_etree()
        self._check_version()

    def _check_version(self):
        version = int(self.header.find('file_version').text)
        assert version >= 4, _(
            u"This parser won't work with versions of KANJIDIC2 "
            u"older than version 4.")
        if version > 4:
            s = _(u"Parser version is for version 4, detected version is %d"
                  ) % version
            warnings.warn(s)

    def load_via_etree(self):
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        et = ElementTree(file=f)
        f.close()
        nodes = et.getroot().getchildren()
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        d = {}
        for o in self.header.getchildren():
            # text/tail may be None on some nodes; treat missing as empty.
            cdata = u"".join((o.text or u"", o.tail or u"")).strip()
            d[o.tag] = cdata
        return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c:
                yield c

    def create_indices(self):
        if self.indexed:
            return
        self.indexed = True
        self.by_kanji = {}
        for char in self.characters:
            # Each node already extracted its literal in __init__.
            self.by_kanji[char.literal] = char


def encode_or_else(s):
    """Return s minus any lines the output charset cannot encode."""
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    out = []
    for line in s.split(u"\n"):
        try:
            line.encode(charset)  # test encodability only; result unused
            out.append(line)
        except UnicodeEncodeError:
            pass
    return u"\n".join(out)
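
# Illustrative behavior: on a Windows ("nt") console, encode_or_else drops
# any output line containing characters outside cp932 instead of letting a
# later print raise UnicodeEncodeError; elsewhere the utf-8 pass keeps all
# lines.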


if __name__ == "__main__":
    import sys

    try:
        dfname, args = sys.argv[1], sys.argv[2:]
        assert args
    except (IndexError, AssertionError):
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        exit(-1)

    try:
        p = Parser(dfname)
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    print u"HEADER"
    print u"======"
    print p.get_header()
    print
    print u"%d characters found" % len(p.characters)

    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)