Forgot the -a on git commit.
[jben2_gui.git] / parsers / kanjidic2.py
blob 5017635d56e8fda07272073c6f931dfec810f1ff
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""A parser for KANJIDIC2.

This module is incomplete and currently just holds helper code for the
KANJIDIC parser.

"""

import gzip, xml.sax, gettext
gettext.install('pyjben', unicode=True)

from parsers.kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
class Kanjidic2Entry(object):

    def __init__(self):
        # Key info
        self.literal = None
        self.jis = None
        self.meanings = {}
        self.ja_kun = []
        self.ja_on = []
        self.nanori = []

        # Secondary info
        self.strokes = None
        self.strokes_miss = []
        self.freq = None
        self.grade = None
        self.jlpt = None

        # Info of low importance for most target users
        self.radical = None
        self.radical_c = None  # "Classic" KangXi Zidian radical
        self.radname = None
        self.pinyin = []
        self.korean_h = []
        self.korean_r = []

        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
        self.qcodes = {}
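        # Illustrative only: after parsing, qcodes is keyed by the KANJIDIC2
        # qc_type attribute, e.g. {u'skip': u'2-3-3', u'four_corner': u'5011.0'}
        # (hypothetical values).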
        # Dictionary codes
        # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn
        # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT
        self.dcodes = {}
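        # Illustrative only: dcodes is keyed by the KANJIDIC2 dr_type
        # attribute, e.g. {u'nelson_c': 1234, u'halpern_njecd': 56}
        # (hypothetical values).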
        # Dictionary-related metadata
        self.xref = []
        self.misclass = []

        self.unparsed = []
    def to_string(self, **kwargs):
        """A default "to-string" dump of a Kanjidic2Entry."""
        lines = []
        lines.append(_(u"Literal: %s") % self.literal)
        if self.ja_on:
            lines.append(_(u"Onyomi: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.ja_on]))
        if self.ja_kun:
            lines.append(_(u"Kunyomi: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.ja_kun]))
        if self.nanori:
            lines.append(_(u"Nanori: %s")
                         % u"、".join(
                             [jstring_convert(us) for us in self.nanori]))
        if self.meanings:
            for k, v in self.meanings.iteritems():
                lines.append(_(u"Meaning (%s): %s") % (k, _(u"; ").join(v)))

        if self.strokes:
            lines.append(_(u"Stroke count: %d") % self.strokes)
        if self.strokes_miss:
            lines.append(_(u"Common miscounts: %s")
                         % _(u", ").join(self.strokes_miss))
        if self.freq:
            lines.append(_(u"Newspaper Frequency: %d") % self.freq)
        if self.grade:
            if self.grade in range(1, 7):
                grade_str = unicode(self.grade)
            elif self.grade == 8:
                grade_str = _(u"General usage")
            elif self.grade == 9:
                grade_str = _(u"Jinmeiyou (Characters for names)")
            elif self.grade is None:
                grade_str = _(u"Unspecified")
            else:
                grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
            lines.append(_(u"Jouyou Grade: %s") % grade_str)
        if self.jlpt:
            lines.append(_(u"JLPT Level: %d") % self.jlpt)
        # Query codes
        if self.qcodes:
            for k, v in self.qcodes.iteritems():
                desc = qcode_to_desc(k)
                lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))

                if k == 'skip' and self.misclass:
                    miscodes = []
                    for code in self.misclass:
                        code_type = code[:2]
                        code_val = code[2:]
                        if code_type == u'SP':    # "stroke_count"
                            miscodes.append(_(u"%s (stroke count)") % code_val)
                        elif code_type == u'PP':  # "posn"
                            miscodes.append(_(u"%s (position)") % code_val)
                        elif code_type == u'BP':  # "stroke_and_posn"
                            miscodes.append(_(u"%s (stroke and position)") % code_val)
                        elif code_type == u'RP':  # "stroke_diff"
                            miscodes.append(_(u"%s (debatable count)") % code_val)
                        else:
                            lines.append(_(u"Unrecognized misclassification code: %s")
                                         % unicode(code))
                    if miscodes:
                        lines.append(_(u"SKIP miscodes: %s")
                                     % _(u", ").join(miscodes))
        if self.dcodes:
            # Probably we should sort these in some way... but for
            # now, just display.
            for k, v in self.dcodes.iteritems():
                k = kanjidic2_key_to_str(k)
                lines.append(_(u"%s: %s") % (k, v))

        if self.radname:
            lines.append(_(u"Radical name: %s") % self.radname)
        if self.radical:
            lines.append(_(u"Nelson Radical: %d") % self.radical)
        if self.radical_c:
            lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)

        if self.korean_h:
            lines.append(_(u"Korean: %s")
                         % _(u", ").join(self.korean_h))
        if self.korean_r:
            lines.append(_(u"Korean romanization: %s")
                         % _(u", ").join(self.korean_r))
        if self.pinyin:
            lines.append(_(u"Pinyin romanization: %s")
                         % _(u", ").join(self.pinyin))
        # The Unicode code point is always available: it is just the literal.
        lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
        if self.jis:
            def jis_hex_to_kuten(hex_code):
                """Convert a JIS hex code to a KANJIDIC2-style kuten string."""
                return u"%s-%s" % (
                    (((hex_code >> 8) & 0xFF) - 0x20),
                    ((hex_code & 0xFF) - 0x20))
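            # Worked example (illustrative): for 0x3021 (JIS X 0208 code of 亜),
            # (0x30 - 0x20) = 16 and (0x21 - 0x20) = 1, so the kuten string is
            # u"16-1".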
            kuten = jis_hex_to_kuten(self.jis)
            lines.append(_(u"JIS code: Kuten = %s, Hex = 0x%04X")
                         % (kuten, self.jis))
        #self.xref = []
        if self.xref:
            # From KANJIDIC documentation:
            #
            #   Xxxxxxx -- a cross-reference code. An entry of, say,
            #   XN1234 will mean that the user is referred to the kanji
            #   with the (unique) Nelson index of 1234. XJ0xxxx and
            #   XJ1xxxx are cross-references to the kanji with the JIS
            #   hexadecimal code of xxxx. The `0' means the reference is
            #   to a JIS X 0208 kanji, and the `1' references a JIS X
            #   0212 kanji.
            #
            # For now, just dump to the console.
            lines.append(_(u"Crossref codes: %s") % ", ".join(self.xref))

        # From J-Ben 1:
        #/* Crossref codes */
        #if(!k.var_j208.empty())
        #    result << "<li>JIS-208: " << k.var_j208 << "</li>";
        #if(!k.var_j212.empty())
        #    result << "<li>JIS-212: " << k.var_j212 << "</li>";
        #if(!k.var_j213.empty())
        #    result << "<li>JIS-213: " << k.var_j213 << "</li>";
        #if(!k.var_ucs.empty())
        #    result << "<li>Unicode: " << k.var_ucs << "</li>";
        #if(!k.var_deroo.empty())
        #    result << "<li>De Roo code: " << k.var_deroo << "</li>";
        #if(!k.var_nelson_c.empty())
        #    result << "<li>Modern Reader's Japanese-English Character "
        #              "Dictionary (Nelson): " << k.var_nelson_c << "</li>";
        #if(!k.var_njecd.empty())
        #    result << "<li>New Japanese-English Character Dictionary "
        #              "(Halpern): " << k.var_njecd << "</li>";
        #if(!k.var_oneill.empty())
        #    result << "<li>Japanese Names (O'Neill): " << k.var_oneill
        #           << "</li>";
        #if(!k.var_s_h.empty())
        #    result << "<li>Spahn/Hadamitzky Kanji Dictionary code: "
        #           << k.var_s_h << "</li>";

        if self.unparsed:
            lines.append(_(u"Unrecognized codes: %s")
                         % (u", ").join(self.unparsed))

        return u"\n".join(lines)
class KD2SAXHandler(xml.sax.handler.ContentHandler):

    """SAX handler for KANJIDIC2."""

    def __init__(self, *args, **kwargs):
        #self.limit = 1
        xml.sax.handler.ContentHandler.__init__(self, *args, **kwargs)
        self.parsing = False
        self.kanji = None
        self.path = []
        self.full_keys = set()
        self.data = {}
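        # self.data maps each parsed literal (the kanji character itself) to
        # its Kanjidic2Entry; entries are added as they close in endElement.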
    def get_path(self):
        return u"/".join([i[0] for i in self.path])

    def get_attr_str(self):
        return u", ".join([u"%s: %s" % (k, v)
                           for k, v in self.path[-1][1].items()])
    def startElement(self, name, attrs):
        if name == "character":
            self.parsing = True
            #print "startElement called:", name, attrs
            #print "Beginning of character entry found"
            self.kanji = Kanjidic2Entry()
        elif self.parsing:
            self.path.append((name, attrs))
            #print u"Current path: %s, attributes: %s" % \
            #    (self.get_path(), str(attrs.items()))
    def endElement(self, name):
        if self.parsing:
            if self.path:
                if name != self.path[-1][0]:
                    # Shouldn't ever happen, but mistakes *can* slip in...
                    print u"Mismatch detected, path is %s, element name is %s" \
                          % (self.get_path(), name)
                else:
                    self.path.pop()
            if name == "character":
                #print "endElement called:", name
                #print "End of character entry reached"
                self.data[self.kanji.literal] = self.kanji
                self.kanji = None
                self.parsing = False
                #self.limit -= 1
                #if self.limit <= 0: exit(0)
    def characters(self, content):
        content = content.strip()
        if content and self.parsing:
            # Sanity check: see if the current node type is already
            # included under a different full path.
            #path = self.get_path()
            #self.full_keys.add(path)
            #keys = [k for k in self.full_keys if k[-(len(node)):] == node]
            #if len(keys) != 1:
            #    print "CHECKME: Node: %s, Keys: %s" % (node, str(keys))

            node, attrs = self.path[-1]

            # I am exploiting the fact that any given element type can
            # only belong to one type of parent. For example,
            # "reading" objects are always fully pathed-out to
            # reading_meaning/rmgroup/reading.
            #
            # In case this changes in the future, I've attached
            # comments of the full paths below.

            if node == u"literal":        # literal
                self.kanji.literal = content
            elif node == u"reading":      # reading_meaning/rmgroup/reading
                # These will do stuff in the future...
                #on_type = attrs.get(u"on_type")
                #r_status = attrs.get(u"r_status")
                # Store reading
                getattr(self.kanji, attrs[u'r_type']).append(content)
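                # Note: r_type values (e.g. u'ja_on', u'ja_kun') are expected
                # to match attribute names on Kanjidic2Entry; an unexpected
                # r_type would raise AttributeError here.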
            elif node == u"meaning":      # reading_meaning/rmgroup/meaning
                m_lang = attrs.get(u'm_lang', u'en')
                self.kanji.meanings.setdefault(m_lang, []).append(content)
            elif node == u"nanori":       # reading_meaning/nanori
                self.kanji.nanori.append(content)
            elif node == u"grade":        # misc/grade
                self.kanji.grade = int(content)
            elif node == u"freq":         # misc/freq
                self.kanji.freq = int(content)
            elif node == u"jlpt":         # misc/jlpt
                self.kanji.jlpt = int(content)
            elif node == u"stroke_count": # misc/stroke_count
                if not self.kanji.strokes:
                    self.kanji.strokes = int(content)
                else:
                    self.kanji.strokes_miss.append(int(content))
            elif node == u"q_code":       # query_code/q_code
                qc_type = attrs[u'qc_type']
                if qc_type == 'skip':
                    misclass = attrs.get(u'skip_misclass')
                    if misclass:
                        # HANDLE LATER, TODO
                        pass
                    else:
                        self.kanji.qcodes[qc_type] = content
                else:
                    self.kanji.qcodes[qc_type] = content
            elif node == u"dic_ref":      # dic_number/dic_ref
                attr = attrs[u'dr_type']
                if attr == u'moro':
                    m_vol = attrs.get(u'm_vol')
                    m_page = attrs.get(u'm_page')
                    # Do something with this... TODO
                else:
                    try:
                        self.kanji.dcodes[attr] = int(content)
                    except ValueError:
                        self.kanji.dcodes[attr] = content
            elif node == u"cp_value":     # codepoint/cp_value
                pass
            elif node == u"rad_value":    # radical/rad_value
                pass
            elif node == u"variant":      # misc/variant
                pass
            elif node == u"rad_name":     # misc/rad_name
                pass
            else:
                try:
                    path = self.get_path()
                    print u"Characters found: path=%s, attrs=(%s), content: %s" \
                          % (path,
                             self.get_attr_str(),
                             content)
                    # Do some stuff based upon the current path and content
                except UnicodeEncodeError:
                    pass  # Can't display code on console; just squelch the output.
                except Exception, e:
                    print u"EXCEPTION occurred:", unicode(e.__class__.__name__), unicode(e)
class Kanjidic2Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        self.filename = filename
        self.encoding = encoding
        self.cache = None

    def load_via_sax(self):
        if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")

        sh = KD2SAXHandler()
        isource = xml.sax.xmlreader.InputSource()
        isource.setEncoding("utf-8")
        isource.setByteStream(f)
        xml.sax.parse(isource, sh)
        f.close()
        self.cache = sh.data
    def search(self, search_str, use_cache=True):
        # Caching has 2 meanings in J-Ben:
        # 1. Storing the results of a previous read locally.
        # 2. Reading in pre-parsed data from a file on disk.
        #
        # KANJIDIC2 is a huge file; keeping it all in memory is costly, but
        # re-reading and re-parsing the whole file from disk on every search
        # would be even worse.
        if (not use_cache) or (not self.cache):
            # Pick a loader.
            # Opt 1: sax... very powerful, but too much code with my impl?
            # Opt 2: elementtree... more memory required, loads
            #        everything at once...
            # Opt 3: sax... redo to store all vars as lists, or similar.
            self.load_via_sax()  # First attempt of a SAX-style loader.

        for char in search_str:
            kanji = self.cache.get(char)
            if kanji: yield kanji
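    # Minimal usage sketch (mirrors the __main__ block below; the file name is
    # hypothetical):
    #
    #   kp = Kanjidic2Parser("kanjidic2.xml.gz")
    #   for entry in kp.search(u"日本"):
    #       print entry.to_string()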
if __name__ == "__main__":
    import sys, os

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        exit(-1)
    try:
        kp = Kanjidic2Parser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
        exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
        exit(-1)

    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    for i, kanji in enumerate(kp.search(sys.argv[2].decode(charset))):
        lines = kanji.to_string().split(u'\n')

        def encode_or_else(s):
            # Keep only lines which can be encoded for a cp932 console;
            # return None (filtered out below) for those which cannot.
            try:
                val = s.encode("cp932")
                val = s
            except:
                val = None
            return val

        xlines = map(encode_or_else, lines)
        xlines = [l for l in xlines if l]
        xlines = u"\n".join(xlines)
        print _(u"Entry %d:\n%s\n") % (i+1, xlines)