#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""A parser for KANJIDIC.

This parser is dependent on a small amount of code kept in the
kanjidic2 parser, so be sure to grab both if you are using these
modules in your own programs.

"""

import re, gzip, gettext
gettext.install('pyjben', unicode=True)

from parsers.kanjidic_common \
    import jstring_convert, kanjidic2_key_to_str, qcode_to_desc

# Matches a leading non-digit key followed by the remainder (used for
# splitting crossref codes such as "N123" into ("N", "123")).
alpha_regex = re.compile(u"(^[^0-9]+)(.*)")

# Copied from J-Ben 1.x and modified using Gnome Character Map's
# "Unicode Block" information.
# Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if the single character uc is in the Hiragana block.

    Unicode range checked: 3040..309F (Hiragana).
    """
    return 0x3040 <= ord(uc) <= 0x309F
def is_katakana(uc):
    """Return True if the single character uc is in the Katakana block.

    Unicode range checked: 30A0..30FF (Katakana).  The Katakana
    Phonetic Extensions block (31F0..31FF) is deliberately excluded,
    as it is not currently used in J-Ben.
    """
    return 0x30A0 <= ord(uc) <= 0x30FF
def is_furigana(uc):
    """Return True if uc is kana (either hiragana or katakana)."""
    return is_katakana(uc) or is_hiragana(uc)
def jis_hex_to_kuten(hex_code):
    """Convert a JIS hex code point to a KANJIDIC2-style kuten string.

    The high byte gives the ku (row) and the low byte the ten (cell);
    both are offset from 0x20.
    """
    ku = ((hex_code >> 8) & 0xFF) - 0x20
    ten = (hex_code & 0xFF) - 0x20
    return u"%d-%d" % (ku, ten)
def kanjidic_key_to_kanjidic2(dkey):
    """Converts KANJIDIC dictionary keys to KANJIDIC2.

    If unable to find a KANJIDIC2 key, returns the original key.

    """
    # NOTE: the closing brace of this dict literal was lost in a bad
    # merge/paste; restored here so the module parses again.
    d = {
        "H": "halpern_njecd",
        "N": "nelson_c",
        "V": "nelson_n",
        "IN": "sh_kk",
        "MN": "moro",
        "E": "henshall",
        "K": "gakken",
        "L": "heisig",
        "O": "oneill_names",
        "DB": "busy_people",
        "DC": "crowley",
        "DF": "jf_cards",
        "DG": "kodansha_compact",
        "DH": "henshall3",
        "DJ": "kanji_in_context",
        "DK": "halpern_kkld",
        "DO": "oneill_kk",
        "DS": "sakade",
        "DT": "tutt_cards",
        "DM": "maniette"
        }
    return d.get(dkey, dkey)
102 class KanjidicEntry(object):
104 def __init__(self, raw_entry):
105 # Key info
106 self.literal = None
107 self.meanings = []
108 self.kunyomi = []
109 self.onyomi = []
110 self.nanori = []
111 # Secondary info
112 self.strokes = None
113 self.strokes_miss = []
114 self.freq = None
115 self.grade = None
116 self.jlpt = None
117 # Info of low importance for most target users
118 self.jis = None
119 self.radical = None
120 self.radical_c = None # "Classic" KangXi Zidian radical
121 self.radname = None
122 self.pinyin = []
123 self.korean = []
124 # "Query codes": Pattern-based lookup
125 # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
126 self.qcodes = {}
127 # Dictionary codes
128 self.dcodes = {}
129 # Dictionary-related metadata
130 self.xref = []
131 self.misclass = []
132 self.unparsed = []
134 self.parse_entry(raw_entry)
136 def parse_entry(self, raw_entry):
137 if not raw_entry:
138 return None
140 state = ParserState() # Holds "t class"
142 # First 2 fields are always the same
143 pieces = raw_entry.split(None, 2)
144 misc = pieces.pop()
145 self.jis = int(pieces.pop(), 16)
146 self.literal = pieces.pop()
148 # Parse the remainder
149 si = ei = 0
150 while si < len(misc):
151 c = misc[si]
152 i = ord(c)
153 if c == u' ':
154 si += 1
155 continue
156 if i > 0xFF or c in (u'-', u'.'):
157 # Parse Japanese
158 ei = misc.find(u' ', si+1)
159 if ei == -1:
160 ei = len(misc) + 1
161 sub = misc[si:ei]
163 self._parse_japanese(state, sub)
164 elif c == u'{':
165 # Parse Translation
166 si += 1 # Move si inside of {
167 ei = misc.find(u'}', si+1)
168 if ei == -1:
169 ei = len(misc) + 1
170 sub = misc[si:ei]
171 ei += 1 # Move ei past }
173 self.meanings.append(sub)
174 else:
175 # Parse info field
176 ei = misc.find(u' ', si+1)
177 if ei == -1:
178 ei = len(misc) + 1
179 sub = misc[si:ei]
181 self._parse_info(state, sub)
183 si = ei + 1
185 def _parse_japanese(self, state, data):
186 if not state.t_class:
187 # Check hiragana/katakana
188 for c in data:
189 if is_hiragana(c):
190 self.kunyomi.append(data)
191 break
192 elif is_katakana(c):
193 self.onyomi.append(data)
194 break
195 elif state.t_class == 1:
196 self.nanori.append(data)
197 elif state.t_class == 2:
198 self.radname = data
200 def _parse_info(self, state, data):
201 onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
202 strval_dicts = set(('DB',))
203 intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
204 'DK', 'DO', 'DS', 'DT', 'DM'))
205 try:
206 c = data[0]
207 if c == 'U':
208 # Unicode value - we alread store the literal as unicode, so let's
209 # use this as our encoding sanity check!
210 assert ord(self.literal) == int(data[1:], 16), \
211 "Encoding error detected"
212 elif c == 'B':
213 self.radical = int(data[1:])
214 elif c == 'C':
215 self.radical_c = int(data[1:])
216 elif c == 'F':
217 self.freq = int(data[1:])
218 elif c == 'G':
219 self.grade = int(data[1:])
220 elif c == 'J':
221 self.jlpt = int(data[1:])
222 elif c == 'S':
223 i = int(data[1:])
224 if not self.strokes:
225 self.strokes = i
226 else:
227 self.strokes_miss.append(i)
228 elif c == 'W':
229 self.korean.append(data[1:])
230 elif c == 'Y':
231 self.pinyin.append(data[1:])
232 elif c == 'X':
233 self.xref.append(data[1:])
234 elif c == 'Z':
235 self.misclass.append(data[1:])
236 elif c == 'T':
237 state.t_class = int(data[1:])
238 # Below this point is dictionary/query codes.
239 elif c in onechar_dicts:
240 self.dcodes[c] = data[1:]
241 elif c == 'P':
242 # SKIP codes.
243 # Thanks to changes in permissible SKIP code usage (change to
244 # Creative Commons licensing in January 2008), we can now use
245 # this without problems. Jack Halpern, thank you!
246 if self.qcodes.get('skip'):
247 print "ALERT! ALERT! self.skip already set!"
248 exit(1)
249 self.qcodes['skip'] = data[1:];
250 elif c == 'Q':
251 # Four Corner code
252 self.qcodes['four_corner'] = data[1:]
253 elif c == 'I': # Spahn/Hadamitzky dictionaries
254 if data[1] =='N':
255 # IN = Kanji & Kana (Spahn, Hadamitzky)
256 self.dcodes[data[:2]] = data[2:]
257 else:
258 # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
259 self.qcodes['sh_desc'] = data[1:]
260 elif c == 'M':
261 # Morohashi Daikanwajiten
262 self.dcodes[data[:2]] = data[2:]
263 elif c == 'D':
264 key = data[:2]
265 if key in intval_dicts:
266 self.dcodes[key] = int(data[2:])
267 elif key in strval_dicts:
268 self.dcodes[key] = data[2:]
269 elif key == 'DR':
270 # Query Code: 2001 Kanji (De Roo)
271 self.qcodes['deroo'] = int(data[2:])
272 else:
273 self.unparsed.append(data)
274 else:
275 self.unparsed.append(data)
276 except:
277 self.unparsed.append(data)
279 def to_string(self, **kwargs):
280 """A default "to-string" dump of a KanjidicEntry."""
281 lines = []
282 lines.append(_(u"Literal: %s") % self.literal)
283 if self.onyomi:
284 lines.append(_(u"Onyomi: %s")
285 % u"、".join(
286 [jstring_convert(us) for us in self.onyomi]))
287 if self.kunyomi:
288 lines.append(_(u"Kunyomi: %s")
289 % u"、".join(
290 [jstring_convert(us) for us in self.kunyomi]))
291 if self.nanori:
292 lines.append(_(u"Nanori: %s")
293 % u"、".join(
294 [jstring_convert(us) for us in self.nanori]))
295 if self.meanings:
296 lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))
298 if self.strokes:
299 lines.append(_(u"Stroke count: %d") % self.strokes)
300 if self.strokes_miss:
301 lines.append(_(u"Common miscounts: %s")
302 % _(u", ").join(self.strokes_miss))
303 if self.freq:
304 lines.append(_(u"Newspaper Frequency: %d") % self.freq)
305 if self.grade:
306 if self.grade in range(1, 7):
307 grade_str = unicode(self.grade)
308 elif self.grade == 8:
309 grade_str = _(u"General usage")
310 elif self.grade == 9:
311 grade_str = _(u"Jinmeiyou (Characters for names)")
312 elif self.grade == None:
313 grade_str = _(u"Unspecified")
314 else:
315 grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
316 lines.append(_(u"Jouyou Grade: %s") % grade_str)
317 if self.jlpt:
318 lines.append(_(u"JLPT Level: %d") % self.jlpt)
320 # Query codes
321 if self.qcodes:
322 for k, v in self.qcodes.iteritems():
323 desc = qcode_to_desc(k)
324 lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
326 if k == 'skip' and self.misclass:
327 miscodes = []
328 for code in self.misclass:
329 code_type = code[:2]
330 code_val = code[2:]
331 if code_type == u'SP': # "stroke_count"
332 miscodes.append(_(u"%s (stroke count)") % code_val)
333 elif code_type == u'PP': # "posn"
334 miscodes.append(_(u"%s (position)") % code_val)
335 elif code_type == u'BP': # "stroke_and_posn"
336 miscodes.append(_(u"%s (stroke and position)") % code_val)
337 elif code_type == u'RP': # "stroke_diff"
338 miscodes.append(_(u"%s (debatable count)") % code_val)
339 else:
340 lines.append(_(u"Unrecognized misclassification code: %s")
341 % unicode(code))
342 if miscodes:
343 lines.append(_(u"SKIP miscodes: %s")
344 % _(u", ").join(miscodes))
346 if self.dcodes:
347 # Probably we should sort these in some way... but for
348 # now, just display.
349 for k, v in self.dcodes.iteritems():
350 if k == "MP": continue
351 k = kanjidic2_key_to_str(
352 kanjidic_key_to_kanjidic2(k))
353 if k == "MN":
354 lines.append(_(u"%s: %s") % (k, v))
355 else:
356 vp = self.dcodes.get("MP")
357 if vp:
358 vol, page = vp.split('.', 1)
359 lines.append(_(u"%s: Index %s, Volume %s, Page %s")
360 % (k, v, vol, page))
361 else:
362 lines.append(_(u"%s: %s") % (k, v))
364 if self.radname:
365 lines.append(_(u"Radical name: %s") % self.radname)
366 if self.radical:
367 lines.append(_(u"Nelson Radical: %d") % self.radical)
368 if self.radical_c:
369 lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)
371 if self.korean:
372 lines.append(_(u"Korean romanization: %s")
373 % _(u", ").join(self.korean))
374 if self.pinyin:
375 lines.append(_(u"Pinyin romanization: %s")
376 % _(u", ").join(self.pinyin))
378 # "self.unicode" is always present. ;)
379 lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
380 if self.jis:
381 kuten = jis_hex_to_kuten(self.jis)
382 jis_set = u"208" # For now, hard-code it.
383 lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
384 % (jis_set, kuten, self.jis))
386 if self.xref:
387 for ref in self.xref:
388 if ref[0] == 'J':
389 # JIS crossrefs
390 jis_id = ref[1]
391 hexcode = int(ref[2:], 16)
392 kuten = jis_hex_to_kuten(hexcode)
393 if jis_id == '0':
394 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
395 u"Hex = 0x%04X") % (kuten, hexcode))
396 elif jis_id == '1':
397 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
398 u"Hex = 0x%04X") % (kuten, hexcode))
399 else:
400 s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
401 u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
402 lines.append(s)
403 # Not really "unparsed", but it is unhandled...
404 unparsed.append(s)
405 pass
406 else:
407 m = alpha_regex.match(ref)
408 k = kanjidic2_key_to_str(
409 kanjidic_key_to_kanjidic2(m.group(1)))
411 v = ref[m.span()[1]:]
412 lines.append(_(u"Crossref: %s: %s")
413 % (k, m.group(2)))
415 if self.unparsed:
416 lines.append(_(u"Unrecognized codes: %s")
417 % (u", ").join(self.unparsed))
418 pass
420 return u"\n".join(lines)
422 def __unicode__(self):
423 """Dummy string dumper"""
424 strs = [self.literal]
425 for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
426 strs.extend(l)
427 if self.radname:
428 strs.insert(3, self.radname)
430 return _(u", ").join(strs)
class ParserState(object):

    """Mutable parse-time state shared across a single entry's fields.

    t_class tracks the most recent "T" marker: 0 means ordinary
    readings, 1 means subsequent Japanese fields are nanori (name)
    readings, and 2 means the next field is the radical name.
    """

    def __init__(self):
        # Default: no T marker seen yet.
        self.t_class = 0
class KanjidicParser(object):

    """Parses a KANJIDIC file and looks up entries by kanji.

    Note: the closing triple-quote of search()'s docstring was lost in
    a bad merge/paste (which made the module unparseable); restored.
    """

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        """Stores parser settings.

        filename  - path to a KANJIDIC file; ".gz" files are handled.
        use_cache - if True, all parsed entries are kept in memory so
                    the file is only read once.
        encoding  - character encoding of the file (KANJIDIC is
                    traditionally EUC-JP).
        """
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}  # literal (unicode char) -> KanjidicEntry

    def search(self, query):
        """Returns a list of kanji entries matching kanji in the query.

        Note: Previous versions implemented this as a generator.
        While I liked that solution, it did not maintain the order of
        kanji in the query.  Since the KANJIDIC2 parser does this,
        I've done it here as well for consistency.

        """
        results = []

        data = None
        if self.use_cache: data = self.cache

        if not data:
            # Cache miss (or caching disabled): read and parse the file.
            if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            lines = [line for line in lines if line and (line[0] != u"#")]

            data = {}
            for line in lines:
                entry = KanjidicEntry(line)
                if self.use_cache:
                    self.cache[entry.literal] = entry
                if entry.literal in query: data[entry.literal] = entry

        # Emit results in query order, skipping characters not found.
        for char in query:
            kanji = data.get(char)
            if kanji: results.append(kanji)

        return results
if __name__ == "__main__":
    # Simple command-line driver:
    #   python kanjidic.py <dictionary file> <kanji>
    import sys, os

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        exit(-1)
    try:
        kp = KanjidicParser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create KanjidicParser: %s") % unicode(e)
        exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
        exit(-1)

    # argv arrives as bytes under Python 2; decode using the likely
    # console encoding (cp932 on Windows, UTF-8 elsewhere).
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    # Print each matching entry in query order.
    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d:\n%s\n") % (i+1, entry.to_string())