Fix for KanjidicEntry.to_string: repaired bad Morohashi handler.
[jben2_gui.git] / jbparse / jbparse / kanjidic.py
blobd2a2fff9f48386fd4f820c8e03367e3e4daa723b
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC.
33 This parser is dependent on a small amount of code kept in the
34 kanjidic2 parser, so be sure to grab both if you are using these
35 modules in your own programs.
37 """
39 from __future__ import absolute_import
41 import os, re, gzip, gettext
42 gettext.install('pyjben', unicode=True)
44 from .kanjidic_common \
45 import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
# Splits a field into a leading run of non-digit characters (group 1)
# and the remainder (group 2); used in to_string to separate a
# dictionary-key prefix from its index value, e.g. u"DR1794" -> "DR", "1794".
alpha_regex = re.compile(u"(^[^0-9]+)(.*)")
50 # Copied from J-Ben 1.x and modified using Gnome Character Map's
51 # "Unicode Block" information.
52 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if character *uc* lies in the Hiragana block.

    Block range per Unicode: 3040..309F.
    """
    return 0x3040 <= ord(uc) <= 0x309F
def is_katakana(uc):
    """Return True if character *uc* lies in the main Katakana block.

    Block range per Unicode: 30A0..30FF.  The Katakana Phonetic
    Extensions block (31F0..31FF) is deliberately not included, as it
    is not currently used in J-Ben.
    """
    return 0x30A0 <= ord(uc) <= 0x30FF
def is_furigana(uc):
    """Return True if character *uc* is kana (hiragana or katakana)."""
    return is_katakana(uc) or is_hiragana(uc)
def jis_hex_to_kuten(hex_code):
    """Convert a JIS code point (int) to a KANJIDIC2-style kuten string.

    The high byte minus 0x20 gives the "ku" (row), the low byte minus
    0x20 the "ten" (cell); the result is formatted as u"ku-ten".
    """
    ku = ((hex_code >> 8) & 0xFF) - 0x20
    ten = (hex_code & 0xFF) - 0x20
    return u"%s-%s" % (ku, ten)
def kanjidic_key_to_kanjidic2(dkey):
    """Converts KANJIDIC dictionary keys to KANJIDIC2.

    If unable to find a KANJIDIC2 key, returns the original key.

    """
    # Mapping of KANJIDIC one/two-letter dictionary codes to the
    # corresponding KANJIDIC2 dr_type attribute values.
    d = {
        "H": "halpern_njecd",
        "N": "nelson_c",
        "V": "nelson_n",
        "IN": "sh_kk",
        "MN": "moro",
        "E": "henshall",
        "K": "gakken",
        "L": "heisig",
        "O": "oneill_names",
        "DB": "busy_people",
        "DC": "crowley",
        "DF": "jf_cards",
        "DG": "kodansha_compact",
        "DH": "henshall3",
        "DJ": "kanji_in_context",
        "DK": "halpern_kkld",
        "DO": "oneill_kk",
        "DS": "sakade",
        "DT": "tutt_cards",
        "DM": "maniette",
        }
    return d.get(dkey, dkey)
104 class KanjidicEntry(object):
106 def __init__(self, raw_entry):
107 # Key info
108 self.literal = None
109 self.meanings = []
110 self.kunyomi = []
111 self.onyomi = []
112 self.nanori = []
113 # Secondary info
114 self.strokes = None
115 self.strokes_miss = []
116 self.freq = None
117 self.grade = None
118 self.jlpt = None
119 # Info of low importance for most target users
120 self.jis = None
121 self.radical = None
122 self.radical_c = None # "Classic" KangXi Zidian radical
123 self.radname = None
124 self.pinyin = []
125 self.korean = []
126 # "Query codes": Pattern-based lookup
127 # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
128 self.qcodes = {}
129 # Dictionary codes
130 self.dcodes = {}
131 # Dictionary-related metadata
132 self.xref = []
133 self.misclass = []
134 self.unparsed = []
136 self.parse_entry(raw_entry)
138 def parse_entry(self, raw_entry):
139 if not raw_entry:
140 return None
142 state = ParserState() # Holds "t class"
144 # First 2 fields are always the same
145 pieces = raw_entry.split(None, 2)
146 misc = pieces.pop()
147 self.jis = int(pieces.pop(), 16)
148 self.literal = pieces.pop()
150 # Parse the remainder
151 si = ei = 0
152 while si < len(misc):
153 c = misc[si]
154 i = ord(c)
155 if c == u' ':
156 si += 1
157 continue
158 if i > 0xFF or c in (u'-', u'.'):
159 # Parse Japanese
160 ei = misc.find(u' ', si+1)
161 if ei == -1:
162 ei = len(misc) + 1
163 sub = misc[si:ei]
165 self._parse_japanese(state, sub)
166 elif c == u'{':
167 # Parse Translation
168 si += 1 # Move si inside of {
169 ei = misc.find(u'}', si+1)
170 if ei == -1:
171 ei = len(misc) + 1
172 sub = misc[si:ei]
173 ei += 1 # Move ei past }
175 self.meanings.append(sub)
176 else:
177 # Parse info field
178 ei = misc.find(u' ', si+1)
179 if ei == -1:
180 ei = len(misc) + 1
181 sub = misc[si:ei]
183 self._parse_info(state, sub)
185 si = ei + 1
187 def _parse_japanese(self, state, data):
188 if not state.t_class:
189 # Check hiragana/katakana
190 for c in data:
191 if is_hiragana(c):
192 self.kunyomi.append(data)
193 break
194 elif is_katakana(c):
195 self.onyomi.append(data)
196 break
197 elif state.t_class == 1:
198 self.nanori.append(data)
199 elif state.t_class == 2:
200 self.radname = data
202 def _parse_info(self, state, data):
203 onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
204 strval_dicts = set(('DB',))
205 intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
206 'DK', 'DO', 'DS', 'DT', 'DM'))
207 try:
208 c = data[0]
209 if c == 'U':
210 # Unicode value - we alread store the literal as unicode, so let's
211 # use this as our encoding sanity check!
212 assert ord(self.literal) == int(data[1:], 16), \
213 "Encoding error detected"
214 elif c == 'B':
215 self.radical = int(data[1:])
216 elif c == 'C':
217 self.radical_c = int(data[1:])
218 elif c == 'F':
219 self.freq = int(data[1:])
220 elif c == 'G':
221 self.grade = int(data[1:])
222 elif c == 'J':
223 self.jlpt = int(data[1:])
224 elif c == 'S':
225 i = int(data[1:])
226 if not self.strokes:
227 self.strokes = i
228 else:
229 self.strokes_miss.append(i)
230 elif c == 'W':
231 self.korean.append(data[1:])
232 elif c == 'Y':
233 self.pinyin.append(data[1:])
234 elif c == 'X':
235 self.xref.append(data[1:])
236 elif c == 'Z':
237 self.misclass.append(data[1:])
238 elif c == 'T':
239 state.t_class = int(data[1:])
240 # Below this point is dictionary/query codes.
241 elif c in onechar_dicts:
242 self.dcodes[c] = data[1:]
243 elif c == 'P':
244 # SKIP codes.
245 # Thanks to changes in permissible SKIP code usage (change to
246 # Creative Commons licensing in January 2008), we can now use
247 # this without problems. Jack Halpern, thank you!
248 if self.qcodes.get('skip'):
249 print "ALERT! ALERT! self.skip already set!"
250 exit(1)
251 self.qcodes['skip'] = data[1:];
252 elif c == 'Q':
253 # Four Corner code
254 self.qcodes['four_corner'] = data[1:]
255 elif c == 'I': # Spahn/Hadamitzky dictionaries
256 if data[1] =='N':
257 # IN = Kanji & Kana (Spahn, Hadamitzky)
258 self.dcodes[data[:2]] = data[2:]
259 else:
260 # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
261 self.qcodes['sh_desc'] = data[1:]
262 elif c == 'M':
263 # Morohashi Daikanwajiten
264 self.dcodes[data[:2]] = data[2:]
265 elif c == 'D':
266 key = data[:2]
267 if key in intval_dicts:
268 self.dcodes[key] = int(data[2:])
269 elif key in strval_dicts:
270 self.dcodes[key] = data[2:]
271 elif key == 'DR':
272 # Query Code: 2001 Kanji (De Roo)
273 self.qcodes['deroo'] = int(data[2:])
274 else:
275 self.unparsed.append(data)
276 else:
277 self.unparsed.append(data)
278 except:
279 self.unparsed.append(data)
281 def to_string(self, **kwargs):
282 """A default "to-string" dump of a KanjidicEntry."""
283 lines = []
284 lines.append(_(u"Literal: %s") % self.literal)
285 if self.onyomi:
286 lines.append(_(u"Onyomi: %s")
287 % u"、".join(
288 [jstring_convert(us) for us in self.onyomi]))
289 if self.kunyomi:
290 lines.append(_(u"Kunyomi: %s")
291 % u"、".join(
292 [jstring_convert(us) for us in self.kunyomi]))
293 if self.nanori:
294 lines.append(_(u"Nanori: %s")
295 % u"、".join(
296 [jstring_convert(us) for us in self.nanori]))
297 if self.meanings:
298 lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))
300 if self.strokes:
301 lines.append(_(u"Stroke count: %d") % self.strokes)
302 if self.strokes_miss:
303 lines.append(_(u"Common miscounts: %s")
304 % _(u", ").join(self.strokes_miss))
305 if self.freq:
306 lines.append(_(u"Newspaper Frequency: %d") % self.freq)
307 if self.grade:
308 if self.grade in range(1, 7):
309 grade_str = unicode(self.grade)
310 elif self.grade == 8:
311 grade_str = _(u"General usage")
312 elif self.grade == 9:
313 grade_str = _(u"Jinmeiyou (Characters for names)")
314 elif self.grade == None:
315 grade_str = _(u"Unspecified")
316 else:
317 grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
318 lines.append(_(u"Jouyou Grade: %s") % grade_str)
319 if self.jlpt:
320 lines.append(_(u"JLPT Level: %d") % self.jlpt)
322 # Query codes
323 if self.qcodes:
324 for k, v in self.qcodes.iteritems():
325 desc = qcode_to_desc(k)
326 lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
328 if k == 'skip' and self.misclass:
329 miscodes = []
330 for code in self.misclass:
331 code_type = code[:2]
332 code_val = code[2:]
333 if code_type == u'SP': # "stroke_count"
334 miscodes.append(_(u"%s (stroke count)") % code_val)
335 elif code_type == u'PP': # "posn"
336 miscodes.append(_(u"%s (position)") % code_val)
337 elif code_type == u'BP': # "stroke_and_posn"
338 miscodes.append(_(u"%s (stroke and position)") % code_val)
339 elif code_type == u'RP': # "stroke_diff"
340 miscodes.append(_(u"%s (debatable count)") % code_val)
341 else:
342 lines.append(_(u"Unrecognized misclassification code: %s")
343 % unicode(code))
344 if miscodes:
345 lines.append(_(u"SKIP miscodes: %s")
346 % _(u", ").join(miscodes))
348 if self.dcodes:
349 # Probably we should sort these in some way... but for
350 # now, just display.
351 for k, v in self.dcodes.iteritems():
352 if k == "MP": continue
353 dictname = kanjidic2_key_to_str(
354 kanjidic_key_to_kanjidic2(k))
355 if k == "MN":
356 vp = self.dcodes.get("MP")
357 if vp:
358 vol, page = vp.split('.', 1)
359 lines.append(_(u"%s: Index %s, Volume %s, Page %s")
360 % (dictname, v, vol, page))
361 else:
362 lines.append(_(u"%s: %s") % (dictname, v))
363 else:
364 lines.append(_(u"%s: %s") % (dictname, v))
366 if self.radname:
367 lines.append(_(u"Radical name: %s") % self.radname)
368 if self.radical:
369 lines.append(_(u"Nelson Radical: %d") % self.radical)
370 if self.radical_c:
371 lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)
373 if self.korean:
374 lines.append(_(u"Korean romanization: %s")
375 % _(u", ").join(self.korean))
376 if self.pinyin:
377 lines.append(_(u"Pinyin romanization: %s")
378 % _(u", ").join(self.pinyin))
380 # "self.unicode" is always present. ;)
381 lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
382 if self.jis:
383 kuten = jis_hex_to_kuten(self.jis)
384 jis_set = u"208" # For now, hard-code it.
385 lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
386 % (jis_set, kuten, self.jis))
388 if self.xref:
389 for ref in self.xref:
390 if ref[0] == 'J':
391 # JIS crossrefs
392 jis_id = ref[1]
393 hexcode = int(ref[2:], 16)
394 kuten = jis_hex_to_kuten(hexcode)
395 if jis_id == '0':
396 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
397 u"Hex = 0x%04X") % (kuten, hexcode))
398 elif jis_id == '1':
399 lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
400 u"Hex = 0x%04X") % (kuten, hexcode))
401 else:
402 s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
403 u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
404 lines.append(s)
405 # Not really "unparsed", but it is unhandled...
406 unparsed.append(s)
407 pass
408 else:
409 m = alpha_regex.match(ref)
410 k = kanjidic2_key_to_str(
411 kanjidic_key_to_kanjidic2(m.group(1)))
413 v = ref[m.span()[1]:]
414 lines.append(_(u"Crossref: %s: %s")
415 % (k, m.group(2)))
417 if self.unparsed:
418 lines.append(_(u"Unrecognized codes: %s")
419 % (u", ").join(self.unparsed))
420 pass
422 return u"\n".join(lines)
424 def __unicode__(self):
425 """Dummy string dumper"""
426 strs = [self.literal]
427 for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
428 strs.extend(l)
429 if self.radname:
430 strs.insert(3, self.radname)
432 return _(u", ").join(strs)
class ParserState(object):

    """Mutable state shared between the per-field parse handlers."""

    def __init__(self):
        # "T class" of subsequent Japanese fields, set by "T" info
        # fields: 0 = normal readings, 1 = nanori, 2 = radical name.
        self.t_class = 0
class Parser(object):

    """Parser for the original (pre-XML) KANJIDIC dictionary file."""

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        """Creates a parser bound to a KANJIDIC file.

        filename: path to the dictionary (optionally gzipped, by ".gz"
        extension).  use_cache: keep all parsed entries in memory after
        the first search.  encoding: file encoding, EUC-JP by default.

        Raises an Exception if the file does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}

    def search(self, query):
        """Returns a list of kanji entries matching kanji in the query.

        Note: Previous versions implemented this as a generator.
        While I liked that solution, it did not maintain the order of
        kanji in the query.  Since the KANJIDIC2 parser does this,
        I've done it here as well for consistency.

        """
        results = []

        data = None
        if self.use_cache: data = self.cache

        if not data:
            # Read and decode the whole file, dropping comment lines.
            if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            lines = [line for line in lines if line and (line[0] != u"#")]

            data = {}
            for line in lines:
                entry = KanjidicEntry(line)
                if self.use_cache:
                    self.cache[entry.literal] = entry
                if entry.literal in query: data[entry.literal] = entry

        # Build results in query order, skipping characters not found.
        for char in query:
            kanji = data.get(char)
            if kanji: results.append(kanji)

        return results
if __name__ == "__main__":
    # Command-line driver: argv[1] = dictionary file, argv[2] = kanji
    # string to look up (copy/pasted in the console's encoding).
    import sys

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        exit(-1)
    try:
        kp = Parser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create KanjidicParser: %s") % unicode(e)
        exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)")
        exit(-1)

    # Console arguments arrive as bytes (Python 2); decode using the
    # platform's likely console code page.
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    # Dump each matching entry in query order.
    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d:\n%s\n") % (i+1, entry.to_string())