2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC.
33 This parser is dependent on a small amount of code kept in the
34 kanjidic2 parser, so be sure to grab both if you are using these
35 modules in your own programs.
39 from __future__
import absolute_import
41 import os
, re
, gzip
, gettext
# Install _() into builtins for the "pyjben" translation domain
# (unicode=True is the Python 2 gettext.install API).
gettext.install('pyjben', unicode=True)
44 from .kanjidic_common \
45 import jstring_convert
, kanjidic2_key_to_str
, qcode_to_desc
# Splits a code like "DR2345" into its leading alphabetic key ("DR")
# and the trailing remainder ("2345"); used for cross-reference parsing.
alpha_regex = re.compile(u"(^[^0-9]+)(.*)")
50 # Copied from J-Ben 1.x and modified using Gnome Character Map's
51 # "Unicode Block" information.
52 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if the single unicode character *uc* is hiragana.

    Reconstructed: the "def" line and the ord() extraction were lost in
    this view; the range test and the is_hiragana(uc) call site survive.
    """
    o = ord(uc)
    # 3040..309F; Hiragana
    return o >= 0x3040 and o <= 0x309F
def is_katakana(uc):
    """Return True if the single unicode character *uc* is katakana.

    Reconstructed: the "def" line and the ord() extraction were lost in
    this view; the range test and the is_katakana(uc) call site survive.
    """
    o = ord(uc)
    # 30A0..30FF; Katakana
    # 31F0..31FF; Katakana Phonetic Extensions (Not currently used in J-Ben)
    return o >= 0x30A0 and o <= 0x30FF
# NOTE(review): the enclosing "def" line was lost in this view — this is
# the body of a predicate returning True for any kana (hiragana or
# katakana) character; original function name unknown, TODO confirm.
return is_hiragana(uc) or is_katakana(uc)
def jis_hex_to_kuten(hex_code):
    """KANJIDIC2-style kuten string"""
    # JIS X 0208/0212 kuten: row (ku) and cell (ten) are the high and low
    # bytes of the JIS code minus 0x20 each.
    # Reconstructed: the "return u"%d-%d" % (" line was lost in this view;
    # the two byte-extraction expressions survive verbatim.
    return u"%d-%d" % (
        (((hex_code >> 8) & 0xFF) - 0x20),
        ((hex_code & 0xFF) - 0x20))
def kanjidic_key_to_kanjidic2(dkey):
    """Converts KANJIDIC dictionary keys to KANJIDIC2.

    If unable to find a KANJIDIC2 key, returns the original key.

    """
    # NOTE(review): the opening "d = {" and most entries of the mapping
    # dict were lost in this view; only the two entries below survive.
    # TODO: restore the full KANJIDIC -> KANJIDIC2 key table.
    "DG": "kodansha_compact",
    "DJ": "kanji_in_context",
    return d.get(dkey, dkey)
class KanjidicEntry(object):
    # One parsed KANJIDIC line: a kanji literal plus its readings,
    # meanings, query codes and dictionary indices.

    def __init__(self, raw_entry):
        # NOTE(review): most attribute initializers were lost in this view
        # (reading/meaning lists, qcodes/dcodes dicts, etc.); the
        # surviving lines are kept verbatim.  raw_entry is one line of
        # the KANJIDIC file.
        self.strokes_miss = []  # commonly seen stroke miscounts
        # Info of low importance for most target users
        self.radical_c = None  # "Classic" KangXi Zidian radical
        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        # Dictionary-related metadata
        self.parse_entry(raw_entry)
    def parse_entry(self, raw_entry):
        # Split one raw KANJIDIC line into the literal, the JIS hex code,
        # and the remaining space-separated fields, then hand each field
        # to _parse_japanese or _parse_info.
        # NOTE(review): several lines were lost in this view (the `misc`
        # extraction, si/ei initialization, the `c`/`i` character reads,
        # the `sub` slicing, and the branch keywords of the field
        # dispatch); surviving fragments are kept verbatim.
        state = ParserState()  # Holds "t class"

        # First 2 fields are always the same
        pieces = raw_entry.split(None, 2)
        self.jis = int(pieces.pop(), 16)
        self.literal = pieces.pop()

        # Parse the remainder
        while si < len(misc):
            # Fields starting with a multi-byte character (or "-"/".")
            # are Japanese readings.
            if i > 0xFF or c in (u'-', u'.'):
                ei = misc.find(u' ', si+1)
                self._parse_japanese(state, sub)
            # "{...}" fields are English meanings.
            si += 1  # Move si inside of {
            ei = misc.find(u'}', si+1)
            ei += 1  # Move ei past }
            self.meanings.append(sub)
            # Everything else is an info/code field.
            ei = misc.find(u' ', si+1)
            self._parse_info(state, sub)
    def _parse_japanese(self, state, data):
        # Route a Japanese-text field to the proper reading list based on
        # the current "T" class (unset: on/kun readings; 1: nanori;
        # 2: radical name).  NOTE(review): the kana-type test that picks
        # kunyomi vs. onyomi and the t_class == 2 branch body were lost
        # in this view; surviving fragments are kept verbatim.
        if not state.t_class:
            # Check hiragana/katakana
            self.kunyomi.append(data)
            self.onyomi.append(data)
        elif state.t_class == 1:
            self.nanori.append(data)
        elif state.t_class == 2:
    def _parse_info(self, state, data):
        # Dispatch one non-Japanese info field (e.g. "U4E9C", "B7", "G8",
        # "DR2345") by its leading code letter(s).
        # NOTE(review): the if/elif dispatch lines for most branches were
        # lost in this view; the surviving assignments/branches below are
        # kept verbatim in their original order.
        onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
        strval_dicts = set(('DB',))
        intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
                            'DK', 'DO', 'DS', 'DT', 'DM'))
        # Unicode value - we already store the literal as unicode, so let's
        # use this as our encoding sanity check!
        assert ord(self.literal) == int(data[1:], 16), \
            "Encoding error detected"
        self.radical = int(data[1:])
        self.radical_c = int(data[1:])
        self.freq = int(data[1:])
        self.grade = int(data[1:])
        self.jlpt = int(data[1:])
        self.strokes_miss.append(i)
        self.korean.append(data[1:])
        self.pinyin.append(data[1:])
        self.xref.append(data[1:])
        self.misclass.append(data[1:])
        state.t_class = int(data[1:])
        # Below this point is dictionary/query codes.
        elif c in onechar_dicts:
            self.dcodes[c] = data[1:]
        # Thanks to changes in permissible SKIP code usage (change to
        # Creative Commons licensing in January 2008), we can now use
        # this without problems. Jack Halpern, thank you!
        if self.qcodes.get('skip'):
            print "ALERT! ALERT! self.skip already set!"
        self.qcodes['skip'] = data[1:];
        self.qcodes['four_corner'] = data[1:]
        elif c == 'I':  # Spahn/Hadamitzky dictionaries
            # IN = Kanji & Kana (Spahn, Hadamitzky)
            self.dcodes[data[:2]] = data[2:]
            # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
            self.qcodes['sh_desc'] = data[1:]
            # Morohashi Daikanwajiten
            self.dcodes[data[:2]] = data[2:]
        if key in intval_dicts:
            self.dcodes[key] = int(data[2:])
        elif key in strval_dicts:
            self.dcodes[key] = data[2:]
        # Query Code: 2001 Kanji (De Roo)
        self.qcodes['deroo'] = int(data[2:])
        self.unparsed.append(data)
        self.unparsed.append(data)
        self.unparsed.append(data)
    def to_string(self, **kwargs):
        """A default "to-string" dump of a KanjidicEntry."""
        # NOTE(review): many connective lines were lost in this view (the
        # `lines = []` setup, join expressions, if-guards and else/try
        # keywords); surviving fragments are kept verbatim in order.
        lines.append(_(u"Literal: %s") % self.literal)
        lines.append(_(u"Onyomi: %s")
                     [jstring_convert(us) for us in self.onyomi]))
        lines.append(_(u"Kunyomi: %s")
                     [jstring_convert(us) for us in self.kunyomi]))
        lines.append(_(u"Nanori: %s")
                     [jstring_convert(us) for us in self.nanori]))
        lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))
        lines.append(_(u"Stroke count: %d") % self.strokes)
        if self.strokes_miss:
            lines.append(_(u"Common miscounts: %s")
                         % _(u", ").join(self.strokes_miss))
        lines.append(_(u"Newspaper Frequency: %d") % self.freq)
        # Map the KANJIDIC "G" grade code to a human-readable string.
        if self.grade in range(1, 7):
            grade_str = unicode(self.grade)
        elif self.grade == 8:
            grade_str = _(u"General usage")
        elif self.grade == 9:
            grade_str = _(u"Jinmeiyou (Characters for names)")
        elif self.grade == None:
            grade_str = _(u"Unspecified")
            # NOTE(review): the "else:" introducing the fallback below
            # was lost in this view.
            grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
        lines.append(_(u"Jouyou Grade: %s") % grade_str)
        lines.append(_(u"JLPT Level: %d") % self.jlpt)
        # Query codes (SKIP, four corner, De Roo, S&H descriptor, ...)
        for k, v in self.qcodes.iteritems():
            desc = qcode_to_desc(k)
            lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
            if k == 'skip' and self.misclass:
                for code in self.misclass:
                    # code_type/code_val extraction lost in this view
                    if code_type == u'SP':  # "stroke_count"
                        miscodes.append(_(u"%s (stroke count)") % code_val)
                    elif code_type == u'PP':  # "posn"
                        miscodes.append(_(u"%s (position)") % code_val)
                    elif code_type == u'BP':  # "stroke_and_posn"
                        miscodes.append(_(u"%s (stroke and position)") % code_val)
                    elif code_type == u'RP':  # "stroke_diff"
                        miscodes.append(_(u"%s (debatable count)") % code_val)
                        lines.append(_(u"Unrecognized misclassification code: %s")
                lines.append(_(u"SKIP miscodes: %s")
                             % _(u", ").join(miscodes))
        # Probably we should sort these in some way... but for
        for k, v in self.dcodes.iteritems():
            if k == "MP": continue
            dictname = kanjidic2_key_to_str(
                kanjidic_key_to_kanjidic2(k))
            # Morohashi entries ("MP") carry volume.page alongside index.
            vp = self.dcodes.get("MP")
            vol, page = vp.split('.', 1)
            lines.append(_(u"%s: Index %s, Volume %s, Page %s")
                         % (dictname, v, vol, page))
            lines.append(_(u"%s: %s") % (dictname, v))
            lines.append(_(u"%s: %s") % (dictname, v))
        lines.append(_(u"Radical name: %s") % self.radname)
        lines.append(_(u"Nelson Radical: %d") % self.radical)
        lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)
        lines.append(_(u"Korean romanization: %s")
                     % _(u", ").join(self.korean))
        lines.append(_(u"Pinyin romanization: %s")
                     % _(u", ").join(self.pinyin))
        # "self.unicode" is always present. ;)
        lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))
        kuten = jis_hex_to_kuten(self.jis)
        jis_set = u"208"  # For now, hard-code it.
        lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
                     % (jis_set, kuten, self.jis))
        for ref in self.xref:
            hexcode = int(ref[2:], 16)
            kuten = jis_hex_to_kuten(hexcode)
            lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
                           u"Hex = 0x%04X") % (kuten, hexcode))
            lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
                           u"Hex = 0x%04X") % (kuten, hexcode))
            s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
                  u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
            # Not really "unparsed", but it is unhandled...
            m = alpha_regex.match(ref)
            k = kanjidic2_key_to_str(
                kanjidic_key_to_kanjidic2(m.group(1)))
            v = ref[m.span()[1]:]
            lines.append(_(u"Crossref: %s: %s")
        lines.append(_(u"Unrecognized codes: %s")
                     % (u", ").join(self.unparsed))
        return u"\n".join(lines)
    def __unicode__(self):
        """Dummy string dumper"""
        # Returns the literal plus the joined readings and meanings.
        # NOTE(review): the loop body that extends `strs` from each
        # reading list, and the guard around the radname insert, were
        # lost in this view.
        strs = [self.literal]
        for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
        strs.insert(3, self.radname)
        return _(u", ").join(strs)
class ParserState(object):
    # Mutable parse-state holder passed through KanjidicEntry parsing;
    # tracks the current "T" class.  NOTE(review): the class body (the
    # t_class initializer) was lost in this view.
class Parser(object):
    # KANJIDIC file parser: loads a (possibly gzipped) KANJIDIC file and
    # looks up entries by kanji literal.

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        # filename: path to the KANJIDIC file (plain or .gz).
        # use_cache: keep parsed entries across search() calls.
        # encoding: codec for decoding the file (KANJIDIC is EUC-JP).
        # NOTE(review): the cache initializer (self.cache) appears to
        # have been lost in this view — search() reads self.cache when
        # use_cache is set; TODO confirm.
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
    def search(self, query):
        """Returns a list of kanji entries matching kanji in the query.

        Note: Previous versions implemented this as a generator.
        While I liked that solution, it did not maintain the order of
        kanji in the query. Since the KANJIDIC2 parser does this,
        I've done it here as well for consistency.

        """
        # NOTE(review): several lines were lost in this view (results/data
        # setup, the else branch pairing, file read/close, the per-line
        # parsing loop header, the final per-character result loop, and
        # the return statement); surviving fragments are kept verbatim.
        if self.use_cache: data = self.cache
        # Transparently handle gzip-compressed dictionaries by extension.
        if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
            f = gzip.open(self.filename)
            f = open(self.filename, "rb")
        fdata = fdata.decode(self.encoding)
        lines = fdata.splitlines()
        # Drop blank lines and "#" comment lines.
        lines = [line for line in lines if line and (line[0] != u"#")]
        entry = KanjidicEntry(line)
        self.cache[entry.literal] = entry
        if entry.literal in query: data[entry.literal] = entry
        # Preserve query order when assembling results.
        kanji = data.get(char)
        if kanji: results.append(kanji)
486 if __name__
== "__main__":
489 if len(sys
.argv
) < 2:
490 print _(u
"Please specify a dictionary file.")
493 kp
= Parser(sys
.argv
[1])
495 print _(u
"Could not create KanjidicParser: %s") % unicode(e
)
498 if len(sys
.argv
) < 3:
499 print _(u
"Please specify a kanji. "
500 u
"(Copy/paste, or Alt-Zenkaku/Hankaku)")
508 for i
, entry
in enumerate(kp
.search(sys
.argv
[2].decode(charset
))):
509 print _(u
"Entry %d:\n%s\n") % (i
+1, entry
.to_string())