2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC.
33 This parser is dependent on a small amount of code kept in the
34 kanjidic2 parser, so be sure to grab both if you are using these
35 modules in your own programs.
# Standard-library imports: one module per line (PEP 8).
import re
import gzip
import gettext

# Install _() into builtins for i18n.  NOTE(review): the unicode=True
# keyword is Python 2 only (it makes _() return unicode objects).
gettext.install('pyjben', unicode=True)

# Helpers shared with the KANJIDIC2 parser (see module docstring);
# parenthesized continuation instead of the original backslash.
from parsers.kanjidic_common import (jstring_convert,
                                     kanjidic2_key_to_str,
                                     qcode_to_desc)
# Splits a field like "DB12" into its alphabetic prefix ("DB") and the
# remainder ("12"); used for the 'D'-prefixed dictionary codes and for
# crossref fields.
alpha_regex = re.compile(u"(^[^0-9]+)(.*)")
48 # Copied from J-Ben 1.x and modified using Gnome Character Map's
49 # "Unicode Block" information.
50 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if uc (a single unicode character) is hiragana.

    NOTE(review): the def line is missing from this chunk; the
    signature is reconstructed from the visible body and callers.
    """
    # 3040..309F; Hiragana
    o = ord(uc)
    # Chained comparison instead of "o >= X and o <= Y".
    return 0x3040 <= o <= 0x309F
def is_katakana(uc):
    """Return True if uc (a single unicode character) is katakana.

    NOTE(review): the def line is missing from this chunk; the
    signature is reconstructed from the visible body and callers.
    """
    # 30A0..30FF; Katakana
    # 31F0..31FF; Katakana Phonetic Extensions (Not currently used in J-Ben)
    o = ord(uc)
    return 0x30A0 <= o <= 0x30FF
def is_kana(uc):
    """Return True if uc is hiragana or katakana.

    NOTE(review): the original definition line is not visible in this
    chunk; the function name is reconstructed from the visible body —
    confirm against the full source before relying on the name.
    """
    return is_hiragana(uc) or is_katakana(uc)
def jis_hex_to_kuten(hex_code):
    """KANJIDIC2-style kuten string for a 16-bit JIS code point.

    The high byte gives the "ku" (row) and the low byte the "ten"
    (cell); both carry a 0x20 offset in the JIS encoding.

    NOTE(review): the return-format line is missing from this chunk;
    the u"%s-%s" separator is reconstructed from the KANJIDIC2 kuten
    style — confirm against the full source.
    """
    ku = ((hex_code >> 8) & 0xFF) - 0x20
    ten = (hex_code & 0xFF) - 0x20
    return u"%s-%s" % (ku, ten)
def kanjidic_key_to_kanjidic2(dkey):
    """Converts KANJIDIC dictionary keys to KANJIDIC2.

    If unable to find a KANJIDIC2 key, returns the original key.

    NOTE(review): most of the mapping table is missing from this chunk;
    only the "DG" and "DJ" entries are visible.  The remainder is
    reconstructed from the KANJIDIC2 dic_ref dr_type list — confirm
    each entry against the full source.
    """
    d = {
        "H": "halpern_njecd",
        "N": "nelson_c",
        "V": "nelson_n",
        "E": "henshall",
        "K": "gakken",
        "L": "heisig",
        "O": "oneill_names",
        "MN": "moro",
        "DB": "busy_people",
        "DC": "crowley",
        "DF": "jf_cards",
        "DG": "kodansha_compact",
        "DH": "henshall3",
        "DJ": "kanji_in_context",
        "DK": "halpern_kkld",
        "DM": "maniette",
        "DO": "oneill_kk",
        "DS": "sakade",
        "DT": "tutt_cards",
        }
    # Unmapped keys fall through unchanged.
    return d.get(dkey, dkey)
class KanjidicEntry(object):

    """A single parsed KANJIDIC entry.

    NOTE(review): many original lines of this class are missing from
    this chunk; the structure below is reconstructed from the visible
    fragments.  Confirm details against the full source.

    Fixes applied relative to the visible original:
    - joining self.strokes_miss (ints) directly raised TypeError;
    - "self.grade == None" -> "is None";
    - the J1 crossref printed "JIS X 0208" (copy/paste); J1 is X 0212;
    - print statements parenthesized (same Python 2 output, and the
      file also compiles under Python 3).
    """

    def __init__(self, raw_entry):
        # Key data, filled in by parse_entry() below.
        self.literal = None      # the kanji itself (unicode char)
        self.jis = None          # JIS code point as an int
        self.meanings = []
        self.kunyomi = []
        self.onyomi = []
        self.nanori = []         # name-only readings
        self.strokes = None
        self.strokes_miss = []   # common stroke miscounts
        # Info of low importance for most target users
        self.freq = None         # newspaper frequency ranking
        self.grade = None        # Jouyou grade
        self.jlpt = None
        self.radical = None      # Nelson radical
        self.radical_c = None    # "Classic" KangXi Zidian radical
        self.radname = None      # radical name (from T2 fields)
        self.korean = []
        self.pinyin = []
        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        self.qcodes = {}
        # Dictionary-related metadata
        self.dcodes = {}
        # Crossrefs, SKIP misclassification codes, unhandled fields
        self.xref = []
        self.misclass = []
        self.unparsed = []
        self.parse_entry(raw_entry)

    def parse_entry(self, raw_entry):
        """Parse one KANJIDIC line into this object's fields."""
        if not raw_entry:
            return
        state = ParserState()  # Holds "t class"

        # First 2 fields are always the same: literal, then JIS hex code.
        pieces = raw_entry.split(None, 2)
        misc = pieces.pop()
        self.jis = int(pieces.pop(), 16)
        self.literal = pieces.pop()

        # Parse the remainder.  NOTE(review): the scanning loop below is
        # reconstructed around the visible fragments — confirm the
        # boundary handling against the full source.
        si = ei = 0
        while si < len(misc):
            c = misc[si]
            i = ord(c)
            if c == u' ':
                si += 1
                continue
            if i > 0xFF or c in (u'-', u'.'):
                # Japanese reading (or reading punctuation)
                ei = misc.find(u' ', si+1)
                if ei < 0:
                    ei = len(misc)
                sub = misc[si:ei]
                self._parse_japanese(state, sub)
            elif c == u'{':
                # Braced English meaning
                si += 1  # Move si inside of {
                ei = misc.find(u'}', si+1)
                if ei < 0:
                    ei = len(misc)
                sub = misc[si:ei]
                ei += 1  # Move ei past }
                self.meanings.append(sub)
            else:
                # Information field: one/two letter code plus value
                ei = misc.find(u' ', si+1)
                if ei < 0:
                    ei = len(misc)
                sub = misc[si:ei]
                self._parse_info(state, sub)
            si = ei + 1

    def _parse_japanese(self, state, data):
        """File a reading into the proper list based on the T state."""
        if not state.t_class:
            # Check hiragana/katakana
            if is_hiragana(data[0]):
                self.kunyomi.append(data)
            elif is_katakana(data[0]):
                self.onyomi.append(data)
        elif state.t_class == 1:
            # T1: nanori (name readings)
            self.nanori.append(data)
        elif state.t_class == 2:
            # T2: radical name
            self.radname = data

    def _parse_info(self, state, data):
        """Dispatch one non-Japanese field by its leading code letter."""
        onechar_dicts = set(('H', 'N', 'V', 'E', 'K', 'L', 'O'))
        strval_dicts = set(('DB',))
        intval_dicts = set(('DC', 'DF', 'DG', 'DH', 'DJ',
                            'DK', 'DO', 'DS', 'DT', 'DM'))
        c = data[0]
        if c == 'U':
            # Unicode value - we already store the literal as unicode, so
            # let's use this as our encoding sanity check!
            assert ord(self.literal) == int(data[1:], 16), \
                "Encoding error detected"
        elif c == 'B':
            self.radical = int(data[1:])
        elif c == 'C':
            self.radical_c = int(data[1:])
        elif c == 'F':
            self.freq = int(data[1:])
        elif c == 'G':
            self.grade = int(data[1:])
        elif c == 'J':
            # NOTE(review): field letter for JLPT reconstructed — the
            # original elif line is not visible; confirm.
            self.jlpt = int(data[1:])
        elif c == 'S':
            # First S field is the count; later ones are miscounts.
            i = int(data[1:])
            if not self.strokes:
                self.strokes = i
            else:
                self.strokes_miss.append(i)
        elif c == 'W':
            self.korean.append(data[1:])
        elif c == 'Y':
            self.pinyin.append(data[1:])
        elif c == 'X':
            self.xref.append(data[1:])
        elif c == 'Z':
            self.misclass.append(data[1:])
        elif c == 'T':
            state.t_class = int(data[1:])
        # Below this point is dictionary/query codes.
        elif c in onechar_dicts:
            self.dcodes[c] = data[1:]
        elif c == 'P':
            # SKIP codes.
            # Thanks to changes in permissible SKIP code usage (change to
            # Creative Commons licensing in January 2008), we can now use
            # this without problems. Jack Halpern, thank you!
            if self.qcodes.get('skip'):
                print("ALERT! ALERT! self.skip already set!")
            self.qcodes['skip'] = data[1:]
        elif c == 'Q':
            # Four corner code
            self.qcodes['four_corner'] = data[1:]
        elif c == 'I':  # Spahn/Hadamitzky dictionaries
            if data[1] == 'N':
                # IN = Kanji & Kana (Spahn, Hadamitzky)
                self.dcodes[data[:2]] = data[2:]
            else:
                # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
                self.qcodes['sh_desc'] = data[1:]
        elif c == 'M':
            # Morohashi Daikanwajiten
            self.dcodes[data[:2]] = data[2:]
        elif c == 'D':
            key = alpha_regex.match(data).group(1)
            if key in intval_dicts:
                self.dcodes[key] = int(data[2:])
            elif key in strval_dicts:
                self.dcodes[key] = data[2:]
            elif key == 'DR':
                # Query Code: 2001 Kanji (De Roo)
                self.qcodes['deroo'] = int(data[2:])
            else:
                self.unparsed.append(data)
        else:
            self.unparsed.append(data)

    def to_string(self, **kwargs):
        """A default "to-string" dump of a KanjidicEntry."""
        lines = []
        lines.append(_(u"Literal: %s") % self.literal)
        if self.onyomi:
            lines.append(_(u"Onyomi: %s")
                         % _(u", ").join(
                             [jstring_convert(us) for us in self.onyomi]))
        if self.kunyomi:
            lines.append(_(u"Kunyomi: %s")
                         % _(u", ").join(
                             [jstring_convert(us) for us in self.kunyomi]))
        if self.nanori:
            lines.append(_(u"Nanori: %s")
                         % _(u", ").join(
                             [jstring_convert(us) for us in self.nanori]))
        if self.meanings:
            lines.append(_(u"Meaning: %s") % _(u"; ").join(self.meanings))

        if self.strokes:
            lines.append(_(u"Stroke count: %d") % self.strokes)
        if self.strokes_miss:
            # BUG FIX: strokes_miss holds ints; joining them directly
            # raised TypeError.  Stringify first.
            lines.append(_(u"Common miscounts: %s")
                         % _(u", ").join(
                             [unicode(i) for i in self.strokes_miss]))
        if self.freq:
            lines.append(_(u"Newspaper Frequency: %d") % self.freq)
        if self.grade in range(1, 7):
            grade_str = unicode(self.grade)
        elif self.grade == 8:
            grade_str = _(u"General usage")
        elif self.grade == 9:
            grade_str = _(u"Jinmeiyou (Characters for names)")
        elif self.grade is None:
            grade_str = _(u"Unspecified")
        else:
            grade_str = _(u"Unhandled grade level (Grade %d)") % self.grade
        lines.append(_(u"Jouyou Grade: %s") % grade_str)
        if self.jlpt:
            lines.append(_(u"JLPT Level: %d") % self.jlpt)

        # Query codes (SKIP, four corner, De Roo, ...)
        for k, v in self.qcodes.items():
            desc = qcode_to_desc(k)
            lines.append(_(u"%s code: %s") % (desc, self.qcodes[k]))
            if k == 'skip' and self.misclass:
                # Z fields: SKIP misclassification codes; a two-letter
                # type prefix precedes the code value.
                miscodes = []
                for code in self.misclass:
                    code_type = code[:2]
                    code_val = code[2:]
                    if code_type == u'SP':    # "stroke_count"
                        miscodes.append(_(u"%s (stroke count)") % code_val)
                    elif code_type == u'PP':  # "posn"
                        miscodes.append(_(u"%s (position)") % code_val)
                    elif code_type == u'BP':  # "stroke_and_posn"
                        miscodes.append(
                            _(u"%s (stroke and position)") % code_val)
                    elif code_type == u'RP':  # "stroke_diff"
                        miscodes.append(
                            _(u"%s (debatable count)") % code_val)
                    else:
                        lines.append(
                            _(u"Unrecognized misclassification code: %s")
                            % code)
                if miscodes:
                    lines.append(_(u"SKIP miscodes: %s")
                                 % _(u", ").join(miscodes))

        # Dictionary codes.
        # Probably we should sort these in some way... but for now just
        # dump them in dictionary order.
        for k, v in self.dcodes.items():
            if k == "MP": continue
            key_str = kanjidic2_key_to_str(
                kanjidic_key_to_kanjidic2(k))
            if k == "MN":
                # Morohashi: index number in MN, "volume.page" in MP.
                vp = self.dcodes.get("MP")
                if vp:
                    vol, page = vp.split('.', 1)
                    lines.append(_(u"%s: Index %s, Volume %s, Page %s")
                                 % (key_str, v, vol, page))
                else:
                    lines.append(_(u"%s: %s") % (key_str, v))
            else:
                lines.append(_(u"%s: %s") % (key_str, v))

        if self.radname:
            lines.append(_(u"Radical name: %s") % self.radname)
        if self.radical:
            lines.append(_(u"Nelson Radical: %d") % self.radical)
        if self.radical_c:
            lines.append(_(u"KangXi Zidian Radical: %d") % self.radical_c)

        if self.korean:
            lines.append(_(u"Korean romanization: %s")
                         % _(u", ").join(self.korean))
        if self.pinyin:
            lines.append(_(u"Pinyin romanization: %s")
                         % _(u", ").join(self.pinyin))

        # "self.unicode" is always present. ;)
        lines.append(_(u"Unicode: 0x%04X") % ord(self.literal))

        kuten = jis_hex_to_kuten(self.jis)
        jis_set = u"208"  # For now, hard-code it.
        lines.append(_(u"JIS X 0%s code: Kuten = %s, Hex = 0x%04X")
                     % (jis_set, kuten, self.jis))

        for ref in self.xref:
            if ref[0] == u'J':
                # JIS crossref: "J<codeset digit><4 hex digits>"
                hexcode = int(ref[2:], 16)
                kuten = jis_hex_to_kuten(hexcode)
                if ref[1] == u'0':
                    lines.append(_(u"Crossref: JIS X 0208: Kuten = %s, "
                                   u"Hex = 0x%04X") % (kuten, hexcode))
                elif ref[1] == u'1':
                    # BUG FIX: the visible original repeated "JIS X 0208"
                    # here; the J1 codeset is JIS X 0212.
                    lines.append(_(u"Crossref: JIS X 0212: Kuten = %s, "
                                   u"Hex = 0x%04X") % (kuten, hexcode))
                else:
                    s = _(u"Crossref: JIS (UNHANDLED JIS CODESET): "
                          u"Kuten = %s, Hex = 0x%04X") % (kuten, hexcode)
                    # Not really "unparsed", but it is unhandled...
                    lines.append(s)
            else:
                m = alpha_regex.match(ref)
                k = kanjidic2_key_to_str(
                    kanjidic_key_to_kanjidic2(m.group(1)))
                v = ref[m.span()[1]:]
                lines.append(_(u"Crossref: %s: %s")
                             % (k, v))

        if self.unparsed:
            lines.append(_(u"Unrecognized codes: %s")
                         % (u", ").join(self.unparsed))

        return u"\n".join(lines)

    def __unicode__(self):
        """Dummy string dumper"""
        strs = [self.literal]
        for l in [self.kunyomi, self.onyomi, self.nanori, self.meanings]:
            strs.extend(l)
        if self.radname:
            strs.insert(3, self.radname)
        return _(u", ").join(strs)
class ParserState(object):
    """Mutable state shared by the per-field parse helpers.

    t_class mirrors the KANJIDIC "T" marker: 0 = normal readings,
    1 = nanori (name readings), 2 = radical name.

    NOTE(review): the class body is missing from this chunk and was
    reconstructed from how the parser uses it — confirm against the
    full source.
    """
    def __init__(self):
        self.t_class = 0
class KanjidicParser(object):

    """Reads a KANJIDIC file and returns parsed entries on demand."""

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        # filename:  path to a KANJIDIC file (plain or gzip-compressed)
        # use_cache: keep all parsed entries in memory after first read
        # encoding:  file encoding (KANJIDIC is distributed as EUC-JP)
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        # NOTE(review): the cache-initialization lines are missing from
        # this chunk; reconstructed from search()'s use of self.cache.
        self.cache = {}

    def search(self, query):
        """Returns a list of kanji entries matching kanji in the query.

        Note: Previous versions implemented this as a generator.
        While I liked that solution, it did not maintain the order of
        kanji in the query.  Since the KANJIDIC2 parser does this,
        I've done it here as well for consistency.
        """
        results = []
        data = None
        if self.use_cache: data = self.cache
        if not data:
            # Cache miss (or caching disabled): (re)read the file.
            if self.filename.endswith(".gz"):  # was len()>=3 and [-3:] check
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            try:
                fdata = f.read()
            finally:
                # Close explicitly instead of leaking the handle.
                f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            lines = [line for line in lines if line and (line[0] != u"#")]
            data = {}
            for line in lines:
                entry = KanjidicEntry(line)
                if self.use_cache:
                    self.cache[entry.literal] = entry
                if entry.literal in query: data[entry.literal] = entry
        # Return matches in query order.
        for char in query:
            kanji = data.get(char)
            if kanji: results.append(kanji)
        return results
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        # Parenthesized print: same output on Python 2, valid on 3.
        print(_(u"Please specify a dictionary file."))
        sys.exit(-1)
    try:
        kp = KanjidicParser(sys.argv[1])
    except Exception as e:
        print(_(u"Could not create KanjidicParser: %s") % unicode(e))
        sys.exit(-1)
    if len(sys.argv) < 3:
        print(_(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)"))
        sys.exit(-1)

    # NOTE(review): the charset-detection lines are missing from this
    # chunk; default to the terminal's encoding with a UTF-8 fallback —
    # confirm against the full source.
    charset = sys.stdout.encoding or "utf-8"
    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print(_(u"Entry %d:\n%s\n") % (i+1, entry.to_string()))