2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2.
33 This module is incomplete and currently just holds helper code for the
38 import gzip
, xml
.sax
, gettext
39 gettext
.install('pyjben', unicode=True)
41 from parsers
.kanjidic_common \
42 import jstring_convert
, kanjidic2_key_to_str
, qcode_to_desc
class Kanjidic2Entry(object):

    """A single KANJIDIC2 character record.

    Instances start out empty and are filled in attribute by attribute
    while a <character> element is parsed; to_string() renders the
    populated fields as translated, line-oriented text.

    NOTE(review): this class was rebuilt from a damaged copy of the
    file.  Of __init__, only the ``strokes_miss`` and ``radical_c``
    defaults (and the section comments) were visible; every other
    default below is inferred from how to_string() and the SAX handler
    read or append to the attribute.  Confirm against the original
    before merging.

    """

    def __init__(self):
        # Key info
        self.literal = None
        self.jis = None
        self.meanings = {}      # language code -> list of meanings
        self.ja_kun = []
        self.ja_on = []
        self.nanori = []

        # Secondary info
        self.strokes = None
        self.strokes_miss = []
        self.freq = None
        self.grade = None
        self.jlpt = None

        # Info of low importance for most target users
        self.radical = None
        self.radical_c = None   # "Classic" KangXi Zidian radical
        self.radname = None
        self.pinyin = []
        self.korean_h = []
        self.korean_r = []

        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
        self.qcodes = {}
        self.misclass = []

        # Dictionary codes
        # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn,
        #              Lnnnn, Onnnn
        # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT, DM
        self.dcodes = {}

        # Dictionary-related metadata
        self.xref = []
        self.unparsed = []

    @staticmethod
    def _jis_hex_to_kuten(hex_code):
        """KANJIDIC2-style kuten string for a JIS hex code.

        Each byte of *hex_code* has 0x20 subtracted to give the ku (row)
        and ten (cell) numbers.

        NOTE(review): the line holding the format string was not
        visible; zero-padded "KK-TT" is assumed -- TODO confirm.

        """
        ku = ((hex_code >> 8) & 0xFF) - 0x20
        ten = (hex_code & 0xFF) - 0x20
        return u"%02d-%02d" % (ku, ten)

    @staticmethod
    def _split_misclass(code):
        """Split one SKIP misclassification record into (type, value).

        NOTE(review): the two lines that performed this split were not
        visible.  A 2-item tuple/list is unpacked as-is; anything else
        is treated as a string with a two-character type prefix.
        Verify against what the SAX handler actually stores.

        """
        if isinstance(code, (tuple, list)):
            code_type, code_val = code
        else:
            code_type, code_val = code[:2], code[2:]
        return code_type, code_val

    def _grade_to_str(self):
        """Translate self.grade into display text."""
        grade = self.grade
        if grade is None:
            return _(u"Unspecified")
        if grade in range(1, 7):
            return u"%s" % grade
        if grade == 8:
            return _(u"General usage")
        if grade == 9:
            return _(u"Jinmeiyou (Characters for names)")
        return _(u"Unhandled grade level (Grade %d)") % grade

    def _misclass_lines(self):
        """Return the output lines describing SKIP misclassifications.

        Recognised codes are gathered onto a single "SKIP miscodes"
        line; each unrecognised code gets a line of its own.  The
        source comments pair the two-letter codes with the KANJIDIC2
        attribute names, so both spellings are accepted here.

        """
        out = []
        miscodes = []
        for code in self.misclass:
            code_type, code_val = self._split_misclass(code)
            if code_type in (u'SP', u'stroke_count'):
                miscodes.append(_(u"%s (stroke count)") % code_val)
            elif code_type in (u'PP', u'posn'):
                miscodes.append(_(u"%s (position)") % code_val)
            elif code_type in (u'BP', u'stroke_and_posn'):
                miscodes.append(_(u"%s (stroke and position)") % code_val)
            elif code_type in (u'RP', u'stroke_diff'):
                miscodes.append(_(u"%s (debatable count)") % code_val)
            else:
                # NOTE(review): the argument line of this message was
                # not visible; type and value are shown together.
                out.append(_(u"Unrecognized misclassification code: %s")
                           % (u"%s %s" % (code_type, code_val)))
        if miscodes:
            out.append(_(u"SKIP miscodes: %s") % _(u", ").join(miscodes))
        return out

    def to_string(self, **kwargs):
        """A default "to-string" dump of a Kanjidic2Entry.

        Returns a unicode string with one translated "label: value"
        line per populated field, joined by newlines.  Keyword
        arguments are accepted but not used.

        NOTE(review): several ``if`` guard lines were not visible in
        the reviewed copy.  Optional scalar fields are skipped when
        None and list fields when empty, following the guards that
        were visible (e.g. for ``strokes_miss``).

        """
        lines = []
        add = lines.append
        sep = _(u", ")

        add(_(u"Literal: %s") % self.literal)

        # NOTE(review): the separator used for the three reading lists
        # was on lines that were not visible; the same translated ", "
        # as the Korean/Pinyin lists is assumed.
        if self.ja_on:
            add(_(u"Onyomi: %s")
                % sep.join([jstring_convert(us) for us in self.ja_on]))
        if self.ja_kun:
            add(_(u"Kunyomi: %s")
                % sep.join([jstring_convert(us) for us in self.ja_kun]))
        if self.nanori:
            add(_(u"Nanori: %s")
                % sep.join([jstring_convert(us) for us in self.nanori]))

        for k, v in self.meanings.items():
            add(_(u"Meaning (%s): %s") % (k, _(u"; ").join(v)))

        if self.strokes is not None:
            add(_(u"Stroke count: %d") % self.strokes)
        if self.strokes_miss:
            # Miscounts are stored as ints by the SAX handler, so they
            # have to be formatted before they can be joined.
            add(_(u"Common miscounts: %s")
                % sep.join([u"%s" % n for n in self.strokes_miss]))

        if self.freq is not None:
            add(_(u"Newspaper Frequency: %d") % self.freq)

        add(_(u"Jouyou Grade: %s") % self._grade_to_str())

        if self.jlpt is not None:
            add(_(u"JLPT Level: %d") % self.jlpt)

        # Query codes
        for k, v in self.qcodes.items():
            add(_(u"%s code: %s") % (qcode_to_desc(k), v))
            if k == 'skip' and self.misclass:
                lines.extend(self._misclass_lines())

        # Dictionary codes.
        # Probably we should sort these in some way...
        for k, v in self.dcodes.items():
            add(_(u"%s: %s") % (kanjidic2_key_to_str(k), v))

        if self.radname:
            add(_(u"Radical name: %s") % self.radname)
        if self.radical is not None:
            add(_(u"Nelson Radical: %d") % self.radical)
        if self.radical_c is not None:
            add(_(u"KangXi Zidian Radical: %d") % self.radical_c)

        if self.korean_h:
            add(_(u"Korean: %s") % sep.join(self.korean_h))
        if self.korean_r:
            add(_(u"Korean romanization: %s") % sep.join(self.korean_r))
        if self.pinyin:
            add(_(u"Pinyin romanization: %s") % sep.join(self.pinyin))

        # "self.unicode" is always present. ;)
        add(_(u"Unicode: 0x%04X") % ord(self.literal))

        if self.jis:
            add(_(u"JIS code: Kuten = %s, Hex = 0x%04X")
                % (self._jis_hex_to_kuten(self.jis), self.jis))

        if self.xref:
            # From KANJIDIC documentation:
            #
            # Xxxxxxx -- a cross-reference code. An entry of, say,
            # XN1234 will mean that the user is referred to the kanji
            # with the (unique) Nelson index of 1234. XJ0xxxx and
            # XJ1xxxx are cross-references to the kanji with the JIS
            # hexadecimal code of xxxx. The `0' means the reference is
            # to a JIS X 0208 kanji, and the `1' references a JIS X
            # 0212 kanji.
            #
            # For now, just dump to the console.
            #
            # TODO: break these out per reference type (JIS-208/212/213,
            # Unicode, De Roo, Nelson, Halpern, O'Neill,
            # Spahn/Hadamitzky) instead of one combined line.
            add(_(u"Crossref codes: %s") % ", ".join(self.xref))

        if self.unparsed:
            add(_(u"Unrecognized codes: %s")
                % (u", ").join(self.unparsed))

        return u"\n".join(lines)
class KD2SAXHandler(xml.sax.handler.ContentHandler):

    """SAX handler for KANJIDIC2.

    Tracks the stack of open elements in ``self.path`` as
    (name, attrs) pairs, creates a Kanjidic2Entry for every
    <character> element and stores each finished entry in
    ``self.data`` keyed by the entry's literal.

    NOTE(review): this class was rebuilt from a damaged copy of the
    file in which a number of lines were not visible.  Each place
    where code had to be inferred is marked below; confirm against the
    original before merging.

    """

    def __init__(self, *args, **kwargs):
        xml.sax.handler.ContentHandler.__init__(self, *args, **kwargs)
        # NOTE(review): only the full_keys line was visible; the other
        # attributes are initialised here because the methods below
        # read them.
        self.parsing = False    # True while inside a <character>
        self.kanji = None       # entry currently being built
        self.path = []          # stack of (element name, attrs)
        self.full_keys = set()
        self.data = {}          # literal -> finished entry

    def get_path(self):
        """Return the names of the currently open elements, "/"-joined."""
        return u"/".join([i[0] for i in self.path])

    def get_attr_str(self):
        """Return the innermost element's attributes as "k: v, k: v"."""
        return u", ".join([u"%s: %s" % (k, v)
                           for k, v in self.path[-1][1].items()])

    def startElement(self, name, attrs):
        """Push the element onto the path; start an entry on <character>."""
        if name == "character":
            # NOTE(review): one line of this branch was not visible;
            # setting the parsing flag is inferred from characters(),
            # which only acts while self.parsing is true.
            self.parsing = True
            self.kanji = Kanjidic2Entry()
        self.path.append((name, attrs))

    def endElement(self, name):
        """Pop the element; store the finished entry on </character>."""
        if self.path and name == self.path[-1][0]:
            # NOTE(review): the pop was on a line that was not visible.
            self.path.pop()
        else:
            # Shouldn't ever happen, but mistakes *can* slip in...
            print(u"Mismatch detected, path is %s, element name is %s"
                  % (self.get_path(), name))

        if name == "character":
            self.data[self.kanji.literal] = self.kanji
            # NOTE(review): resetting the state below is inferred; the
            # corresponding lines were not visible.
            self.kanji = None
            self.parsing = False

    def characters(self, content):
        """Store the text of the innermost element on the current entry.

        Whitespace-only text, and any text outside a <character>
        element, is ignored.  Errors raised while storing a value are
        reported on the console and otherwise swallowed so that one bad
        value does not abort the whole parse.

        NOTE(review): SAX parsers may deliver the text of one element
        in several characters() calls; this method treats every call
        as a complete value.

        """
        content = content.strip()
        if not (content and self.parsing):
            return

        node, attrs = self.path[-1]

        # I am exploiting the fact that any given element type can
        # only belong to one type of parent.  For example,
        # "reading" objects are always fully pathed-out to
        # reading_meaning.rmgroup.reading.
        #
        # In case this changes in the future, I've attached
        # comments of the full paths below.
        try:
            if node == u"literal":                  # literal
                self.kanji.literal = content
            elif node == u"reading":    # reading_meaning/rmgroup/reading
                # The r_type attribute names the list attribute on the
                # entry that receives this reading.
                getattr(self.kanji, attrs[u'r_type']).append(content)
            elif node == u"meaning":    # reading_meaning/rmgroup/meaning
                m_lang = attrs.get(u'm_lang', u'en')
                self.kanji.meanings.setdefault(m_lang, []).append(content)
            elif node == u"nanori":     # reading_meaning/nanori
                self.kanji.nanori.append(content)
            elif node == u"grade":                  # misc/grade
                self.kanji.grade = int(content)
            elif node == u"freq":                   # misc/freq
                self.kanji.freq = int(content)
            elif node == u"jlpt":                   # misc/jlpt
                self.kanji.jlpt = int(content)
            elif node == u"stroke_count":           # misc/strokes
                # The first count seen is the accepted one; any further
                # counts are recorded as miscounts.
                if not self.kanji.strokes:
                    self.kanji.strokes = int(content)
                else:
                    self.kanji.strokes_miss.append(int(content))
            elif node == u"q_code":                 # query_code/q_code
                qc_type = attrs[u'qc_type']
                misclass = None
                if qc_type == 'skip':
                    misclass = attrs.get(u'skip_misclass')
                if misclass:
                    # NOTE(review): the lines handling a misclassified
                    # SKIP code were not visible; storing a
                    # (type, code) pair is an assumption.
                    self.kanji.misclass.append((misclass, content))
                else:
                    self.kanji.qcodes[qc_type] = content
            elif node == u"dic_ref":                # dic_number/dic_ref
                attr = attrs[u'dr_type']
                # TODO: do something with the m_vol / m_page attributes
                # that accompany some references.
                # NOTE(review): the try/except lines were not visible;
                # numeric references are kept as ints, others as text.
                try:
                    self.kanji.dcodes[attr] = int(content)
                except ValueError:
                    self.kanji.dcodes[attr] = content
            elif node in (u"cp_value",              # codepoint/cp_value
                          u"rad_value",             # radical/rad_value
                          u"variant",               # misc/variant
                          u"rad_name"):             # misc/rad_name
                # NOTE(review): the bodies of these four branches were
                # not visible.  They are recognised so they do not
                # reach the diagnostic below, but nothing is stored.
                pass
            else:
                try:
                    print(u"Characters found: path=%s, attrs=(%s), "
                          u"content: %s"
                          % (self.get_path(), self.get_attr_str(), content))
                except UnicodeEncodeError:
                    # Can't display code on console; squelch the output.
                    pass
        except Exception as e:
            print(u"EXCEPTION occurred: %s %s" % (e.__class__.__name__, e))
class Kanjidic2Parser(object):

    """Loads a KANJIDIC2 file and looks up entries by character.

    NOTE(review): this class was rebuilt from a damaged copy of the
    file; the places where a line had to be inferred are marked below.

    """

    def __init__(self, filename, encoding="utf-8"):
        """Remember the dictionary location; nothing is read yet.

        filename -- path of the KANJIDIC2 XML file; a name ending in
                    ".gz" is opened through gzip.
        encoding -- encoding announced to the XML parser.

        """
        self.filename = filename
        self.encoding = encoding
        # NOTE(review): this line was not visible; search() reads
        # self.cache before any load, so it must start out empty.
        self.cache = {}

    def load_via_sax(self):
        """Parse the whole file with SAX and replace self.cache.

        The file is closed even when parsing raises.

        """
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")

        try:
            # NOTE(review): the handler construction was on a line
            # that was not visible.
            sh = KD2SAXHandler()
            isource = xml.sax.xmlreader.InputSource()
            # Use the encoding given to the constructor (default
            # "utf-8") rather than a hard-coded value.
            isource.setEncoding(self.encoding)
            isource.setByteStream(f)
            xml.sax.parse(isource, sh)
        finally:
            f.close()

        # NOTE(review): inferred; the handler collects finished entries
        # in its ``data`` dict.
        self.cache = sh.data

    def search(self, search_str, use_cache=True):
        """Yield the entry for each character of search_str that has one.

        The dictionary is (re)loaded first when use_cache is false or
        when nothing has been loaded yet.  Characters without an entry
        are skipped.  This is a generator, so no loading happens until
        iteration starts.

        """
        # Cacheing has 2 meanings in J-Ben:
        # 1. Storing the results of a previous read locally.
        # 2. Reading in preparsed data from a file on disk.
        #
        # KANJIDIC2 is a huge file; although it's costly to store it in
        # memory, it's even harsher to repeatedly read the whole file
        # from disk on each search.
        if not (use_cache and self.cache):
            self.load_via_sax()     # First attempt of a SAX style loader.

        for char in search_str:
            kanji = self.cache.get(char)
            if kanji:
                yield kanji
if __name__ == "__main__":
    # Command-line use: <script> DICTIONARY_FILE KANJI
    #
    # NOTE(review): this section was rebuilt from a damaged copy of the
    # file.  The import line, the exit calls, the exception clause and
    # the whole charset selection were not visible and are inferred;
    # confirm against the original before merging.
    import os
    import sys

    def _encodable_or_none(s):
        """Return s if it can be encoded as cp932, otherwise None."""
        try:
            s.encode("cp932")
        except UnicodeError:
            return None
        return s

    if len(sys.argv) < 2:
        print(_(u"Please specify a dictionary file."))
        sys.exit(1)

    try:
        kp = Kanjidic2Parser(sys.argv[1])
    except Exception as e:
        print(_(u"Could not create Kanjidic2Parser: %s") % (u"%s" % e))
        sys.exit(1)

    if len(sys.argv) < 3:
        print(_(u"Please specify a kanji. "
                u"(Copy/paste, or Alt-Zenkaku/Hankaku)"))
        sys.exit(1)

    # NOTE(review): assumed -- a Japanese Windows console (cp932) versus
    # UTF-8 elsewhere, matching the cp932 filter applied to the output.
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    for i, kanji in enumerate(kp.search(sys.argv[2].decode(charset))):
        # Drop empty lines and lines that cannot be shown in cp932.
        shown = [line for line in kanji.to_string().split(u'\n')
                 if _encodable_or_none(line)]
        print(_(u"Entry %d:\n%s\n") % (i + 1, u"\n".join(shown)))