2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
from __future__ import absolute_import

# Standard library imports.  warnings is used by Parser._check_version.
import os, gzip, gettext, warnings
from xml.etree.cElementTree import ElementTree

# Installs _() into builtins for i18n message lookup throughout the file.
gettext.install('pyjben', unicode=True)

# Helpers shared with the KANJIDIC (classic) parser.
from .kanjidic_common \
     import jstring_convert, kanjidic2_key_to_str, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a kuten string ("KK-TT", e.g. u"36-41") to a JIS hex code.

    Each piece is offset by 0x20; the ku becomes the high byte and the
    ten the low byte of the returned integer.
    """
    # List comprehension instead of map() so the indexing below also
    # works under Python 3, where map() returns an iterator.
    pieces = [int(piece) for piece in kuten.split(u'-')]
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
def mapdict(fn, d):
    """Apply fn to every element of every value list in dict d.

    Returns a new dict with the same keys, whose values are lists of
    the mapped elements.

    NOTE(review): the def and return lines were missing from the
    damaged source; the name is recovered from the call sites in
    get_readings/get_meanings.
    """
    result = {}
    # .items() instead of .iteritems(), and a list comprehension
    # instead of map(): identical behavior on Python 2, and the
    # function also works on Python 3.
    for k, v in d.items():
        result[k] = [fn(item) for item in v]
    return result
57 class Kanjidic2Node(object):
59 def __init__(self
, xml_node
):
61 self
.literal
= self
._get
_literal
()
63 def _get_literal(self
):
64 literal
= self
.xml
.find("literal").text
.strip()
65 assert len(literal
) == 1, _(u
"Literal has more than one character!")
69 o
= self
.xml
.find("misc/grade")
70 return int(o
.text
) if o
else None
73 # By the spec, it seems like multiple freqs are possible??
74 # So... let's get all entries and assert.
75 o
= self
.xml
.findall("misc/freq")
78 assert len(o
) == 1, _(
79 u
"Character %s: Expected 1 freq entry, found %d") % \
80 (self
._get
_literal
(), len(o
))
84 o
= self
.xml
.find("misc/jlpt")
85 return int(o
.text
) if o
else None
87 def get_strokes(self
):
90 Returns a tuple of (stroke_count, miscounts), where miscounts
91 is either None or a list of common miscounts for the
95 nodes
= self
.xml
.findall("misc/stroke_count")
96 scnode
, misnodes
= nodes
[0], nodes
[1:]
97 sc
= int(nodes
[0].text
)
99 miss
= map(int, [o
.text
for o
in misnodes
])
104 def _get_nanori_nodes(self
):
105 nodes
= self
.xml
.findall("reading_meaning/nanori")
108 def _get_attrdict(self
, path
, attr_name
):
109 """Helper: stores elements on path in dict, keyed by attribute."""
111 nodes
= self
.xml
.findall(path
)
112 #attrs = set(o.attrib.get(attr_name) for o in nodes)
114 d
.setdefault(o
.attrib
.get(attr_name
), []).append(o
)
116 # d[attr] = [o for o in nodes
117 # if o.attrib.get(attr_name) == attr]
    def _get_reading_nodes(self):
        """Returns dictionary of reading lists, keyed by type."""
        # NEEDS AN UPDATE: Just noticed, rmgroup allows
        # readings/meanings to be meaningfully grouped together.  We
        # -can- dump everything together, but we -should- handle the
        # groups separately.
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")
128 def _get_meaning_nodes(self
):
129 """Returns dictionary of gloss lists, keyed by language prefix."""
130 # NEEDS AN UPDATE: See _get_reading_nodes.
131 meaning_d
= self
._get
_attrdict
(
132 "reading_meaning/rmgroup/meaning", "m_lang")
133 if None in meaning_d
:
134 meaning_d
['en'] = meaning_d
[None]
    def _get_dictcodes(self):
        """Returns dictionary of dic_ref node lists, keyed by dr_type."""
        return self._get_attrdict("dic_number/dic_ref", "dr_type")
    def _get_querycodes(self):
        """Returns dictionary of q_code node lists, keyed by qc_type."""
        return self._get_attrdict("query_code/q_code", "qc_type")
144 def get_nanori(self
):
145 nanori
= map(xml2text
, self
._get
_nanori
_nodes
() or [])
147 return _(u
"%s: %s") % (_(u
"Nanori"), u
"、".join(nanori
))
149 def get_readings(self
, rtypes
):
150 """Gets readings as text strings.
152 Takes in any number of reading keys, and returns a list
153 containing user-friendly output strings.
155 Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
158 Note: Nanori is also handled independently, as it is stored
159 differently than the other readings.
163 "ja_on": _(u
"On-yomi"),
164 "ja_kun": _(u
"Kun-yomi"),
165 "korean_h": _(u
"Korean (Hangul)"),
166 "korean_r": _(u
"Korean (Romanized)"),
167 "pinyin": _(u
"Pinyin"),
169 romanized
= ("korean_r", "pinyin")
170 readings
= mapdict(xml2text
, self
._get
_reading
_nodes
())
174 s
= self
.get_nanori()
178 if rt
not in readings
:
180 separator
= u
", " if rt
in romanized
else u
"、"
181 reading_str
= separator
.join(readings
[rt
])
182 pieces
.append(_(u
"%s: %s") % (d
[rt
], reading_str
))
185 def get_meanings(self
):
186 meanings
= mapdict(xml2text
, self
._get
_meaning
_nodes
())
188 for lang
in sorted(meanings
):
189 pieces
.append(_(u
"Meanings (%s): %s") %
190 (lang
, u
"; ".join(meanings
[lang
])))
193 def get_dict_codes(self
, keys
, all
=False):
194 """Gets dictionary codes as strings for display to the user.
196 Accepts a list of dictionary keys. To get all keys, set the
197 all keyword to true. (The keys parameter will be ignored in
202 dicts
= self
._get
_dictcodes
()
203 for dcode
in sorted(dicts
):
204 if (not all
) and dcode
not in keys
:
207 assert len(nodes
) == 1, _(
208 u
"Character %s: Multiple (%d) entries found for "
210 (self
._get
_literal
(), len(nodes
), dcode
)
212 dname
= kanjidic2_key_to_str(dcode
)
214 s
= _(u
"Index %s, volume %s, page %s") % \
215 (o
.text
, o
.attrib
['m_vol'], o
.attrib
['m_page'])
218 pieces
.append(_(u
"%s: %s") % (dname
, s
))
221 def get_query_codes(self
, keys
, all
=False):
223 qcodes
= self
._get
_querycodes
()
224 for qcode
in sorted(qcodes
):
225 if (not all
) and qcode
not in keys
:
227 nodes
= qcodes
[qcode
]
228 qname
= qcode_to_desc(qcode
)
232 d
.setdefault(o
.attrib
.get("skip_misclass"), []).append(o
)
233 for misclass
in sorted(d
):
235 outname
= _(u
"%s miscode (%s)") % (qname
, misclass
)
238 s
= u
", ".join(o
.text
for o
in d
[misclass
])
239 pieces
.append(_(u
"%s: %s") % (outname
, s
))
241 s
= u
", ".join(o
.text
for o
in nodes
)
242 pieces
.append(_(u
"%s: %s") % (qname
, s
))
245 def __unicode__(self
):
247 def indent_strs(strs
):
248 return [u
" %s" % s
for s
in strs
]
252 pieces
.append(u
"=" * 70)
253 pieces
.append(_(u
"Literal: %s") % self
.literal
)
254 pieces
.append(u
"-" * 70)
256 pieces
.append(_(u
"Readings:"))
257 r_strs
= indent_strs(self
.get_readings(
258 ("ja_on", "ja_kun", "nanori", "korean_h", "korean_r", "pinyin")))
259 pieces
.extend(r_strs
)
260 pieces
.append(u
"-" * 70)
262 m_strs
= indent_strs(self
.get_meanings())
263 pieces
.extend(m_strs
)
264 pieces
.append(u
"-" * 70)
266 pieces
.append(_(u
"Miscellaneous:"))
267 jlpt
= self
.get_jlpt()
269 pieces
.append(_(u
" JLPT grade level: %d") % jlpt
)
270 grade
= self
.get_grade()
272 pieces
.append(_(u
" Jouyou grade level: %d") % grade
)
273 freq
= self
.get_freq()
275 pieces
.append(_(u
" Newspaper frequency: %d") % freq
)
276 strokes
, misstrokes
= self
.get_strokes()
277 pieces
.append(_(u
" Stroke count: %d") % strokes
)
279 pieces
.append(_(u
" Common stroke miscounts: %s") %
280 ", ".join(map(str, misstrokes
)))
281 pieces
.append(u
"-" * 70)
283 pieces
.append(_(u
"Dictionary codes:"))
284 d_strs
= indent_strs(self
.get_dict_codes([], all
=True))
285 pieces
.extend(d_strs
)
286 pieces
.append(u
"-" * 70)
288 pieces
.append(_(u
"Query codes:"))
289 qc_strs
= indent_strs(self
.get_query_codes([], all
=True))
290 pieces
.extend(qc_strs
)
291 pieces
.append(u
"-" * 70)
293 pieces
.append(_(u
"Other information:"))
296 #rad_strs = self.get_rad_info()
298 # CODEPOINT node info
299 #cp_strs = indent_strs(self.get_codepoints())
300 pieces
.append(_(u
" Unicode value: %04X") % ord(self
.literal
))
303 #variant_strs = self.get_variants() # AKA cross refs
304 #radname_strs = self.get_radical_name() # "T2" KANJIDIC code
306 pieces
.append(u
"=" * 70)
308 return u
"\n".join(pieces
)
311 class Parser(object):
313 def __init__(self
, filename
, encoding
="utf-8"):
314 """Initializer for Kanjidic2Parser.
316 About use_cache: Kanjidic2 is a large, heavy to parse file.
317 Although it takes a large amount of memory, it is better to
318 retain it in memory to increase the speed of subsequent
322 if not os
.path
.exists(filename
):
323 raise Exception(u_("Dictionary file does not exist."))
324 self
.filename
= filename
325 self
.encoding
= encoding
327 self
.header
, self
.characters
= self
.load_via_etree()
328 self
._check
_version
()
330 def _check_version(self
):
331 version
= int(self
.header
.find('file_version').text
)
332 assert version
>= 4, _(
333 u
"This parser won't work with versions of KANJIDIC2 "
334 u
"older than version 4.")
336 s
= _(u
"Parser version is for version 4, detected version is %d"
340 def load_via_etree(self
):
341 if len(self
.filename
) >= 3 and self
.filename
[-3:] == ".gz":
342 f
= gzip
.open(self
.filename
)
344 f
= open(self
.filename
, "rb")
345 et
= ElementTree(file=f
)
347 nodes
= et
.getroot().getchildren()
348 header
, characters
= nodes
[0], nodes
[1:]
349 characters
= [Kanjidic2Node(char
) for char
in characters
]
350 return header
, characters
352 def get_header(self
):
354 for o
in self
.header
.getchildren():
355 cdata
= u
"".join((o
.text
, o
.tail
)).strip()
357 return u
"\n".join(u
"%s: %s" % (k
, d
[k
]) for k
in sorted(d
))
359 def search(self
, query
):
360 self
.create_indices()
362 c
= self
.by_kanji
.get(u
)
366 def create_indices(self
):
371 for char
in self
.characters
:
372 literal
= char
.xml
.find("literal").text
.strip()
373 self
.by_kanji
[literal
] = char
def encode_or_else(s):
    """Return s with any line that cannot be encoded for the console
    replaced by a placeholder message.

    NOTE(review): reconstructed — only the encode probe and the final
    join survived in the damaged source.  The charset choice mirrors
    the cp932-on-Windows convention used by the __main__ driver.
    """
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"
    lines = s.split(u"\n")
    out = []
    for line in lines:
        try:
            # Probe encodability only; keep the unicode line so the
            # final u"\n".join below stays well-formed.
            val = line.encode(charset)
            out.append(line)
        except UnicodeEncodeError:
            out.append(_(u"<Line could not be encoded>"))
    return u"\n".join(out)
392 if __name__
== "__main__":
396 dfname
, args
= sys
.argv
[1], sys
.argv
[2:]
398 except (IndexError, AssertionError):
399 print _(u
"Syntax: %s <dict_file> <character [...]>") % sys
.argv
[0]
405 print _(u
"Could not create Kanjidic2Parser: %s") % unicode(e
)
417 print u
"%d characters found" % len(p
.characters
)
419 for i
, kanji
in enumerate(p
.search("".join(args
).decode(charset
))):
420 kstr
= encode_or_else(unicode(kanji
))
421 print _(u
"Entry %d:\n%s\n") % (i
+1, kstr
)