2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
33 from __future__
import absolute_import
35 import os
, gzip
, gettext
, warnings
36 from xml
.etree
.cElementTree
import ElementTree
37 gettext
.install('pyjben', unicode=True)
39 from .kanjidic_common \
40 import jstring_convert
, kanjidic2_key_to_str
, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Kuten string to hex conversion.

    Converts a kuten reference of the form u"KU-TEN" (e.g. u"16-02")
    into a JIS hex code: each piece is offset by 0x20 and packed as
    (ku << 8) | ten.

    Raises ValueError if a piece is not a valid integer, or if the
    string does not contain exactly two '-'-separated pieces.
    """
    # Unpack explicitly rather than indexing a map() result: under
    # Python 3, map() returns a one-shot iterator which is not
    # subscriptable, so the original pieces[0]/pieces[1] would fail.
    ku, ten = (int(piece) for piece in kuten.split(u'-'))
    return ((ku + 0x20) << 8) + (ten + 0x20)
# NOTE(review): fragment of a dict-mapping helper (``mapdict(fn, d)``-style:
# apply ``fn`` over every value list of ``d``).  The enclosing ``def``, the
# ``result = {}`` initialization and the ``return result`` are not visible
# in this view -- confirm against the full file.
# Py2-only APIs here: ``dict.iteritems`` and list-returning ``map``.
for k, v in d.iteritems():
    result[k] = map(fn, v)
class Kanjidic2Node(object):

    """Wrapper around one <character> element of a KANJIDIC2 file."""

    def __init__(self, xml_node):
        """Store the XML node and cache the character literal.

        xml_node: an ElementTree element for a single <character> entry.
        """
        # Every accessor of this class queries self.xml via find()/findall(),
        # so the node must be stored; the visible original accepted xml_node
        # but never assigned it anywhere.
        self.xml = xml_node
        self.literal = self._get_literal()
63 def _get_literal(self
):
64 literal
= self
.xml
.find("literal").text
.strip()
65 assert len(literal
) == 1, _(u
"Literal has more than one character!")
# NOTE(review): body of a Jouyou-grade accessor (``def get_grade(self):``
# presumably sits just above this view -- confirm).
o = self.xml.find("misc/grade")
# NOTE(review): ``if o`` uses Element truthiness, which means "has child
# elements", NOT "was found"; a leaf <grade> element is falsy, so this
# very likely always returns None.  ``if o is not None`` is intended --
# confirm and fix together with the identical pattern in get_jlpt.
return int(o.text) if o else None
# NOTE(review): body of a newspaper-frequency accessor (``def get_freq``
# presumably); the absent-element early return and the final
# ``return int(o[0].text)`` are not visible in this view -- confirm.
# By the spec, it seems like multiple freqs are possible??
# So... let's get all entries and assert.
o = self.xml.findall("misc/freq")
assert len(o) == 1, _(
    u"Character %s: Expected 1 freq entry, found %d") % \
    (self._get_literal(), len(o))
# NOTE(review): body of a JLPT-level accessor (``def get_jlpt(self):``
# presumably sits just above this view -- confirm).
o = self.xml.find("misc/jlpt")
# NOTE(review): same Element-truthiness pitfall as get_grade -- ``if o``
# is false for a found-but-childless <jlpt> element, so this likely
# always returns None; ``if o is not None`` is intended.
return int(o.text) if o else None
def get_strokes(self):
    """Get stroke count information.

    Returns a tuple of (stroke_count, miscounts), where miscounts
    is either None or a list of common miscounts for the
    character.
    """
    nodes = self.xml.findall("misc/stroke_count")
    # The first <stroke_count> is the accepted count; any further ones
    # are commonly-seen miscounts.
    scnode, misnodes = nodes[0], nodes[1:]
    # NOTE(review): ``scnode`` is bound but nodes[0] is re-read below.
    sc = int(nodes[0].text)
    # NOTE(review): the empty-``misnodes`` branch and the final
    # ``return sc, miss`` (orig lines 98, 100-102) are not visible in
    # this view -- confirm against the full file.
    miss = map(int, [o.text for o in misnodes])
104 def _get_nanori_nodes(self
):
105 nodes
= self
.xml
.findall("reading_meaning/nanori")
108 def _get_attrdict(self
, path
, attr_name
):
109 """Helper: stores elements on path in dict, keyed by attribute."""
111 nodes
= self
.xml
.findall(path
)
112 #attrs = set(o.attrib.get(attr_name) for o in nodes)
114 d
.setdefault(o
.attrib
.get(attr_name
), []).append(o
)
116 # d[attr] = [o for o in nodes
117 # if o.attrib.get(attr_name) == attr]
def _get_reading_nodes(self):
    """Group this entry's <reading> elements by their r_type attribute."""
    reading_path = "reading_meaning/rmgroup/reading"
    return self._get_attrdict(reading_path, "r_type")
def _get_meaning_nodes(self):
    """Returns dictionary of gloss lists, keyed by language prefix."""
    meaning_d = self._get_attrdict(
        "reading_meaning/rmgroup/meaning", "m_lang")
    # English glosses carry no m_lang attribute, so _get_attrdict files
    # them under None; expose them under an explicit 'en' key instead.
    if None in meaning_d:
        meaning_d['en'] = meaning_d[None]
    # NOTE(review): the expected ``del meaning_d[None]`` and the final
    # ``return meaning_d`` (orig lines 130-131) are not visible in this
    # view -- confirm against the full file.
def _get_dictcodes(self):
    """Group this entry's <dic_ref> elements by their dr_type attribute."""
    ref_path = "dic_number/dic_ref"
    return self._get_attrdict(ref_path, "dr_type")
def _get_querycodes(self):
    """Group this entry's <q_code> elements by their qc_type attribute."""
    code_path = "query_code/q_code"
    return self._get_attrdict(code_path, "qc_type")
def get_nanori(self):
    """Return a display string of nanori (name) readings.

    NOTE(review): an early return for the no-nanori case (orig line 141)
    is not visible in this view -- confirm.
    """
    # xml2text is defined elsewhere in the module; presumably it extracts
    # Element.text -- not visible in this view, confirm.
    nanori = map(xml2text, self._get_nanori_nodes() or [])
    return _(u"%s: %s") % (_(u"Nanori"), u"、".join(nanori))
def get_readings(self, rtypes):
    """Gets readings as text strings.

    Takes in any number of reading keys, and returns a list
    containing user-friendly output strings.

    Valid keys include: ja_on, ja_kun, korean_h, korean_r, pinyin,
    and (per the note below) nanori.

    Note: Nanori is also handled independently, as it is stored
    differently than the other readings.
    """
    # NOTE(review): several lines are missing from this view: the ``d = {``
    # opener for the label table below, its closing brace, ``pieces = []``,
    # the ``for rt in rtypes:`` loop header, the nanori branch around the
    # ``s = self.get_nanori()`` call, the ``continue`` after the membership
    # test, and the final ``return pieces``.  Confirm against the full file.
        "ja_on": _(u"On-yomi"),
        "ja_kun": _(u"Kun-yomi"),
        "korean_h": _(u"Korean (Hangul)"),
        "korean_r": _(u"Korean (Romanized)"),
        "pinyin": _(u"Pinyin"),
    # Romanized reading types are joined with ", "; kana/hangul types use
    # the ideographic comma.
    romanized = ("korean_r", "pinyin")
    readings = mapdict(xml2text, self._get_reading_nodes())
    s = self.get_nanori()
    if rt not in readings:
    separator = u", " if rt in romanized else u"、"
    reading_str = separator.join(readings[rt])
    pieces.append(_(u"%s: %s") % (d[rt], reading_str))
def get_meanings(self):
    """Return display strings for the glosses, one line per language."""
    meanings = mapdict(xml2text, self._get_meaning_nodes())
    # NOTE(review): ``pieces = []`` (orig line 182) and the trailing
    # ``return pieces`` are not visible in this view -- confirm.
    # Sort languages for deterministic output order.
    for lang in sorted(meanings):
        pieces.append(_(u"Meanings (%s): %s") %
                      (lang, u"; ".join(meanings[lang])))
def get_dict_codes(self, keys, all=False):
    """Gets dictionary codes as strings for display to the user.

    Accepts a list of dictionary keys. To get all keys, set the
    all keyword to true. (The keys parameter will be ignored in
    this case.)

    NOTE(review): the parameter name ``all`` shadows the builtin; it is
    part of the public keyword interface (callers use ``all=True``), so
    it cannot be renamed safely.
    """
    dicts = self._get_dictcodes()
    for dcode in sorted(dicts):
        if (not all) and dcode not in keys:
            # NOTE(review): the ``continue`` body of this guard and the
            # ``nodes = dicts[dcode]`` line (orig 200-201) are not
            # visible in this view -- confirm.
        assert len(nodes) == 1, _(
            u"Character %s: Multiple (%d) entries found for "
            # NOTE(review): the continuation of this format string and
            # the ``%`` operator line (orig 204) are missing here.
            (self._get_literal(), len(nodes), dcode)
        dname = kanjidic2_key_to_str(dcode)
        # NOTE(review): the branch selecting this Morohashi-style detail
        # string, the ``o = nodes[0]`` binding, its ``else`` arm, and the
        # final ``return pieces`` (orig 206, 208, 211-212, 214-215) are
        # not visible in this view -- confirm.
        s = _(u"Index %s, volume %s, page %s") % \
            (o.text, o.attrib['m_vol'], o.attrib['m_page'])
        pieces.append(_(u"%s: %s") % (dname, s))
def get_query_codes(self, keys, all=False):
    """Gets query codes as display strings (SKIP, four-corner, etc.).

    Accepts a list of query-code keys; with all=True every code is
    emitted and ``keys`` is ignored (mirrors get_dict_codes).
    """
    qcodes = self._get_querycodes()
    for qcode in sorted(qcodes):
        if (not all) and qcode not in keys:
            # NOTE(review): the ``continue`` body of this guard
            # (orig 221) is not visible in this view -- confirm.
        nodes = qcodes[qcode]
        qname = qcode_to_desc(qcode)
        # NOTE(review): the SKIP-specific branch header, ``d = {}`` init
        # and the ``for o in nodes:`` loop (orig 224-226) feeding the
        # setdefault below are not visible in this view -- confirm.
        d.setdefault(o.attrib.get("skip_misclass"), []).append(o)
        # Group SKIP codes by their misclassification tag.
        for misclass in sorted(d):
            # NOTE(review): the guard distinguishing real miscodes from
            # the None (no-misclass) group (orig 229, 231-232) is not
            # visible here -- confirm.
            outname = _(u"%s miscode (%s)") % (qname, misclass)
            s = u", ".join(o.text for o in d[misclass])
            pieces.append(_(u"%s: %s") % (outname, s))
        # Non-SKIP codes: emit all values on one line.
        # NOTE(review): the ``else:`` introducing this arm and the final
        # ``return pieces`` (orig 235, 238-239) are missing in this view.
        s = u", ".join(o.text for o in nodes)
        pieces.append(_(u"%s: %s") % (qname, s))
def __unicode__(self):
    """Render the whole entry as a multi-line, user-readable report."""
    # NOTE(review): ``pieces = []`` (orig 241-242) and the conditional
    # guards around the jlpt/grade/freq/miscount lines (orig 261, 264,
    # 267, 271) are not visible in this view -- confirm against the full
    # file before treating this body as complete.
    pieces.append(u"=" * 70)
    pieces.append(_(u"Literal: %s") % self.literal)
    pieces.append(u"-" * 70)
    # Readings section.
    pieces.append(_(u"Readings:"))
    r_strs = [u" %s" % s for s in
              # NOTE(review): the ``self.get_readings(`` call opening
              # this argument tuple (orig 249) is missing in this view.
              ("ja_on", "ja_kun", "nanori",
               "korean_h", "korean_r", "pinyin"))]
    pieces.extend(r_strs)
    pieces.append(u"-" * 70)
    # Meanings section.
    m_strs = [u" %s" % s for s in self.get_meanings()]
    pieces.extend(m_strs)
    pieces.append(u"-" * 70)
    # Miscellaneous stats (JLPT / Jouyou grade / frequency / strokes).
    pieces.append(_(u"Miscellaneous:"))
    jlpt = self.get_jlpt()
    pieces.append(_(u" JLPT grade level: %d") % jlpt)
    grade = self.get_grade()
    pieces.append(_(u" Jouyou grade level: %d") % grade)
    freq = self.get_freq()
    pieces.append(_(u" Newspaper frequency: %d") % freq)
    strokes, misstrokes = self.get_strokes()
    pieces.append(_(u" Stroke count: %d") % strokes)
    pieces.append(_(u" Common stroke miscounts: %s") %
                  ", ".join(map(str, misstrokes)))
    pieces.append(u"-" * 70)
    # Dictionary codes (all of them).
    pieces.append(_(u"Dictionary codes:"))
    d_strs = [u" %s" % s for s in self.get_dict_codes([], all=True)]
    pieces.extend(d_strs)
    pieces.append(u"-" * 70)
    # Query codes (all of them).
    pieces.append(_(u"Query codes:"))
    qc_strs = [u" %s" % s for s in self.get_query_codes([], all=True)]
    pieces.extend(qc_strs)
    pieces.append(u"-" * 70)
    pieces.append(_(u"Other information:"))
    #cp_strs = self.get_codepoints()
    #rad_strs = self.get_rad_info()
    #variant_strs = self.get_variants()
    pieces.append(_(u" Unicode value: %04X") % ord(self.literal))
    pieces.append(u"=" * 70)
    return u"\n".join(pieces)
class Parser(object):

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        About use_cache: Kanjidic2 is a large, heavy to parse file.
        Although it takes a large amount of memory, it is better to
        retain it in memory to increase the speed of subsequent
        searches.

        NOTE(review): no ``use_cache`` parameter exists on this visible
        signature -- the docstring may be stale; confirm.
        """
        if not os.path.exists(filename):
            # NOTE(review): ``u_`` is not defined anywhere in this view;
            # every other message uses the ``_(u"...")`` pattern, so this
            # looks like a typo -- confirm before shipping.
            raise Exception(u_("Dictionary file does not exist."))
        self.filename = filename
        self.encoding = encoding
        # Parse eagerly; header and character list are kept in memory.
        self.header, self.characters = self.load_via_etree()
        self._check_version()
def _check_version(self):
    """Assert the loaded KANJIDIC2 file is format version 4 or newer."""
    version = int(self.header.find('file_version').text)
    assert version >= 4, _(
        u"This parser won't work with versions of KANJIDIC2 "
        u"older than version 4.")
    # NOTE(review): the guard around the warning below (orig 321, likely
    # ``if version > 4:``), the closing of this call, and the
    # ``warnings.warn(s)`` line (orig 323-325) are not visible in this
    # view -- confirm against the full file.
    s = _(u"Parser version is for version 4, detected version is %d"
def load_via_etree(self):
    """Parse self.filename (optionally gzip-compressed) with ElementTree.

    Returns (header, characters): the <header> element and a list of
    Kanjidic2Node wrappers, one per <character> element.
    """
    # endswith() replaces the equivalent ``len(...) >= 3 and
    # self.filename[-3:] == ".gz"`` check.
    if self.filename.endswith(".gz"):
        f = gzip.open(self.filename)
    else:
        f = open(self.filename, "rb")
    try:
        et = ElementTree(file=f)
    finally:
        # Fix: always release the file handle, even if parsing fails;
        # the visible original never closed it.
        f.close()
    # list(root) replaces root.getchildren(), which is deprecated and
    # removed in Python 3.9; iteration order is identical.
    nodes = list(et.getroot())
    header, characters = nodes[0], nodes[1:]
    characters = [Kanjidic2Node(char) for char in characters]
    return header, characters
def get_header(self):
    """Render the KANJIDIC2 <header> children as "tag: value" lines."""
    # NOTE(review): the ``d = {}`` init and the ``d[o.tag] = cdata``
    # store (orig 339, 342) are not visible in this view -- confirm.
    # NOTE(review): getchildren() is deprecated (removed in Py3.9), and
    # joining (o.text, o.tail) raises TypeError if either is None --
    # worth hardening when the full file is available.
    for o in self.header.getchildren():
        cdata = u"".join((o.text, o.tail)).strip()
    return u"\n".join(u"%s: %s" % (k, d[k]) for k in sorted(d))
def search(self, query):
    """Look up characters of ``query`` in the kanji index."""
    # Build the literal -> node index lazily on first use.
    self.create_indices()
    # NOTE(review): the loop header over the query characters (orig 347,
    # likely ``for u in query:``) and the hit-handling tail (orig
    # 349-351, likely ``if c: yield c``) are not visible in this view --
    # confirm against the full file.
    c = self.by_kanji.get(u)
def create_indices(self):
    """Build the literal -> Kanjidic2Node lookup table."""
    # NOTE(review): the already-built guard and the ``self.by_kanji = {}``
    # initialization (orig 353-356) are not visible in this view --
    # confirm against the full file.
    for char in self.characters:
        # NOTE(review): re-parses <literal> although each node caches it
        # as char.literal in __init__ -- could reuse that instead.
        literal = char.xml.find("literal").text.strip()
        self.by_kanji[literal] = char
def encode_or_else(s):
    """Best-effort encode of a unicode string for console output."""
    # NOTE(review): the charset selection, ``out = []`` init, the
    # per-line loop with its try/except fallback, and the append of
    # ``val`` (orig 363-366, 368-370, 372-374) are not visible in this
    # view -- confirm against the full file.
    lines = s.split(u"\n")
    val = line.encode(charset)
    return u"\n".join(out)
# Command-line driver: look up characters in a KANJIDIC2 file.
# Python 2 script (print statements, unicode(), .decode on str).
if __name__ == "__main__":
    # NOTE(review): the ``try:`` opening this argument-parsing block
    # (orig 379-381, probably including ``import sys``) is not visible
    # in this view -- confirm against the full file.
    dfname, args = sys.argv[1], sys.argv[2:]
    except (IndexError, AssertionError):
        print _(u"Syntax: %s <dict_file> <character [...]>") % sys.argv[0]
        # NOTE(review): the exit call and the try/except constructing
        # ``p = Parser(dfname)`` (orig 386-390) are missing here.
    print _(u"Could not create Kanjidic2Parser: %s") % unicode(e)
    # NOTE(review): orig lines 392-402 (charset detection, error exit)
    # are not visible in this view.
    print u"%d characters found" % len(p.characters)
    for i, kanji in enumerate(p.search("".join(args).decode(charset))):
        kstr = encode_or_else(unicode(kanji))
        print _(u"Entry %d:\n%s\n") % (i+1, kstr)