2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
# NOTE(review): mangled import/setup block -- each statement below is split
# across several physical lines.  Visible: __future__ absolute_import;
# stdlib os/gzip/gettext; cElementTree's ElementTree; gettext.install
# (py2-only unicode=True keyword, presumably installing `_` for the
# __main__ driver below); and three helpers from the sibling
# kanjidic_common module.
33 from __future__
import absolute_import
35 import os
, gzip
, gettext
36 from xml
.etree
.cElementTree
import ElementTree
37 gettext
.install('pyjben', unicode=True)
39 from .kanjidic_common \
40 import jstring_convert
, kanjidic2_key_to_str
, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a JIS kuten string (e.g. u"16-02") to its hex code point.

    Each dash-separated piece is offset by 0x20; the first piece forms the
    high byte and the second the low byte of the result.

    kuten: unicode string of the form u"<ku>-<ten>".
    Returns: int.
    """
    # List comprehension rather than map(): indexable under both Python 2
    # and Python 3 (map() returns an iterator on py3).
    pieces = [int(piece) for piece in kuten.split(u'-')]
    # Leftover DEBUG print removed; it wrote to stdout on every call.
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
# NOTE(review): this region arrived mangled -- each original source line is
# split across several physical lines, and some original lines (e.g. 52,
# 58-60, 62-64, 68-70, 73-75, 77-78, 81-82, 85, 88, 90, 93-94) are missing
# entirely.  The comments below describe only what the visible fragments
# establish; reconstruct against the upstream file before editing the code.
49 class Kanjidic2Node(object):
# Wraps one <character> element of a KANJIDIC2 file.  Original line 52 is
# missing -- presumably "self.xml = xml_node", since the methods below all
# read self.xml; TODO confirm.
51 def __init__(self
, xml_node
):
53 self
.literal
= self
._get
_literal
()
# Reads the single-character <literal> text and asserts its length; the
# return statement (original line 58) is not visible here.
55 def _get_literal(self
):
56 literal
= self
.xml
.find("literal").text
.strip()
57 assert len(literal
) == 1, u
"Literal has more than one character!"
# Fragment of (presumably) _get_grade: looks up <misc><grade>; the
# enclosing def and the return are missing from this view.
61 o
= self
.xml
.find("misc/grade")
# Fragment of (presumably) _get_freq: collects all <misc><freq> entries and
# asserts exactly one; the def, assert head, and return are missing.
65 # By the spec, it seems like multiple freqs are possible??
66 # So... let's get all entries and assert.
67 o
= self
.xml
.findall("misc/freq")
71 u
"Character %s: Expected 1 freq entry, found %d" %
72 (self
._get
_literal
(), len(o
)))
# Fragment of (presumably) _get_jlpt: looks up <misc><jlpt>; def/return
# not visible.
76 o
= self
.xml
.find("misc/jlpt")
# Collects <reading_meaning><nanori> nodes; the return (line 81) is missing.
79 def _get_nanori_nodes(self
):
80 nodes
= self
.xml
.findall("reading_meaning/nanori")
# Groups the elements found on `path` into a dict keyed by the value of
# `attr_name` (None key for elements lacking the attribute).  The dict
# initialization, the loop header over `nodes`, and the return statement
# (original lines 85/88/90/93-94) are missing from this view.
83 def _get_attrdict(self
, path
, attr_name
):
84 """Helper: stores elements on path in dict, keyed by attribute."""
86 nodes
= self
.xml
.findall(path
)
87 #attrs = set(o.attrib.get(attr_name) for o in nodes)
89 d
.setdefault(o
.attrib
.get(attr_name
), []).append(o
)
91 # d[attr] = [o for o in nodes
92 # if o.attrib.get(attr_name) == attr]
def _get_reading_nodes(self):
    """Returns dictionary of reading lists, keyed by type (r_type)."""
    reading_path = "reading_meaning/rmgroup/reading"
    return self._get_attrdict(reading_path, "r_type")
# NOTE(review): mangled -- original lines 105-107 are missing; presumably
# the None key is removed after being aliased to 'en' and meaning_d is
# returned, but that cannot be confirmed from this view.
99 def _get_meaning_nodes(self
):
100 """Returns dictionary of gloss lists, keyed by language prefix."""
101 meaning_d
= self
._get
_attrdict
(
102 "reading_meaning/rmgroup/meaning", "m_lang")
# Glosses without an m_lang attribute land under the None key; expose
# them under 'en' as well.
103 if None in meaning_d
:
104 meaning_d
['en'] = meaning_d
[None]
def _get_dictcodes(self):
    """Returns dictionary of dic_ref node lists, keyed by dr_type."""
    dic_ref_path = "dic_number/dic_ref"
    return self._get_attrdict(dic_ref_path, "dr_type")
def _get_querycodes(self):
    """Returns dictionary of q_code node lists, keyed by qc_type."""
    q_code_path = "query_code/q_code"
    return self._get_attrdict(q_code_path, "qc_type")
# NOTE(review): __unicode__ renders the multi-line console dump for one
# kanji.  This region is mangled: statements are split across physical
# lines and many original lines (115-120, 123-124, 133-135, 137, 139, 142,
# 151, 153, 157, 160-161, 163, 165, 167, 170, 172, 175, 177, 179, 182-183,
# 185, 188, 191, 193, 197, 199, 201, 203) are missing, including the local
# helper definitions (mapdict/xml2text, presumably), the `pieces` list
# initialization, and the guards around optional fields.  Comments below
# describe only what the visible fragments establish.
114 def __unicode__(self
):
# Fragment of a local helper (presumably mapdict): applies fn to every
# value list of d -- py2 dict.iteritems().
121 for k
, v
in d
.iteritems():
122 result
[k
] = map(fn
, v
)
# Gather readings/meanings/nanori plus misc fields via the _get_* helpers.
125 readings
= mapdict(xml2text
, self
._get
_reading
_nodes
())
126 meanings
= mapdict(xml2text
, self
._get
_meaning
_nodes
())
127 nanori
= map(xml2text
, self
._get
_nanori
_nodes
())
128 grade
= self
._get
_grade
()
129 jlpt
= self
._get
_jlpt
()
130 freq
= self
._get
_freq
()
131 dicts
= self
._get
_dictcodes
()
132 qcodes
= self
._get
_querycodes
()
# Output rows accumulate into `pieces` (initialization not visible) and
# are newline-joined at the end.
136 pieces
.append(u
"=" * 70)
138 pieces
.append(u
"Literal: %s" % self
.literal
)
140 pieces
.append(u
"-" * 70)
141 pieces
.append(u
"Readings:")
143 pieces
.append(u
" On-yomi: %s" % u
"、".join(readings
['ja_on']))
144 pieces
.append(u
" Kun-yomi: %s" % u
"、".join(readings
['ja_kun']))
145 pieces
.append(u
" Nanori: %s" % u
"、".join(nanori
))
146 pieces
.append(u
" Korean (Hangul): %s" %
147 u
", ".join(readings
['korean_h']))
148 pieces
.append(u
" Korean (Romanized): %s" %
149 u
", ".join(readings
['korean_r']))
150 pieces
.append(u
" Pinyin: %s" % u
", ".join(readings
['pinyin']))
152 pieces
.append(u
"-" * 70)
# One "Meanings (lang): ..." row per language key.
154 for lang
in sorted(meanings
):
155 pieces
.append(u
"Meanings (%s): %s" %
156 (lang
, u
"; ".join(meanings
[lang
])))
158 pieces
.append(u
"-" * 70)
159 pieces
.append(u
"Miscellaneous:")
# jlpt/grade/freq rows; the None guards (presumably on the missing lines
# 161/163/165) are not visible here.
162 pieces
.append(u
" JLPT grade level: %d" % jlpt
)
164 pieces
.append(u
" Jouyou grade level: %d" % grade
)
166 pieces
.append(u
" Newspaper frequency: %d" % freq
)
168 pieces
.append(u
"-" * 70)
169 pieces
.append(u
"Dictionary codes:")
# One row per dictionary code; exactly one dic_ref node is expected per
# key (the `nodes` assignment for this loop is on a missing line).
171 for dcode
in sorted(dicts
):
173 assert len(nodes
) == 1, (
174 u
"Character %s: Multiple (%d) entries found for "
176 (self
._get
_literal
(), len(nodes
), dcode
))
178 dname
= kanjidic2_key_to_str(dcode
)
# Branch for entries carrying volume/page attributes (m_vol/m_page); the
# branch header and the plain-index alternative are on missing lines.
180 s
= u
"Index %s, volume %s, page %s" % \
181 (o
.text
, o
.attrib
['m_vol'], o
.attrib
['m_page'])
184 pieces
.append(u
" %s: %s" % (dname
, s
))
186 pieces
.append(u
"-" * 70)
187 pieces
.append(u
"Query codes:")
189 for qcode
in sorted(qcodes
):
190 nodes
= qcodes
[qcode
]
192 # SKIP has miscodes; do later
194 s
= u
", ".join(o
.text
for o
in nodes
)
195 qname
= qcode_to_desc(qcode
)
196 pieces
.append(u
" %s: %s" % (qname
, s
))
198 pieces
.append(u
"-" * 70)
200 pieces
.append(u
"Unicode value: %04X" % ord(self
.literal
))
202 pieces
.append(u
"=" * 70)
204 return u
"\n".join(pieces
)
# NOTE(review): mangled region -- original lines 211, 215-217 and 222 are
# missing, including the tail (and closing quotes) of the __init__
# docstring; what line 222 held cannot be determined from this view.
207 class Parser(object):
209 def __init__(self
, filename
, encoding
="utf-8"):
210 """Initializer for Kanjidic2Parser.
212 About use_cache: Kanjidic2 is a large, heavy to parse file.
213 Although it takes a large amount of memory, it is better to
214 retain it in memory to increase the speed of subsequent
# (docstring continues on missing lines; closing quotes not visible here)
# Fail fast if the dictionary file is absent.
218 if not os
.path
.exists(filename
):
219 raise Exception("Dictionary file does not exist.")
# Store configuration, then eagerly parse the whole file.
220 self
.filename
= filename
221 self
.encoding
= encoding
223 self
.header
, self
.characters
= self
.load_via_etree()
def load_via_etree(self):
    """Parse the dictionary file and return (header, characters).

    Transparently handles gzip-compressed files (KANJIDIC2 is normally
    distributed gzipped).  `header` is the raw <header> element;
    `characters` is a list of Kanjidic2Node wrappers, one per
    <character> element.
    """
    # Sniff compression by extension; endswith() covers the old
    # len() >= 3 guard as well.
    if self.filename.endswith(".gz"):
        f = gzip.open(self.filename)
    else:
        f = open(self.filename, "rb")
    try:
        et = ElementTree(file=f)
    finally:
        # Close the handle explicitly instead of leaking it to the GC.
        f.close()
    # list(root) replaces the deprecated getchildren(); the first child
    # is <header>, the rest are <character> elements.
    nodes = list(et.getroot())
    header, characters = nodes[0], nodes[1:]
    characters = [Kanjidic2Node(char) for char in characters]
    return header, characters
# NOTE(review): mangled -- original lines 238 and 241 are missing;
# presumably 238 initializes the dict `d` and 241 stores cdata into d
# keyed by the element tag, but that cannot be confirmed from this view.
237 def get_header(self
):
# Walk the direct children of the <header> element (py2 getchildren()).
239 for o
in self
.header
.getchildren():
240 cdata
= u
"".join((o
.text
, o
.tail
)).strip()
# Render one "tag: value" row per collected entry, sorted by key.
242 return u
"\n".join(u
"%s: %s" % (k
, d
[k
]) for k
in sorted(d
))
# NOTE(review): mangled -- original lines 246 and 248-249 are missing; the
# fragment suggests iterating over `query` one character at a time (`u`)
# with a by_kanji lookup, presumably yielding hits, but that cannot be
# confirmed from this view.
244 def search(self
, query
):
# Indices are built lazily on first use.
245 self
.create_indices()
247 c
= self
.by_kanji
.get(u
)
# NOTE(review): mangled -- original lines 252-255 are missing, presumably
# including the docstring, an already-indexed early return, and the
# "self.by_kanji = {}" initialization that this loop requires; confirm
# against the upstream file.
251 def create_indices(self
):
# Map each character's single-char <literal> text to its node.
256 for char
in self
.characters
:
257 literal
= char
.xml
.find("literal").text
.strip()
258 self
.by_kanji
[literal
] = char
# NOTE(review): heavily mangled -- original lines 262-265, 267-269 and
# 271-273 are missing (presumably the docstring, the source of `charset`,
# the per-line loop header, and the fallback that populates `out` when
# encoding fails).  Visible fragments only establish: split s on newlines,
# try line.encode(charset) per line, rejoin into one string.
261 def encode_or_else(s
):
266 lines
= s
.split(u
"\n")
270 val
= line
.encode(charset
)
274 return u
"\n".join(out
)
# NOTE(review): heavily mangled __main__ driver -- original lines 278-280,
# 282, 285-289, 291-301 and 303 are missing (presumably the sys/locale
# imports or charset detection, the try headers matched by the visible
# except clause, sys.exit calls, and the Parser construction binding `p`).
# Comments below describe only the visible fragments.
277 if __name__
== "__main__":
# Split argv into the dictionary filename and the characters to look up.
281 dfname
, args
= sys
.argv
[1], sys
.argv
[2:]
283 except (IndexError, AssertionError):
284 print _(u
"Syntax: %s <dict_file> <character [...]>") % sys
.argv
[0]
290 print _(u
"Could not create Kanjidic2Parser: %s") % unicode(e
)
302 print u
"%d characters found" % len(p
.characters
)
# Decode the query from the local charset, search, and print each entry
# (encode_or_else degrades gracefully when stdout can't encode a line).
304 for i
, kanji
in enumerate(p
.search("".join(args
).decode(charset
))):
305 kstr
= encode_or_else(unicode(kanji
))
306 print _(u
"Entry %d:\n%s\n") % (i
+1, kstr
)