2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A parser for KANJIDIC2."""
33 from __future__
import absolute_import
36 from xml
.etree
.cElementTree
import ElementTree
37 gettext
.install('pyjben', unicode=True)
39 from .kanjidic_common \
40 import jstring_convert
, kanjidic2_key_to_str
, qcode_to_desc
def jis_kuten_to_hex(kuten):
    """Convert a kuten string to its two-byte hexadecimal code.

    ``kuten`` is a string of two decimal numbers separated by a hyphen
    (for example ``u"16-1"``).  0x20 is added to each number; the first
    becomes the high byte and the second the low byte of the returned
    integer.

    Raises ValueError if a piece is not a decimal integer, and IndexError
    if fewer than two pieces are present.
    """
    # Build a real list: the pieces are indexed below, which would fail
    # on interpreters where map() returns a lazy iterator.  The leftover
    # "DEBUG:" print that used to sit here wrote to stdout on every call
    # and has been dropped.
    pieces = [int(piece) for piece in kuten.split(u'-')]
    return ((pieces[0] + 0x20) << 8) + (pieces[1] + 0x20)
class Kanjidic2Node(object):
    """Wrapper around the XML element describing a single character.

    The element is kept in ``self.xml`` and queried on demand by the
    ``_get_*`` accessors; the character itself is cached in
    ``self.literal`` at construction time.
    """

    def __init__(self, xml_node):
        # self.xml must be set first: _get_literal() reads it.
        self.xml = xml_node
        self.literal = self._get_literal()

    def _get_literal(self):
        """Return the stripped text of the ``literal`` child element."""
        literal = self.xml.find("literal").text.strip()
        assert len(literal) == 1, "Literal has more than one character!"
        return literal

    def _get_int(self, path):
        """Helper: integer text of the first element on path, or None."""
        node = self.xml.find(path)
        if node is None or node.text is None:
            return None
        return int(node.text)

    def _get_grade(self):
        """Returns the ``misc/grade`` value as an int, or None if absent."""
        return self._get_int("misc/grade")

    def _get_freq(self):
        """Returns the ``misc/freq`` value as an int, or None if absent."""
        # By the spec, it seems like multiple freqs are possible??
        # So... let's get all entries and assert.
        nodes = self.xml.findall("misc/freq")
        if not nodes:
            return None
        assert len(nodes) == 1, (
            u"Character %s: Expected 1 freq entry, found %d" %
            (self._get_literal(), len(nodes)))
        return int(nodes[0].text)

    def _get_jlpt(self):
        """Returns the ``misc/jlpt`` value as an int, or None if absent."""
        return self._get_int("misc/jlpt")

    def _get_nanori(self):
        """Returns the list of ``reading_meaning/nanori`` texts.

        The list is empty when the character has no nanori entries, so
        the result can always be joined.
        """
        nodes = self.xml.findall("reading_meaning/nanori")
        return [o.text for o in nodes]

    def _get_attrdict(self, path, attr_name):
        """Helper: stores elements on path in dict, keyed by attribute.

        Each value is the list of element texts, in document order, whose
        ``attr_name`` attribute equals the key.  Elements without the
        attribute are collected under the key ``None``.
        """
        d = {}
        # Single pass; grouping by attribute keeps document order per key.
        for o in self.xml.findall(path):
            d.setdefault(o.attrib.get(attr_name), []).append(o.text)
        return d

    def _get_readings(self):
        """Returns dictionary of reading lists, keyed by type."""
        return self._get_attrdict("reading_meaning/rmgroup/reading", "r_type")

    def _get_meanings(self):
        """Returns dictionary of gloss lists, keyed by language prefix."""
        meaning_d = self._get_attrdict(
            "reading_meaning/rmgroup/meaning", "m_lang")
        if None in meaning_d:
            # Glosses without an m_lang attribute are filed under 'en'.
            # Removing the None key keeps sorted(meaning_d) from having
            # to compare None with strings.
            meaning_d['en'] = meaning_d.pop(None)
        return meaning_d

    def __unicode__(self):
        """Return a multi-line, human-readable summary of the character.

        Reading lines are always emitted; a reading type the character
        does not have yields an empty value instead of a KeyError.  The
        JLPT, Jouyou grade and frequency lines are emitted only when the
        corresponding value is present.
        """
        readings = self._get_readings()
        meanings = self._get_meanings()
        nanori = self._get_nanori()
        grade = self._get_grade()
        jlpt = self._get_jlpt()
        freq = self._get_freq()

        def joined(separator, r_type):
            # Missing reading types join to an empty string.
            return separator.join(readings.get(r_type, ()))

        pieces = []
        pieces.append(u"Literal: %s" % self.literal)

        pieces.append(u"On-yomi: %s" % joined(u"、", 'ja_on'))
        pieces.append(u"Kun-yomi: %s" % joined(u"、", 'ja_kun'))
        pieces.append(u"Nanori: %s" % u"、".join(nanori))

        pieces.append(u"Korean (Hangul): %s" % joined(u", ", 'korean_h'))
        pieces.append(u"Korean (Romanized): %s" % joined(u", ", 'korean_r'))
        pieces.append(u"Pinyin: %s" % joined(u", ", 'pinyin'))

        for lang in sorted(meanings):
            pieces.append(u"Meanings (%s): %s" %
                          (lang, u"; ".join(meanings[lang])))

        if jlpt is not None:
            pieces.append(u"JLPT grade level: %d" % jlpt)
        if grade is not None:
            pieces.append(u"Jouyou grade level: %d" % grade)
        if freq is not None:
            pieces.append(u"Newspaper frequency: %d" % freq)

        pieces.append(u"Unicode value: %04X" % ord(self.literal))

        return u"\n".join(pieces)
class Parser(object):
    """Loads a KANJIDIC2 XML file and looks characters up by literal."""

    def __init__(self, filename, encoding="utf-8"):
        """Initializer for Kanjidic2Parser.

        filename -- path of the dictionary file; a name ending in ".gz"
                    is opened through gzip.
        encoding -- stored on the instance as ``self.encoding``.

        Kanjidic2 is a large, heavy to parse file.  Although it takes a
        large amount of memory, the parsed header and characters are
        retained in memory to increase the speed of subsequent accesses.

        Raises Exception if ``filename`` does not exist.
        """
        if not os.path.exists(filename):
            raise Exception("Dictionary file does not exist.")
        self.filename = filename
        self.encoding = encoding
        # Lookup table built lazily by create_indices().
        self.by_kanji = None
        self.header, self.characters = self.load_via_etree()

    def load_via_etree(self):
        """Parse the file and return ``(header, characters)``.

        ``header`` is the first child element of the document root; every
        following child is wrapped in a Kanjidic2Node.
        """
        if self.filename.endswith(".gz"):
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        # try/finally rather than "with": it closes the handle on every
        # path and does not require the gzip object to be a context manager.
        try:
            et = ElementTree(file=f)
        finally:
            f.close()
        # list(element) replaces the deprecated Element.getchildren().
        nodes = list(et.getroot())
        header, characters = nodes[0], nodes[1:]
        characters = [Kanjidic2Node(char) for char in characters]
        return header, characters

    def get_header(self):
        """Return the header as "tag: value" lines, sorted by tag."""
        d = {}
        for o in list(self.header):
            # text/tail are None for empty elements; treat them as "".
            cdata = "".join((o.text or "", o.tail or "")).strip()
            d[o.tag] = cdata
        return "\n".join("%s: %s" % (k, d[k]) for k in sorted(d))

    def search(self, query):
        """Yield the node for each character of ``query`` that is indexed.

        Characters without an entry are skipped; results follow the order
        of ``query``.
        """
        self.create_indices()
        for u in query:
            c = self.by_kanji.get(u)
            if c is not None:
                yield c

    def create_indices(self):
        """Build ``self.by_kanji`` (literal -> node) on first use."""
        if self.by_kanji is not None:
            return
        # Single-argument print(...) emits the same text whether print is
        # a statement or a function.
        print("Creating indices...")
        index = {}
        for char in self.characters:
            literal = char.xml.find("literal").text.strip()
            index[literal] = char
        self.by_kanji = index
        print("Done creating indices!")
# encode_or_else(s): splits `s` on newlines, calls line.encode(charset) on a
# line, and returns the contents of `out` joined with newlines.
# NOTE(review): the loop over `lines`, the construction of `out`, the
# handling of a failed encode, and the definition of `charset` are not
# visible in this view -- confirm them against the full file.
203 def encode_or_else(s
):
208 lines
= s
.split(u
"\n")
212 val
= line
.encode(charset
)
216 return u
"\n".join(out
)
# Script entry point.  Reads the dictionary file name from sys.argv[1] and
# the remaining arguments as search text, prints a usage line on
# IndexError/AssertionError, reports how many characters were loaded, and
# prints one numbered entry per result of p.search() on the decoded
# arguments, passing each through encode_or_else().
# NOTE(review): the assignments of `p`, `e` and `charset` are on lines not
# visible in this view; presumably `p` is a Parser built from dfname --
# confirm against the full file.
219 if __name__
== "__main__":
223 dfname
, args
= sys
.argv
[1], sys
.argv
[2:]
225 except (IndexError, AssertionError):
226 print _(u
"Syntax: %s <dict_file> <character [...]>") % sys
.argv
[0]
232 print _(u
"Could not create Kanjidic2Parser: %s") % unicode(e
)
244 print "%d characters found" % len(p
.characters
)
246 for i
, kanji
in enumerate(p
.search("".join(args
).decode(charset
))):
247 kstr
= encode_or_else(unicode(kanji
))
248 print _(u
"Entry %d:\n%s\n") % (i
+1, kstr
)