2 # -*- coding: utf-8 -*-
5 gettext
.install('pyjben', unicode=True)
7 # Copied from J-Ben 1.x and modified using Gnome Character Map's
8 # "Unicode Block" information.
9 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
12 # 3040..309F; Hiragana
14 return o
>= 0x3040 and o
<= 0x309F
17 # 30A0..30FF; Katakana
18 # 31F0..31FF; Katakana Phonetic Extensions (Not currently used in J-Ben)
20 return o
>= 0x30A0 and o
<= 0x30FF
23 return is_hiragana(uc
) or is_katakana(uc
)
26 class KanjidicEntry(object):
43 # Info of low importance for most target users
45 self
.radical_c
= None # "Classic" KangXi Zidian radical
50 # "Query codes": Pattern-based lookup
51 # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
52 # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
59 # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn, Lnnnn, Onnnn
60 # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT
63 # Dictionary-related metadata
69 def __unicode__(self
):
70 """Dummy string dumper"""
72 for l
in [self
.kunyomi
, self
.onyomi
, self
.nanori
, self
.meanings
]:
75 strs
.insert(3, self
.radname
)
77 return u
", ".join(strs
)
79 class ParserState(object):
83 class KanjidicParser(object):
85 def __init__(self
, filename
, encoding
="EUC-JP"):
86 f
= open(filename
, "rb")
89 data
= data
.decode(encoding
)
90 self
.data
= data
.splitlines()
94 while self
.data
and (not line
or line
[0] == u
"#"):
95 line
= self
.data
.pop(0).strip()
96 return self
.parse_line(line
)
99 def _parse_japanese(self
, entry
, state
, data
):
100 if not state
.t_class
:
101 # Check hiragana/katakana
104 entry
.kunyomi
.append(data
)
107 entry
.onyomi
.append(data
)
109 elif state
.t_class
== 1:
110 entry
.nanori
.append(data
)
111 elif state
.t_class
== 2:
114 def _parse_info(self
, entry
, state
, data
):
118 # Unicode value - we already store the literal as unicode, so let's
119 # use this as our encoding sanity check!
120 assert ord(entry
.literal
) == int(data
[1:], 16), \
121 "Encoding error detected"
123 entry
.radical
= int(data
[1:])
125 entry
.radical_c
= int(data
[1:])
127 entry
.freq
= int(data
[1:])
129 entry
.grade
= int(data
[1:])
131 entry
.jlpt
= int(data
[1:])
134 if not entry
.strokes
:
137 entry
.strokes_alt
.append(i
)
139 entry
.korean
.append(data
[1:])
141 entry
.pinyin
.append(data
[1:])
143 entry
.xref
.append(data
[1:])
145 entry
.misclass
.append(data
[1:])
147 state
.t_class
= int(data
[1:])
148 # Below this point is dictionary/query codes.
149 # Much of this is copied and modified from J-Ben 1's source code.
151 # New Japanese-English Character Dictionary (Halpern)
152 entry
.dcodes
["halpern_njecd"] = data
[1:]
154 # Modern Reader's Japanese-English Character Dictionary (Nelson)
155 entry
.dcodes
["nelson_c"] = data
[1:]
157 # The New Nelson's Japanese-English Character Dictionary
158 entry
.dcodes
["nelson_n"] = data
[1:]
161 # Thanks to changes in permissible SKIP code usage (change to
162 # Creative Commons licensing in January 2008), we can now use
163 # this without problems.
164 entry
.skip
.append(data
[1:]);
165 elif c
== 'I': # Spahn/Hadamitzky dictionaries
167 # Kanji & Kana (Spahn, Hadamitzky)
168 entry
.dcodes
["sh_kk"] = data
[2:]
170 # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
171 entry
.sh_desc
= data
[1:]
177 # Morohashi Daikanwajiten Index
178 #entry.dcodes["moro"].insert(0,"] ps->substr(2));
181 # Morohashi Daikanwajiten Volume/Page
182 #entry.dcodes["moro"] \
183 # .append(1, '/').append(ps->substr(2));
186 # A Guide to Remembering Japanese Characters (Henshall)
187 entry
.dcodes
["henshall"] = data
[1:]
189 # Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage")
190 entry
.dcodes
["gakken"] = data
[1:]
192 # Remembering the Kanji (Heisig)
193 entry
.dcodes
["heisig"] = data
[1:]
195 # Japanese Names (O'Neill)
196 entry
.dcodes
["oneill_names"] = data
[1:]
200 # Japanese for Busy People (AJLT)
201 entry
.dcodes
["busy_people"] = data
[2:]
203 # The Kanji Way to Japanese Language Power (Crowley)
204 entry
.dcodes
["crowley"] = int(data
[2:])
206 # Japanese Kanji Flashcards (White Rabbit Press)
207 entry
.dcodes
["jf_cards"] = int(data
[2:])
209 # Kodansha Compact Kanji Guide
210 entry
.dcodes
["kodansha_compact"] = int(data
[2:])
212 # A Guide To Reading and Writing Japanese (Henshall)
213 entry
.dcodes
["henshall3"] = int(data
[2:])
215 # Kanji in Context (Nishiguchi and Kono)
216 entry
.dcodes
["kanji_in_context"] = int(data
[2:])
218 # Kodansha Kanji Learner's Dictionary (Halpern)
219 entry
.dcodes
["halpern_kkld"] = int(data
[2:])
221 # Essential Kanji (O'Neill)
222 entry
.dcodes
["oneill_kk"] = int(data
[2:])
224 # Query Code: 2001 Kanji (De Roo)
225 entry
.deroo
= int(data
[2:])
227 # A Guide to Reading and Writing Japanese (Sakade)
228 entry
.dcodes
["sakade"] = int(data
[2:])
230 # Tuttle Kanji Cards (Kask)
231 entry
.dcodes
["tutt_cards"] = int(data
[2:])
233 # Yves Maniette's French adaptation of Heisig
234 entry
.dcodes
["maniette"] = int(data
[2:])
236 entry
.unparsed
.append(data
)
238 entry
.unparsed
.append(data
)
240 entry
.unparsed
.append(data
)
242 def parse_line(self
, line
):
245 entry
= KanjidicEntry()
246 state
= ParserState() # Holds "t class"
248 # First 2 fields are always the same
249 pieces
= line
.split(None, 2)
250 entry
.literal
= pieces
.pop(0)
251 entry
.jis
= int(pieces
.pop(0), 16)
254 # Parse the remainder
256 while si
< len(misc
):
262 if i
> 0xFF or c
in (u
'-', u
'.'):
264 ei
= misc
.find(u
' ', si
+1)
269 self
._parse
_japanese
(entry
, state
, sub
)
272 si
+= 1 # Move si inside of {
273 ei
= misc
.find(u
'}', si
+1)
277 ei
+= 1 # Move ei past }
279 entry
.meanings
.append(sub
)
282 ei
= misc
.find(u
' ', si
+1)
287 self
._parse
_info
(entry
, state
, sub
)
294 if __name__
== "__main__":
297 if len(sys
.argv
) < 2:
298 print _("Please specify a dictionary file.")
301 kp
= KanjidicParser(sys
.argv
[1])
303 print _("Could not create KanjidicParser: %s") % str(e
)
307 entry
= kp
.get_entry()
312 lines
.append(_(u
"[%s] Unparsed: [%s]")
313 % (entry
.literal
, ", ".join(entry
.unparsed
)))
314 print u
"\n".join(lines
)
315 except UnicodeEncodeError, e
:
317 entry
= kp
.get_entry()
319 print _("Warning: could not print %d entries, since they could not be "
320 "properly displayed on your terminal.") % err_count