Font selection added. Preference saving/loading improved.
[jben2_gui.git] / parser_kanjidic.py
blobd41344e73bdd21cb34633c7c218210fda797c107
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 import gettext
5 gettext.install('pyjben', unicode=True)
7 # Copied from J-Ben 1.x and modified using Gnome Character Map's
8 # "Unicode Block" information.
9 # Verified against http://unicode.org/Public/UNIDATA/Blocks.txt.
def is_hiragana(uc):
    """Return True if the single character *uc* lies in the Hiragana block.

    The Unicode Hiragana block spans U+3040..U+309F (both ends inclusive).
    """
    return 0x3040 <= ord(uc) <= 0x309F
def is_katakana(uc):
    """Return True if the single character *uc* lies in the Katakana block.

    Only the main Katakana block, U+30A0..U+30FF (inclusive), is accepted.
    """
    # 31F0..31FF (Katakana Phonetic Extensions) is intentionally left out:
    # it is not currently used in J-Ben.
    return 0x30A0 <= ord(uc) <= 0x30FF
def is_furigana(uc):
    """Return True if *uc* is either a hiragana or a katakana character."""
    if is_hiragana(uc):
        return True
    return is_katakana(uc)
class KanjidicEntry(object):
    """Container for the data parsed from one KANJIDIC line.

    Every attribute is created in ``__init__`` so that a freshly built entry
    can be inspected safely before (or without) being filled in by a parser.
    Scalar attributes start as None, multi-valued ones as empty lists, and
    ``dcodes`` as an empty dict.
    """

    def __init__(self):
        # Key info
        self.literal = None
        # JIS code as an int; KanjidicParser.parse_line fills this in from
        # the hexadecimal second field of the line.
        self.jis = None
        self.meanings = []
        self.kunyomi = []
        self.onyomi = []
        self.nanori = []

        # Secondary info
        self.strokes = None
        self.strokes_alt = []
        self.freq = None
        self.grade = None
        self.jlpt = None

        # Info of low importance for most target users
        self.radical = None
        self.radical_c = None  # "Classic" KangXi Zidian radical
        self.radname = None
        self.pinyin = []
        self.korean = []

        # "Query codes": Pattern-based lookup
        # Includes SKIP, DeRoo, Spahn/Hadamitzky, and Four Corners systems
        # Codes: P, DRnnnn, Inxnn.n, Qnnnn.n
        self.skip = []
        self.deroo = None
        self.sh_desc = None
        self.fc = None

        # Dictionary codes
        # Non-D codes: H, N, V, INnnnn, MNnnnnnnn/MPnn.nnnn, Ennnn, Knnnn,
        #              Lnnnn, Onnnn
        # D codes: DB, DC, DF, DG, DH, DJ, DK, DM, DO, DR, DS, DT
        self.dcodes = {}

        # Dictionary-related metadata
        self.xref = []
        self.misclass = []

        # Fields the parser could not interpret, kept verbatim.
        self.unparsed = []

    def __unicode__(self):
        """Dummy string dumper.

        Returns the literal followed by the kun'yomi, on'yomi, nanori and
        meanings, joined with ", ".  An entry whose literal has not been set
        yet is dumped without it instead of raising TypeError.
        """
        strs = []
        if self.literal is not None:
            strs.append(self.literal)
        for values in (self.kunyomi, self.onyomi, self.nanori, self.meanings):
            strs.extend(values)
        if self.radname:
            # The radical name always goes to index 3 of the flat list,
            # whatever the lengths of the reading lists are.
            strs.insert(3, self.radname)
        return u", ".join(strs)
class ParserState(object):
    """Mutable state carried along while the fields of one line are parsed.

    ``t_class`` starts at 0 and is overwritten with the number of each ``T``
    info field encountered; it selects where the Japanese tokens that follow
    are stored.
    """

    def __init__(self):
        self.t_class = 0
class KanjidicParser(object):
    """Parser for a KANJIDIC-style dictionary file.

    The whole file is read and decoded by the constructor and kept as a list
    of lines in ``self.data``.  ``get_entry()`` consumes that list one entry
    at a time; ``parse_line()`` turns a single line into a KanjidicEntry.
    """

    # Single-letter info codes stored as an int attribute of the entry.
    _INT_FIELDS = {
        'B': "radical",
        'C': "radical_c",
        'F': "freq",
        'G': "grade",
        'J': "jlpt",
    }

    # Single-letter info codes whose text is appended to a list attribute.
    _LIST_FIELDS = {
        'W': "korean",
        'Y': "pinyin",
        'X': "xref",
        'Z': "misclass",
        # SKIP codes.  Thanks to changes in permissible SKIP code usage
        # (change to Creative Commons licensing in January 2008), we can now
        # use this without problems.
        'P': "skip",
    }

    # Single-letter dictionary codes -> key in entry.dcodes (kept as text).
    _DICT_CODES = {
        # New Japanese-English Character Dictionary (Halpern)
        'H': "halpern_njecd",
        # Modern Reader's Japanese-English Character Dictionary (Nelson)
        'N': "nelson_c",
        # The New Nelson's Japanese-English Character Dictionary
        'V': "nelson_n",
        # A Guide to Remembering Japanese Characters (Henshall)
        'E': "henshall",
        # Gakken Kanji Dictionary ("A New Dictionary of Kanji Usage")
        'K': "gakken",
        # Remembering the Kanji (Heisig)
        'L': "heisig",
        # Japanese Names (O'Neill)
        'O': "oneill_names",
    }

    # Second letter of a "D" code -> key in entry.dcodes (stored as int).
    _D_INT_CODES = {
        # The Kanji Way to Japanese Language Power (Crowley)
        'C': "crowley",
        # Japanese Kanji Flashcards (White Rabbit Press)
        'F': "jf_cards",
        # Kodansha Compact Kanji Guide
        'G': "kodansha_compact",
        # A Guide To Reading and Writing Japanese (Henshall)
        'H': "henshall3",
        # Kanji in Context (Nishiguchi and Kono)
        'J': "kanji_in_context",
        # Kodansha Kanji Learner's Dictionary (Halpern)
        'K': "halpern_kkld",
        # Essential Kanji (O'Neill)
        'O': "oneill_kk",
        # A Guide to Reading and Writing Japanese (Sakade)
        'S': "sakade",
        # Tuttle Kanji Cards (Kask)
        'T': "tutt_cards",
        # Yves Maniette's French adaption of Heisig
        'M': "maniette",
    }

    def __init__(self, filename, encoding="EUC-JP"):
        """Read *filename*, decode it with *encoding* and split it into lines.

        Errors from opening, reading or decoding the file propagate to the
        caller.
        """
        # The context manager closes the file even if read() raises.
        with open(filename, "rb") as f:
            raw = f.read()
        self.data = raw.decode(encoding).splitlines()

    def get_entry(self):
        """Return the next entry from ``self.data``, or None when exhausted.

        Blank lines and lines starting with "#" are skipped.  Every line
        looked at, including the one that is parsed, is removed from
        ``self.data``.
        """
        while self.data:
            line = self.data.pop(0).strip()
            # Only real entry lines reach parse_line(); a comment that
            # happens to be the last line must not be parsed as an entry.
            if line and line[0] != u"#":
                return self.parse_line(line)
        return None

    def _parse_japanese(self, entry, state, data):
        """Store the Japanese token *data* according to ``state.t_class``.

        With t_class 0 the token is a kun'yomi if its first kana character is
        hiragana and an on'yomi if it is katakana.  t_class 1 stores a nanori
        reading and t_class 2 the radical name.  Tokens that fit none of
        these are recorded in ``entry.unparsed`` rather than being dropped.
        """
        if state.t_class == 1:
            entry.nanori.append(data)
        elif state.t_class == 2:
            entry.radname = data
        elif not state.t_class:
            # Check hiragana/katakana: the first kana found decides.
            for c in data:
                if is_hiragana(c):
                    entry.kunyomi.append(data)
                    return
                if is_katakana(c):
                    entry.onyomi.append(data)
                    return
            entry.unparsed.append(data)
        else:
            entry.unparsed.append(data)

    def _parse_info(self, entry, state, data):
        """Interpret one info field and store it on *entry* / *state*.

        *data* is the complete field including its leading code letter(s).
        Fields that are unknown, malformed, or fail the encoding sanity
        check are appended unchanged to ``entry.unparsed``.
        """
        try:
            handled = self._store_info(entry, state, data)
        except (IndexError, TypeError, ValueError):
            # IndexError: field too short; ValueError: bad number;
            # TypeError: ord() on a literal that is not a single character.
            handled = False
        if not handled:
            entry.unparsed.append(data)

    def _store_info(self, entry, state, data):
        """Store the info field *data*; return False if it is not accepted."""
        code, value = data[0], data[1:]
        if code in self._INT_FIELDS:
            setattr(entry, self._INT_FIELDS[code], int(value))
        elif code in self._LIST_FIELDS:
            getattr(entry, self._LIST_FIELDS[code]).append(value)
        elif code in self._DICT_CODES:
            entry.dcodes[self._DICT_CODES[code]] = value
        elif code == 'U':
            # Unicode value - we already store the literal as unicode, so
            # let's use this as our encoding sanity check!  (An explicit
            # comparison, so it is not stripped when running with -O.)
            if ord(entry.literal) != int(value, 16):
                return False
        elif code == 'S':
            strokes = int(value)
            # The first stroke count is the main one; later ones are
            # alternative counts.
            if entry.strokes is None:
                entry.strokes = strokes
            else:
                entry.strokes_alt.append(strokes)
        elif code == 'T':
            state.t_class = int(value)
        # Below this point is dictionary/query codes.
        # Much of this is copied and modified from J-Ben 1's source code.
        elif code == 'I':  # Spahn/Hadamitzky dictionaries
            if data[1] == 'N':
                # Kanji & Kana (Spahn, Hadamitzky)
                entry.dcodes["sh_kk"] = data[2:]
            else:
                # Query Code: Kanji Dictionary (Spahn, Hadamitzky)
                entry.sh_desc = value
        elif code == 'Q':
            # Four Corner code
            entry.fc = value
        elif code == 'M':
            # MN: Morohashi Daikanwajiten Index
            # MP: Morohashi Daikanwajiten Volume/Page
            # TODO: both are accepted but not stored yet.  The J-Ben 1 code
            # this was taken from appears to have merged them into a single
            # "moro" dictionary code (index, then '/' and volume/page).
            if data[1] not in ('N', 'P'):
                return False
        elif code == 'D':
            return self._store_d_code(entry, data)
        else:
            return False
        return True

    def _store_d_code(self, entry, data):
        """Store a two-letter "D" code; return False if it is unknown."""
        sub, value = data[1], data[2:]
        if sub == 'B':
            # Japanese for Busy People (AJLT)
            entry.dcodes["busy_people"] = value
        elif sub == 'R':
            # Query Code: 2001 Kanji (De Roo)
            entry.deroo = int(value)
        elif sub in self._D_INT_CODES:
            entry.dcodes[self._D_INT_CODES[sub]] = int(value)
        else:
            return False
        return True

    def parse_line(self, line):
        """Parse one dictionary line into a KanjidicEntry.

        Returns None for an empty *line*.  The first two whitespace-separated
        fields are the literal and its hexadecimal JIS code; a line without a
        JIS field raises IndexError and a non-hexadecimal one ValueError.
        Everything after them is split into Japanese tokens, "{...}"
        meanings and info fields.  A line with no such remainder yields an
        entry holding only the literal and the JIS code.
        """
        if not line:
            return None
        entry = KanjidicEntry()
        state = ParserState()  # Holds "t class"

        # First 2 fields are always the same
        pieces = line.split(None, 2)
        entry.literal = pieces[0]
        entry.jis = int(pieces[1], 16)
        misc = pieces[2] if len(pieces) > 2 else u""

        # Parse the remainder
        pos = 0
        end = len(misc)
        while pos < end:
            c = misc[pos]
            if c == u' ':
                pos += 1
                continue

            if c == u'{':
                # Parse Translation: everything up to the closing brace, or
                # to the end of the line if the brace is missing.
                close = misc.find(u'}', pos + 1)
                if close == -1:
                    close = end
                entry.meanings.append(misc[pos + 1:close])
                # Resume right after the brace; a following space, if any,
                # is skipped by the check at the top of the loop.
                pos = close + 1
                continue

            # Japanese tokens and info fields both run up to the next space.
            stop = misc.find(u' ', pos + 1)
            if stop == -1:
                stop = end
            token = misc[pos:stop]
            if ord(c) > 0xFF or c in (u'-', u'.'):
                # Parse Japanese
                self._parse_japanese(entry, state, token)
            else:
                # Parse info field
                self._parse_info(entry, state, token)
            pos = stop + 1

        return entry
if __name__ == "__main__":
    # Command-line check: parse the dictionary named on the command line and
    # report every entry that still has fields the parser did not understand.
    import sys

    if len(sys.argv) < 2:
        print(_("Please specify a dictionary file."))
        sys.exit(-1)
    try:
        kp = KanjidicParser(sys.argv[1])
    except Exception as e:
        # Top-level boundary: report any failure to open/decode the file.
        print(_("Could not create KanjidicParser: %s") % str(e))
        sys.exit(-1)

    err_count = 0
    entry = kp.get_entry()
    while entry:
        if entry.unparsed:
            try:
                print(_(u"[%s] Unparsed: [%s]")
                      % (entry.literal, ", ".join(entry.unparsed)))
            except UnicodeEncodeError:
                # The terminal encoding cannot represent this entry; count
                # it and carry on with the next one.
                err_count += 1
        entry = kp.get_entry()
    if err_count:
        print(_("Warning: could not print %d entries, since they could not be "
                "properly displayed on your terminal.") % err_count)