Directory restructuring. No tests done yet.
[jben2_gui.git] / jben / parsers / edict.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
31 """A parser for EDICT.
33 This version is intended to be a more-or-less complete EDICT parser,
34 with the exception of not doing special parsing for loan word tags.
35 If you require special handling for those, then you probably ought to
36 be using JMdict instead.
38 """

import re, gzip, gettext
gettext.install('pyjben', unicode=True)

# Below are the information codes, sorted more-or-less as they appear
# on http://www.csse.monash.edu.au/~jwb/edict_doc.html, but somewhat
# more up to date. These sets are accurate as of 2009-Jul-17.

# Part of speech codes
valid_pos_codes = set((
    "adj-i", "adj-na", "adj-no", "adj-pn", "adj-t", "adj-f", "adj",
    "adv", "adv-to", "aux", "aux-v", "aux-adj", "conj", "ctr", "exp",
    "int", "iv", "n", "n-adv", "n-suf", "n-pref", "n-t", "num", "pn",
    "pref", "prt", "suf", "v1", "v2a-s", "v4h", "v4r", "v5", "v5aru",
    "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s",
    "v5t", "v5u", "v5u-s", "v5uru", "v5z", "vz", "vi", "vk", "vn",
    "vr", "vs", "vs-s", "vs-i", "vt",
    ))

# Field of application codes
valid_foa_codes = set((
    "Buddh", "MA", "comp", "food", "geom", "ling", "math", "mil",
    "physics", "chem"
    ))

# Miscellaneous marking codes
valid_misc_codes = set((
    "X", "abbr", "arch", "ateji", "chn", "col", "derog", "eK", "ek",
    "fam", "fem", "gikun", "hon", "hum", "iK", "id", "ik", "io",
    "m-sl", "male", "male-sl", "oK", "obs", "obsc", "ok", "on-mim",
    "poet", "pol", "rare", "sens", "sl", "uK", "uk", "vulg"
    ))

# Dialect codes
valid_dialect_codes = set((
    "kyb", "osb", "ksb", "ktb", "tsb", "thb", "tsug", "kyu", "rkb",
    "nab"
    ))

# Grab all ()'s before a gloss
all_paren_match = re.compile(r"^(\([^)]*\)[ ]*)+")
# Grab the first () data entry, with group(1) set to the contents
paren_match = re.compile(ur"^[ ]*\(([^)]+)\)[ ]*")
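
# For example, given the gloss u"(n,vs) (1) investigation",
# all_paren_match captures the whole leading block u"(n,vs) (1) ",
# and repeated paren_match calls then peel off u"n,vs" and u"1".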

def info_field_valid(i_field):
    """Returns whether a given info code is valid."""

    # Validity is a sticky issue since there are so many kinds of
    # fields:
    #
    # - Sense markers (1, 2, 3, ...)
    # - Part of speech markers (n, adv, v5r)
    # - Field of application markers (comp, math, mil)
    # - Miscellaneous meanings (X, abbr, arch, ateji, ...)
    # - Word priority (P)
    # ? Okurigana variants (Maybe this is JMdict only?)
    # - Loan words, a.k.a. gairaigo
    # - Regional Japanese words (Kansai-ben, etc.)
    #
    # Thankfully, this function should be reusable in the edict2 parser...

    if i_field in valid_pos_codes: return True
    if i_field == "P": return True
    if i_field in valid_misc_codes: return True
    if i_field in valid_foa_codes: return True
    # Dialect codes are marked with a trailing colon (e.g. "ksb:"),
    # so drop the last character before checking.
    if i_field[:-1] in valid_dialect_codes: return True
    # Check for sense markers: (1), (2), etc.
    try:
        int(i_field)
        return True
    except ValueError:
        return False
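
# A few illustrative checks (doctest-style; not run automatically):
#
#   >>> info_field_valid(u"v5r")   # part of speech
#   True
#   >>> info_field_valid(u"ksb:")  # dialect code with trailing colon
#   True
#   >>> info_field_valid(u"2")     # sense marker
#   True
#   >>> info_field_valid(u"bogus")
#   False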

class EdictEntry(object):

    def __init__(self, raw_entry, quick_parsing=True):
        # Japanese - note, if only a kana reading is present, it's
        # stored as "japanese", and furigana is left as None.
        self.japanese = None
        self.furigana = None
        # Native language glosses
        self.glosses = []
        # Info fields should be inserted here as "tags".
        self.tags = set()
        # Currently unhandled stuff goes here...
        self.unparsed = []

        # Most people don't need ultra-fancy parsing and can happily
        # take glosses with keywords stuck in them. In that case,
        # they can save processing time by using parse_entry_quick.
        # However, "J-Ben"-style entry sorting may then not work
        # exactly as expected, because tags remain embedded at the
        # beginning or end of glosses.

        # Note: Even with full parsing, a few entries carry tags at
        # the end of their glosses, so they will not successfully
        # match on an "ends with" search.
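
        # To illustrate the difference on a made-up entry: for
        # u"熱い /(adj-i) hot (to the touch)/", quick parsing keeps the
        # gloss as u"(adj-i) hot (to the touch)", while full parsing
        # strips the tag, yielding gloss u"hot (to the touch)" and
        # tags set([u"adj-i"]).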
        if quick_parsing:
            self.parse_entry_quick(raw_entry)
        else:
            self.parse_entry(raw_entry)

    def parse_entry(self, raw_entry):
        if not raw_entry:
            return None

        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]

        #if self.furigana:
        #    print "JAPANESE: %s, FURIGANA: %s" % (self.japanese, self.furigana)
        #else:
        #    print "JAPANESE: %s" % self.japanese

        # Get native language data
        glosses = ndata.split(u'/')
        for gloss in glosses:
            # For each gloss, we need to check for ()'s at the beginning.
            # Multiple such ()'s may be present.
            # The actual gloss does not begin until the last set (or
            # an unhandled one) is encountered.

            if not gloss: continue
            #print "Unparsed gloss: [%s]" % gloss

            info = None
            m = all_paren_match.match(gloss)
            if m:
                info = m.group(0)
                gloss = gloss[m.span()[1]:]
                #print "Info field captured: [%s]" % info

            while info:
                m = paren_match.match(info)
                if not m: break  # Shouldn't ever happen...
                i_field = m.group(1)
                #print "INFO FIELD FOUND:", i_field
                i_fields = i_field.split(u',')

                # Check that all i_fields are valid
                ok = all(map(info_field_valid, i_fields))

                if not ok:
                    # Unhandled (...) block: restore it to the gloss.
                    #print "INVALID INFO FIELD FOUND, REVERTING"
                    #print "INFO WAS %s, GLOSS WAS %s" % (info, gloss)
                    gloss = info + gloss
                    #print "RESTORED GLOSS:", gloss
                    break

                for tag in i_fields:
                    # rstrip handles "ksb:" and other dialect codes
                    self.tags.add(tag.rstrip(':'))
                next_i = m.span()[1]
                info = info[next_i:]

            #print "APPENDING GLOSS:", gloss
            self.glosses.append(gloss)
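
    # For example, parsing the (illustrative) line
    # u"食べる [たべる] /(v1,vt) to eat/(P)/" with parse_entry yields
    # japanese == u"食べる", furigana == u"たべる", tags containing
    # u"v1", u"vt" and u"P", and the leading "(...)" blocks stripped
    # from each gloss.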

    def parse_entry_quick(self, raw_entry):
        if not raw_entry:
            return None

        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]

        # Get native language data
        self.glosses = ndata.split(u'/')

    # EDICT FORMAT:
    #
    # KANJI [KANA] /(general information) gloss/gloss/.../
    # or
    # KANA /(general information) gloss/gloss/.../
    #
    # Where there are multiple senses, these are indicated by (1), (2),
    # etc. before the first gloss in each sense. As this format only
    # allows a single kanji headword and reading, entries are generated
    # for each possible headword/reading combination. As the format
    # restricts Japanese characters to the kanji and kana fields, any
    # cross-reference data and other informational fields are omitted.
    #
    # EDICT2 FORMAT:
    #
    # KANJI-1;KANJI-2 [KANA-1;KANA-2] /(general information) (see xxxx) gloss/gloss/.../
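    #
    # An illustrative (made-up) edict2-style line with multiple
    # headwords and readings:
    #
    #   引く;曳く [ひく] /(v5k,vt) (1) to pull/(2) to draw (a line)/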

    def to_string(self, **kwargs):
        if self.furigana:
            ja = _(u"%s [%s]") % (self.japanese, self.furigana)
        else:
            ja = self.japanese
        native = _(u"; ").join(self.glosses)
        return _(u"%s: %s") % (ja, native)
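
    # For the illustrative 食べる entry above, this renders roughly as
    # u"食べる [たべる]: (v1,vt) to eat; (P); " under the default quick
    # parsing (tags stay embedded in the glosses, and the trailing
    # empty gloss adds a final separator).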

    def __unicode__(self):
        """Dummy string dumper"""
        return unicode(self.__repr__())

class EdictParser(object):

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}

    def search(self, query):
        """Returns a list of entries matching the query."""
        results = []

        def proc_entry(entry):
            # Keep the entry if the query appears in the headword
            # or in any gloss.
            if query in entry.japanese:
                results.append(entry)
            else:
                for gloss in entry.glosses:
                    if query in gloss:
                        results.append(entry)
                        break

        if self.use_cache and self.cache:
            # Read from cache
            for entry in self.cache.itervalues():
                proc_entry(entry)
        else:
            # Read from file
            if self.filename.endswith(".gz"):
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            # Drop blank lines and comment lines
            lines = [line for line in lines if line and (line[0] != u"#")]

            for line in lines:
                entry = EdictEntry(line)
                if self.use_cache:
                    self.cache[entry.japanese] = entry
                proc_entry(entry)

        # Very simple sorting of results: entries with a (P) priority
        # marker come first. (Requires that (P) is left in glosses...)
        common = []
        other = []

        for item in results:
            is_common = False
            for gloss in item.glosses:
                if u'(P)' in gloss:
                    is_common = True
                    break
            if is_common:
                common.append(item)
            else:
                other.append(item)

        results = common
        results.extend(other)

        # Return results
        return results
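
# A minimal usage sketch (the file name and query are illustrative;
# a real EDICT file is required for this to run):
#
#   parser = EdictParser("/path/to/edict.gz")
#   for entry in parser.search(u"食べる"):
#       print entry.to_string()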

if __name__ == "__main__":
    import sys, os

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        sys.exit(-1)
    try:
        kp = EdictParser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create EdictParser: %s") % unicode(e)
        sys.exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a search query.")
        sys.exit(-1)

    # Decode the command-line query using the likely terminal charset
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d: %s") % (i+1, entry.to_string())