# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
31 """A parser for EDICT.
33 This version is intended to be a more-or-less complete EDICT parser,
34 with the exception of not doing special parsing for loan word tags.
35 If you require special handling for those, then you probably ought to
36 be using JMdict instead.

import re, gzip, gettext
gettext.install('pyjben', unicode=True)
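
# Typical usage (rough sketch; the file name here is just an
# illustration, not a bundled resource):
#
#     parser = EdictParser("edict.gz")
#     for entry in parser.search(u"食べる"):
#         print entry.to_string()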

# Below follows the information codes sorted more-or-less as they are
# on http://www.csse.monash.edu.au/~jwb/edict_doc.html, however more
# up to date.  These sets are accurate as of 2009-Jul-17.

# Part of speech codes
valid_pos_codes = set((
    "adj-i", "adj-na", "adj-no", "adj-pn", "adj-t", "adj-f", "adj",
    "adv", "adv-to", "aux", "aux-v", "aux-adj", "conj", "ctr", "exp",
    "int", "iv", "n", "n-adv", "n-suf", "n-pref", "n-t", "num", "pn",
    "pref", "prt", "suf", "v1", "v2a-s", "v4h", "v4r", "v5", "v5aru",
    "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s",
    "v5t", "v5u", "v5u-s", "v5uru", "v5z", "vz", "vi", "vk", "vn",
    "vr", "vs", "vs-s", "vs-i", "vt",
    ))

# Field of application codes
valid_foa_codes = set((
    "Buddh", "MA", "comp", "food", "geom", "ling", "math", "mil",
    ))

# Miscellaneous marking codes
valid_misc_codes = set((
    "X", "abbr", "arch", "ateji", "chn", "col", "derog", "eK", "ek",
    "fam", "fem", "gikun", "hon", "hum", "iK", "id", "ik", "io",
    "m-sl", "male", "male-sl", "oK", "obs", "obsc", "ok", "on-mim",
    "poet", "pol", "rare", "sens", "sl", "uK", "uk", "vulg"
    ))

# Dialect codes
valid_dialect_codes = set((
    "kyb", "osb", "ksb", "ktb", "tsb", "thb", "tsug", "kyu", "rkb",
    ))

# Grab all ()'s before a gloss
all_paren_match = re.compile("^(\([^)]*\)[ ]*)+")
# Grab the first () data entry, with group(1) set to the contents
paren_match = re.compile(u"^[ ]*\(([^)]+)\)[ ]*")

def info_field_valid(i_field):
    """Returns whether a given info code is valid."""

    # Validity is a sticky issue since there are so many fields:
    #
    # - Sense markers (1, 2, 3, ...)
    # - Part of speech markers (n, adv, v5r)
    # - Field of application markers (comp, math, mil)
    # - Miscellaneous meanings (X, abbr, arch, ateji, ..........)
    # ? Okurigana variants (Maybe this is JMdict only?)
    # - Loan words, a.k.a. Gairaigo
    # - Regional Japanese words (Kansai-ben, etc.)
    #
    # Thankfully, this function should be reusable in the edict2 parser...
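
    # A few concrete cases (illustrative): u"v5r", u"comp" and u"arch"
    # are accepted via the sets above, u"P" is special-cased, u"ksb:"
    # passes the dialect check (its trailing ':' is ignored), and u"2"
    # counts as a sense marker.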
    if i_field in valid_pos_codes: return True
    if i_field == "P": return True
    if i_field in valid_misc_codes: return True
    if i_field in valid_foa_codes: return True
    if i_field[:-1] in valid_dialect_codes: return True
    # Check for (1), (2), etc.
    try:
        int(i_field)
        return True
    except ValueError:
        return False

class EdictEntry(object):

    def __init__(self, raw_entry, quick_parsing=True):
        # Japanese - note, if only a kana reading is present, it's
        # stored as "japanese", and furigana is left as None.
        self.japanese = None
        self.furigana = None
        # Native language glosses
        self.glosses = []
        # Info fields should be inserted here as "tags".
        self.tags = set()
        # Currently unhandled stuff goes here...
        self.unparsed = []

        # Most people don't need ultra-fancy parsing and can happily
        # take glosses with keywords stuck in them.  In this case,
        # they can save processing time by using parse_entry_quick.
        # However, this will mean that "J-Ben"-style entry sorting may
        # not work exactly as expected because of tags being appended
        # to the beginning or end.
        #
        # Note: Even with full parsing, due to a few entries with tags
        # at the end of their glosses, there are a few entries which
        # will not successfully match on an "ends with" search.
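
        # For instance (made-up entry): given the gloss
        # u"(adj-na,n) healthy", parse_entry_quick keeps the string
        # verbatim in self.glosses, while parse_entry moves "adj-na"
        # and "n" into self.tags and stores just u"healthy".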

        # ENABLE THIS once parse_entry_quick is implemented.
        if quick_parsing:
            self.parse_entry_quick(raw_entry)
        else:
            self.parse_entry(raw_entry)

    def parse_entry(self, raw_entry):
        # Split the Japanese data from the glosses at the first '/'
        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]
            # print "JAPANESE: %s, FURIGANA: %s" % (self.japanese, self.furigana)
        # else:
        #     print "JAPANESE: %s" % self.japanese

        # Get native language data
        glosses = ndata.split(u'/')
        for gloss in glosses:
            # For each gloss, we need to check for ()'s at the beginning.
            # Multiple such ()'s may be present.
            # The actual gloss does not begin until the last set (or
            # an unhandled one) is encountered.

            if not gloss: continue
            #print "Unparsed gloss: [%s]" % gloss

            # Grab any leading () block off of the gloss
            info = None
            m = all_paren_match.match(gloss)
            if m:
                info = m.group(0)
                gloss_start = m.span()[1]
                gloss = gloss[gloss_start:]
                #print "Info field captured: [%s]" % info

            # Peel () groups off the info block one at a time
            while info:
                m = paren_match.match(info)
                #if not m: break # Shouldn't ever happen...
                i_field = m.group(1)
                #print "INFO FIELD FOUND:", i_field
                i_fields = i_field.split(u',')

                # Check that all i_fields are valid
                bools = map(info_field_valid, i_fields)
                ok = reduce(lambda x, y: x and y, bools)
                if not ok:
                    # Unrecognized () data: restore it onto the gloss
                    #print "INVALID INFO FIELD FOUND, REVERTING"
                    #print "INFO WAS %s, GLOSS WAS %s" % (info, gloss)
                    gloss = info + gloss
                    #print "RESTORED GLOSS:", gloss
                    break

                for tag in i_fields:
                    self.tags.add(tag.rstrip(':')) # Handles "ksb:"
                    #print "INFO FIELD FOUND:", tag
                # Advance past the () group just consumed
                info = info[m.span()[1]:]

            if gloss:
                #print "APPENDING GLOSS:", gloss
                self.glosses.append(gloss)

    def parse_entry_quick(self, raw_entry):
        # Split the Japanese data from the glosses at the first '/'
        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]

        # Get native language data
        self.glosses = ndata.split(u'/')

    # EDICT format:
    #
    #   KANJI [KANA] /(general information) gloss/gloss/.../
    #   KANA /(general information) gloss/gloss/.../
    #
    # Where there are multiple senses, these are indicated by (1), (2),
    # etc. before the first gloss in each sense.  As this format only
    # allows a single kanji headword and reading, entries are generated
    # for each possible headword/reading combination.  As the format
    # restricts Japanese characters to the kanji and kana fields, any
    # cross-reference data and other informational fields are omitted.
    #
    # EDICT2 format:
    #
    #   KANJI-1;KANJI-2 [KANA-1;KANA-2] /(general information) (see xxxx) gloss/gloss/.../
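
    # An illustrative (made-up) line in the EDICT format above:
    #
    #   食べる [たべる] /(v1,vt) to eat/(P)/
    #
    # The text before the first '/' supplies self.japanese and
    # self.furigana; everything after it is split on '/' into glosses.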

    # # First 2 fields are always the same
    # pieces = raw_entry.split(None, 2)
    # misc = pieces.pop()
    # self.jis = int(pieces.pop(), 16)
    # self.literal = pieces.pop()
    #
    # # Parse the remainder
    # while si < len(misc):
    # if i > 0xFF or c in (u'-', u'.'):
    # ei = misc.find(u' ', si+1)
    # self._parse_japanese(state, sub)
    #
    # # Parse Translation
    # si += 1 # Move si inside of {
    # ei = misc.find(u'}', si+1)
    # ei += 1 # Move ei past }
    # self.meanings.append(sub)
    #
    # ei = misc.find(u' ', si+1)
    # self._parse_info(state, sub)

    def to_string(self, **kwargs):
        if self.furigana:
            ja = _(u"%s [%s]") % (self.japanese, self.furigana)
        else:
            ja = self.japanese
        native = _(u"; ").join(self.glosses)
        return _(u"%s: %s") % (ja, native)

    def __unicode__(self):
        """Dummy string dumper"""
        return unicode(self.__repr__())

class EdictParser(object):

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}

    def search(self, query):
        """Returns a list of entries matching the query."""
        results = []

        def proc_entry(entry):
            if query in entry.japanese:
                results.append(entry)
                return
            for gloss in entry.glosses:
                if query in gloss:
                    results.append(entry)
                    return

        if self.use_cache and self.cache:
            # Search the previously parsed entries
            for k, entry in self.cache.iteritems():
                proc_entry(entry)
        else:
            # Read and parse the dictionary file
            if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            lines = [line for line in lines
                     if line and (line[0] != u"#")]
            for line in lines:
                entry = EdictEntry(line)
                if self.use_cache:
                    self.cache[entry.japanese] = entry
                proc_entry(entry)

        # Very simple sorting of results.
        # (Requires that (P) is left in glosses...)
        common = []
        other = []
        for item in results:
            has_p = False
            for gloss in item.glosses:
                if u"(P)" in gloss:
                    has_p = True
                    break
            if has_p:
                common.append(item)
            else:
                other.append(item)
        results = common
        results.extend(other)

        return results

if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        sys.exit(-1)

    try:
        kp = EdictParser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create EdictParser: %s") % unicode(e)
        sys.exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a search query.")
        sys.exit(-1)

    # Assumption: command line arguments are UTF-8 encoded; adjust the
    # charset below if your terminal uses something else (e.g. cp932).
    charset = "utf-8"

    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d: %s") % (i+1, entry.to_string())