Directory restructuring. No tests done yet.
[jben2_gui.git] / jben / parsers / edict.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2009, Paul Goins
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:

# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
#   copyright notice, this list of conditions and the following
#   disclaimer in the documentation and/or other materials provided
#   with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
31 """A parser for EDICT.
33 This version is intended to be a more-or-less complete EDICT parser,
34 with the exception of not doing special parsing for loan word tags.
35 If you require special handling for those, then you probably ought to
36 be using JMdict instead.
38 """

import re, gzip, gettext
gettext.install('pyjben', unicode=True)

# Below are the information codes, sorted more-or-less as they appear
# on http://www.csse.monash.edu.au/~jwb/edict_doc.html, but somewhat
# more up to date. These sets are accurate as of 2009-Jul-17.

# Part of speech codes
valid_pos_codes = set((
    "adj-i", "adj-na", "adj-no", "adj-pn", "adj-t", "adj-f", "adj",
    "adv", "adv-to", "aux", "aux-v", "aux-adj", "conj", "ctr", "exp",
    "int", "iv", "n", "n-adv", "n-suf", "n-pref", "n-t", "num", "pn",
    "pref", "prt", "suf", "v1", "v2a-s", "v4h", "v4r", "v5", "v5aru",
    "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s",
    "v5t", "v5u", "v5u-s", "v5uru", "v5z", "vz", "vi", "vk", "vn",
    "vr", "vs", "vs-s", "vs-i", "vt",
    ))

# Field of application codes
valid_foa_codes = set((
    "Buddh", "MA", "comp", "food", "geom", "ling", "math", "mil",
    "physics", "chem"
    ))

# Miscellaneous marking codes
valid_misc_codes = set((
    "X", "abbr", "arch", "ateji", "chn", "col", "derog", "eK", "ek",
    "fam", "fem", "gikun", "hon", "hum", "iK", "id", "ik", "io",
    "m-sl", "male", "male-sl", "oK", "obs", "obsc", "ok", "on-mim",
    "poet", "pol", "rare", "sens", "sl", "uK", "uk", "vulg"
    ))

# Dialect codes
valid_dialect_codes = set((
    "kyb", "osb", "ksb", "ktb", "tsb", "thb", "tsug", "kyu", "rkb",
    "nab"
    ))

# Grab all ()'s before a gloss
all_paren_match = re.compile(r"^(\([^)]*\)[ ]*)+")
# Grab the first () data entry, with group(1) set to the contents
paren_match = re.compile(ur"^[ ]*\(([^)]+)\)[ ]*")
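
# For example, given the gloss u"(n,vs) (1) investigation",
# all_paren_match captures the whole leading block u"(n,vs) (1) ",
# and repeated paren_match calls then peel off u"n,vs" and u"1".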

def info_field_valid(i_field):
    """Returns whether a given info code is valid."""

    # Validity is a sticky issue since there are so many kinds of
    # fields:
    #
    # - Sense markers (1, 2, 3, ...)
    # - Part of speech markers (n, adv, v5r)
    # - Field of application markers (comp, math, mil)
    # - Miscellaneous meanings (X, abbr, arch, ateji, ...)
    # - Word priority (P)
    # ? Okurigana variants (Maybe this is JMdict only?)
    # - Loan words, a.k.a. gairaigo
    # - Regional Japanese words (Kansai-ben, etc.)
    #
    # Thankfully, this function should be reusable in the edict2 parser...

    if i_field in valid_pos_codes: return True
    if i_field == "P": return True
    if i_field in valid_misc_codes: return True
    if i_field in valid_foa_codes: return True
    # Dialect codes are marked with a trailing colon (e.g. "ksb:"),
    # so drop the last character before checking.
    if i_field[:-1] in valid_dialect_codes: return True
    # Check for sense markers: (1), (2), etc.
    try:
        int(i_field)
        return True
    except ValueError:
        return False
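
# A few illustrative checks (doctest-style; not run automatically):
#
#   >>> info_field_valid(u"v5r")   # part of speech
#   True
#   >>> info_field_valid(u"ksb:")  # dialect code with trailing colon
#   True
#   >>> info_field_valid(u"2")     # sense marker
#   True
#   >>> info_field_valid(u"bogus")
#   False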

class EdictEntry(object):

    def __init__(self, raw_entry, quick_parsing=True):
        # Japanese - note, if only a kana reading is present, it's
        # stored as "japanese", and furigana is left as None.
        self.japanese = None
        self.furigana = None
        # Native language glosses
        self.glosses = []
        # Info fields should be inserted here as "tags".
        self.tags = set()
        # Currently unhandled stuff goes here...
        self.unparsed = []

        # Most people don't need ultra-fancy parsing and can happily
        # take glosses with keywords stuck in them. In that case,
        # they can save processing time by using parse_entry_quick.
        # However, "J-Ben"-style entry sorting may then not work
        # exactly as expected, because tags remain embedded at the
        # beginning or end of glosses.

        # Note: Even with full parsing, a few entries carry tags at
        # the end of their glosses, so they will not successfully
        # match on an "ends with" search.
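
        # To illustrate the difference on a made-up entry: for
        # u"熱い /(adj-i) hot (to the touch)/", quick parsing keeps the
        # gloss as u"(adj-i) hot (to the touch)", while full parsing
        # strips the tag, yielding gloss u"hot (to the touch)" and
        # tags set([u"adj-i"]).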
        if quick_parsing:
            self.parse_entry_quick(raw_entry)
        else:
            self.parse_entry(raw_entry)

    def parse_entry(self, raw_entry):
        if not raw_entry:
            return None

        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]

        #if self.furigana:
        #    print "JAPANESE: %s, FURIGANA: %s" % (self.japanese, self.furigana)
        #else:
        #    print "JAPANESE: %s" % self.japanese

        # Get native language data
        glosses = ndata.split(u'/')
        for gloss in glosses:
            # For each gloss, we need to check for ()'s at the beginning.
            # Multiple such ()'s may be present.
            # The actual gloss does not begin until the last set (or
            # an unhandled one) is encountered.

            if not gloss: continue
            #print "Unparsed gloss: [%s]" % gloss

            info = None
            m = all_paren_match.match(gloss)
            if m:
                info = m.group(0)
                gloss = gloss[m.span()[1]:]
                #print "Info field captured: [%s]" % info

            while info:
                m = paren_match.match(info)
                if not m: break  # Shouldn't ever happen...
                i_field = m.group(1)
                #print "INFO FIELD FOUND:", i_field
                i_fields = i_field.split(u',')

                # Check that all i_fields are valid
                ok = all(map(info_field_valid, i_fields))

                if not ok:
                    # Unhandled (...) block: restore it to the gloss.
                    #print "INVALID INFO FIELD FOUND, REVERTING"
                    #print "INFO WAS %s, GLOSS WAS %s" % (info, gloss)
                    gloss = info + gloss
                    #print "RESTORED GLOSS:", gloss
                    break

                for tag in i_fields:
                    # rstrip handles "ksb:" and other dialect codes
                    self.tags.add(tag.rstrip(':'))
                next_i = m.span()[1]
                info = info[next_i:]

            #print "APPENDING GLOSS:", gloss
            self.glosses.append(gloss)
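
    # For example, parsing the (illustrative) line
    # u"食べる [たべる] /(v1,vt) to eat/(P)/" with parse_entry yields
    # japanese == u"食べる", furigana == u"たべる", tags containing
    # u"v1", u"vt" and u"P", and the leading "(...)" blocks stripped
    # from each gloss.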

    def parse_entry_quick(self, raw_entry):
        if not raw_entry:
            return None

        jdata, ndata = raw_entry.split(u'/', 1)

        # Get Japanese
        pieces = jdata.split(u'[', 1)
        self.japanese = pieces[0].strip()
        if len(pieces) > 1:
            # Store furigana without '[]'
            self.furigana = pieces[1].strip()[:-1]

        # Get native language data
        self.glosses = ndata.split(u'/')

    # EDICT FORMAT:
    #
    # KANJI [KANA] /(general information) gloss/gloss/.../
    # or
    # KANA /(general information) gloss/gloss/.../
    #
    # Where there are multiple senses, these are indicated by (1), (2),
    # etc. before the first gloss in each sense. As this format only
    # allows a single kanji headword and reading, entries are generated
    # for each possible headword/reading combination. As the format
    # restricts Japanese characters to the kanji and kana fields, any
    # cross-reference data and other informational fields are omitted.
    #
    # EDICT2 FORMAT:
    #
    # KANJI-1;KANJI-2 [KANA-1;KANA-2] /(general information) (see xxxx) gloss/gloss/.../
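    #
    # An illustrative (made-up) edict2-style line with multiple
    # headwords and readings:
    #
    #   引く;曳く [ひく] /(v5k,vt) (1) to pull/(2) to draw (a line)/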

    def to_string(self, **kwargs):
        if self.furigana:
            ja = _(u"%s [%s]") % (self.japanese, self.furigana)
        else:
            ja = self.japanese
        native = _(u"; ").join(self.glosses)
        return _(u"%s: %s") % (ja, native)
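
    # For the illustrative 食べる entry above, this renders roughly as
    # u"食べる [たべる]: (v1,vt) to eat; (P); " under the default quick
    # parsing (tags stay embedded in the glosses, and the trailing
    # empty gloss adds a final separator).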

    def __unicode__(self):
        """Dummy string dumper"""
        return unicode(self.__repr__())

class EdictParser(object):

    def __init__(self, filename, use_cache=True, encoding="EUC-JP"):
        self.filename = filename
        self.encoding = encoding
        self.use_cache = use_cache
        self.cache = {}

    def search(self, query):
        """Returns a list of entries matching the query."""
        results = []

        def proc_entry(entry):
            # Keep the entry if the query appears in the headword
            # or in any gloss.
            if query in entry.japanese:
                results.append(entry)
            else:
                for gloss in entry.glosses:
                    if query in gloss:
                        results.append(entry)
                        break

        if self.use_cache and self.cache:
            # Read from cache
            for entry in self.cache.itervalues():
                proc_entry(entry)
        else:
            # Read from file
            if self.filename.endswith(".gz"):
                f = gzip.open(self.filename)
            else:
                f = open(self.filename, "rb")
            fdata = f.read()
            f.close()
            fdata = fdata.decode(self.encoding)
            lines = fdata.splitlines()
            # Drop blank lines and comment lines
            lines = [line for line in lines if line and (line[0] != u"#")]

            for line in lines:
                entry = EdictEntry(line)
                if self.use_cache:
                    self.cache[entry.japanese] = entry
                proc_entry(entry)

        # Very simple sorting of results: entries with a (P) priority
        # marker come first. (Requires that (P) is left in glosses...)
        common = []
        other = []

        for item in results:
            is_common = False
            for gloss in item.glosses:
                if u'(P)' in gloss:
                    is_common = True
                    break
            if is_common:
                common.append(item)
            else:
                other.append(item)

        results = common
        results.extend(other)

        # Return results
        return results
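
# A minimal usage sketch (the file name and query are illustrative;
# a real EDICT file is required for this to run):
#
#   parser = EdictParser("/path/to/edict.gz")
#   for entry in parser.search(u"食べる"):
#       print entry.to_string()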

if __name__ == "__main__":
    import sys, os

    if len(sys.argv) < 2:
        print _(u"Please specify a dictionary file.")
        sys.exit(-1)
    try:
        kp = EdictParser(sys.argv[1])
    except Exception, e:
        print _(u"Could not create EdictParser: %s") % unicode(e)
        sys.exit(-1)

    if len(sys.argv) < 3:
        print _(u"Please specify a search query.")
        sys.exit(-1)

    # Decode the command-line query using the likely terminal charset
    if os.name == "nt":
        charset = "cp932"
    else:
        charset = "utf-8"

    for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
        print _(u"Entry %d: %s") % (i+1, entry.to_string())