Basic JMdict searches ("starts_with" index) should now work.
[jben2_gui.git] / parsers / jmdict.py
blobad17369dafdf90c3d9a237c3f38f0302d070d0b6
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A basic parser for JMdict."""
33 # Parsing is now handled, but some things are still needed:
35 # 1. Passing in of the parser object, or some config structure, so
36 # that only desired fields are stored. (Otherwise we'll waste more
37 # memory than necessary on our cache, and on JMdict there's a lot
38 # of stuff to store.)
39 # 2. Indices
41 # HOW TO INDEX JMDICT ENTRIES
43 # JMdict is a Japanese-English dictionary file, however in practice it
44 # is used for bidirectional searches.
46 # What we have: big list of entries, japanese readings/kanji as
47 # central entries, glosses as native language entries, multiple
48 # glosses per entry.
50 # Japanese indexing, *basic*
51 # Entries to consider:
52 # reb+, keb*. Index on both for sure. Do first
53 # Indices
54 # 1. Starts-with index: {first_char: set()} or {first_char: []}
55 # - We could do secondary buckets if desired, but let's just do one
56 # to start.
57 # - Separate dicts for readings/kanji? Same? (If separate, we
58 # need to search both...)
60 # Native language indexing
61 # Entries to consider: gloss
62 # Other factors: language (default: en, others supported)
64 # - dict indexes can be made in the same way, but only one rather than
65 # the dual reading/kanji dicts for Japanese.
66 # - Should be able to create indices in separate languages
67 # - Should be able to restrict searches to a single language
68 # FORMAT:
69 # native_indices {lang: indices={}}
# (Dict based on lang, maps to other indices)
72 from xml.sax.handler import ContentHandler, DTDHandler, EntityResolver
73 from xml.sax.xmlreader import InputSource
74 import xml.sax
75 import gzip, gettext
76 gettext.install('pyjben', unicode=True)
class JMdictEntry(object):

    """JMdict entry.

    For performance/memory reasons, attributes are dynamically
    created. Safe access to attributes can be done via
    getattr(obj, key, None).

    """

    def __init__(self):
        # Core fields; additional "cur_*" scratch attributes are
        # attached dynamically while an entry is being parsed.
        self.ent_seq = None  # unique entry sequence number
        self.k_ele = []      # kanji elements (dicts keyed by node name)
        self.r_ele = []      # reading elements (dicts keyed by node name)
        self.info = None     # entry-level metadata dict, if present
        self.sense = []      # sense dicts (glosses, POS, etc.)

    def to_string(self, **kwargs):
        """A default "to-string" dump of a JMdictEntry."""
        pieces = [u"JMdictEntry %d" % self.ent_seq]
        if self.k_ele:
            kanji_blobs = [elem[u"keb"] for elem in self.k_ele]
            pieces.append(u"Kanji blobs: %s" % u",".join(kanji_blobs))
            pieces.append(u"k_ele: %s" % unicode(self.k_ele))
        if self.r_ele:
            reading_blobs = [elem[u"reb"] for elem in self.r_ele]
            pieces.append(u"Reading blobs: %s" % u",".join(reading_blobs))
            pieces.append(u"r_ele: %s" % unicode(self.r_ele))
        if self.info:
            pieces.append(u"info: %s" % unicode(self.info))
        if self.sense:
            if len(self.sense) == 1:
                pieces.append(u"Sense: %s" % unicode(self.sense))
            else:
                for idx, sense in enumerate(self.sense):
                    pieces.append(u"Sense %d: %s" % (idx + 1, unicode(sense)))
        return u"\n".join(pieces)
class JMDSAXHandler(ContentHandler):

    """SAX handler for JMdict.

    If not using caching, parsing should take a minimal amount of
    memory as only the matching results are stored and returned. A
    single non-cached search will be slightly faster than a cached one
    (over 10% on my machine). However, realistically this function
    should only be used for systems which are severely strapped for
    memory.

    Further, rather than using JMdict, why not just use classic EDICT?
    If the extra info is not really needed, it'll greatly speed things
    up to use something else.

    """

    def __init__(self, use_cache, search_str, *args, **kwargs):
        ContentHandler.__init__(self, *args, **kwargs)
        self.parsing = False       # True while inside an <entry> element
        self.entry = None          # JMdictEntry currently being populated
        self.path = []             # stack of (name, attrs) within the entry
        self.full_keys = set()
        self.data = []             # completed JMdictEntry objects
        self.node_content = ""     # character data gathered for current node

        self.use_cache = use_cache
        self.search_str = search_str

    def get_path(self):
        """Return the current element path as a slash-joined string."""
        return u"/".join([i[0] for i in self.path])

    def get_attr_str(self):
        """Return "key: value" pairs of the current element's attributes."""
        return u", ".join([u"%s: %s" % (k, v)
                           for k, v in self.path[-1][1].items()])

    def startElement(self, name, attrs):
        # <entry> toggles parsing mode; all other elements are only
        # tracked while we are inside an entry.
        if name == u"entry":
            self.parsing = True
            self.entry = JMdictEntry()
        elif self.parsing:
            self.path.append((name, attrs))
            if name in (u"k_ele", u"r_ele", u"sense", u"links", u"audit"):
                # Create a temp var for the current reading, sense, etc.
                key = u"cur_%s" % name
                setattr(self.entry, key, {})
            elif name == u"info":
                self.entry.info = {}

    def endElement(self, name):
        entry = self.entry
        if self.parsing:
            self.node_content = self.node_content.strip()
            if self.node_content:
                # Assign data as appropriate
                node, attrs = self.path[-1]
                # Unique ID for entry
                if node == u"ent_seq":
                    entry.ent_seq = int(self.node_content)
                # Kanji elements (blob, info, priority)
                elif node == u"keb":
                    entry.cur_k_ele[node] = self.node_content
                elif node[:3] == u"ke_":
                    entry.cur_k_ele.setdefault(node, []).append(
                        self.node_content)
                # Reading elements (blob, nokanji?, reading substrs, inf, pri)
                elif node == u"reb":
                    entry.cur_r_ele[node] = self.node_content
                elif node == u"re_nokanji": # special case
                    entry.cur_r_ele[node] = True
                elif node[:3] == u"re_": # reading element (all but nokanji)
                    entry.cur_r_ele.setdefault(node, []).append(
                        self.node_content)
                # Info element
                # links [], bibl [], etym [], audit []
                # links: (tag, desc, uri)
                # audit: upd_date, upd_detl)
                # bibl, etym, and all other child fields: strings
                elif node in (u"bibl", u"etym"):
                    entry.info.setdefault(node, []).append(
                        self.node_content)
                # These info nodes need to be appended on the
                # endElement event. *** TO DO ***
                elif node in (u"link_tag", u"link_desc", u"link_uri",
                              u"upd_date", u"upd_detl"):
                    setattr(entry, u"cur_%s" % node, self.node_content)
                # Sense elements (all but glosses)
                elif node in (u"stagk", u"stagr", u"pos", u"xref", u"ant",
                              u"field", u"misc", u"s_inf", u"dial",
                              u"example"):
                    entry.cur_sense.setdefault(node, []).append(
                        self.node_content)
                elif node == u"lsource":
                    # xml_lang is common
                    xml_lang = attrs.get(u"xml:lang", u"eng")
                    # ls_* seem new...
                    ls_type = attrs.get(u"ls_type", u"full")
                    ls_wasei = attrs.get(u"ls_wasei") # Flag for "waseieigo"
                    # We'll do a 4 node tuple for this entry...
                    entry.cur_sense.setdefault(node, []).append(
                        (self.node_content, xml_lang, ls_type, ls_wasei))
                # Glosses... It seems that <pri> is not yet used, so
                # glosses are pretty straightforward like the above fields.
                elif node == u"gloss":
                    xml_lang = attrs.get(u"xml:lang", u"eng")
                    g_gend = attrs.get(u"g_gend")
                    entry.cur_sense.setdefault(node, []).append(
                        (self.node_content, xml_lang, g_gend))
                elif node == u"pri":
                    print (u"DEBUG: <pri> field detected! This is a new "
                           u"field; please contact the author with the "
                           u"modification date of your copy of JMdict so he "
                           u"can update J-Ben to support it!")
                else: # Unhandled
                    print (u"DEBUG: path %s: unhandled node %s with content "
                           u"[%s]" % (self.get_path(), node,
                                      self.node_content))
                self.node_content = ""

            if self.path:
                if name != self.path[-1][0]:
                    # Shouldn't ever happen, but mistakes *can* slip in...
                    print u"Mismatch detected, path is %s, element name is %s" \
                          % (self.get_path(), name)
                else:
                    self.path.pop()

            # Handle composite values
            # First, the dict types...
            if name in (u"k_ele", u"r_ele", u"sense"):
                temp_key = u"cur_%s" % name
                obj = getattr(entry, temp_key)
                getattr(entry, name).append(obj)
                delattr(entry, temp_key)
            # Next, the two tuple types
            elif name == u"links":
                entry.info.setdefault(u"links", []).append(
                    (entry.cur_link_tag,
                     entry.cur_link_desc,
                     entry.cur_link_uri))
                delattr(entry, u"cur_link_tag")
                delattr(entry, u"cur_link_desc")
                delattr(entry, u"cur_link_uri")
            elif name == u"audit":
                entry.info.setdefault(u"audit", []).append(
                    (entry.cur_upd_date,
                     entry.cur_upd_detl))
                delattr(entry, u"cur_upd_date")
                delattr(entry, u"cur_upd_detl")

            # Handle end of entry
            elif name == u"entry":
                # Sanity check: no "cur_*" scratch attribute should
                # survive once the entry closes.
                for node in (u"k_ele", u"r_ele", u"sense",
                             u"link_tag", u"link_desc", u"link_uri",
                             u"upd_date", u"upd_detl"):
                    if hasattr(entry, u"cur_%s" % node):
                        print vars(entry)
                        print node
                        raise Exception(u"Shouldn't-Happen-Error")

                # LATER: do some optimization if doing non-cached searches.
                # (probably won't help many people though...)
                #if not self.use_cache:
                #    raise Exception(u"JMdict no-cache-mode not yet supported!")
                #else:
                #    self.data.append(entry)

                # For now: all entries go into the data list.
                self.data.append(entry)

                entry = None
                self.parsing = False

    def characters(self, content):
        # Text may arrive in several chunks per node; accumulate it.
        if self.parsing:
            self.node_content += content

    def skippedEntity(self, name):
        # 2 things need to be done here:
        # 1. JMdict entities need to be stored properly
        # 2. Standard XML entities (***IF*** they are ***ALSO*** not parsed)
        #    should be manually put into the character stream.
        if self.parsing:
            if name in (u"lt", u"amp", u"gt", u"quot", u"apos"):
                print u"Houston, we gots ourselves a BIG problem:", name
            else:
                self.node_content += name
306 from xml.sax.expatreader import ExpatParser
class ExpatParserNoEntityExp(ExpatParser):

    """An overridden Expat parser class which disables entity expansion."""

    def reset(self):
        # After the standard reset, route "default" parse events
        # (which include entity references) to a no-op handler, so
        # entities are reported via skippedEntity rather than expanded.
        ExpatParser.reset(self)
        self._parser.DefaultHandler = self.dummy_handler

    def dummy_handler(self, *args, **kwargs):
        # Deliberately ignore all default-handled parse events.
        pass
class JMdictParser(object):

    """Loads JMdict entries via SAX and supports indexed searches."""

    def __init__(self, filename, use_cache=True, encoding="utf-8"):
        """Initializer for JMdictParser.

        filename: path to the JMdict file (plain XML or gzipped).
        use_cache: JMdict is a large, heavy to parse file.  Although
            it takes a large amount of memory, it is ideal to retain
            it in memory to increase the speed of subsequent searches.
        encoding: stored for callers; the SAX input source itself is
            always fed as UTF-8.

        """
        self.filename = filename
        self.encoding = encoding
        self.cache = None
        self.use_cache = use_cache

        # All cached entries will be stored here
        self.entries = []
        self.entry_count = 0

        # Indices
        # Basic level index: key: list of entry offsets (constant order)
        self.j_ind = {}  # Japanese (ind_type: index)
        self.n_ind = {}  # Native (lang: {ind_type: index})
        self.index_list = ["starts_with"]  # List of indices to auto-create

    def load_via_sax(self, use_cache, search_str):
        """Parse the JMdict file and return the list of parsed entries."""
        if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        try:
            sh = JMDSAXHandler(use_cache, search_str)
            isource = InputSource()
            isource.setEncoding("utf-8")
            isource.setByteStream(f)

            # Parser: Since I wish to directly handle the "entities", we
            # need to override default behavior and cannot just use
            # xml.sax.parse.
            parser = ExpatParserNoEntityExp()
            parser.setContentHandler(sh)
            parser.parse(isource)
        finally:
            # Close the file even if parsing blows up (was leaked before).
            f.close()
        return sh.data

    def search(self, search_str, index="starts_with", n_langs=("eng",),
               n_fallback=True):
        """Search JMdict for a Japanese or native language query.

        search_str: the query
        index: index to use (valid values: starts_with, None)
        n_langs: languages to search for in the native-language pass
        n_fallback: If True, processes languages in a "fallback" fashion:
            for each entry examined, only look at the first language
            to have glosses and ignore the rest.  (Not yet implemented.)

        Returns a list of matching entries without duplicates.
        Raises Exception for an unrecognized index type.

        """
        data = None
        if self.use_cache:
            data = self.cache
        if not data:
            # SAX style loader; loads everything into memory once,
            # then indices are built over the loaded data.
            data = self.load_via_sax(self.use_cache, search_str)
            if self.use_cache:
                self.cache = data
            self.create_indices(data, self.index_list)

        results = []
        prefix_len = len(search_str)
        if index == "starts_with":
            # An empty query has no first character to index on.
            if not search_str:
                return results
            # Indexed lookup on the query's first character.
            key = search_str[0]

            # Japanese first: match against kanji and reading blobs.
            j_idx = self.j_ind.get(index, {}).get(key)
            if j_idx:
                for entry in [data[i] for i in j_idx]:
                    added = False
                    for k_ele in entry.k_ele:
                        if search_str == k_ele[u"keb"][:prefix_len]:
                            results.append(entry)
                            added = True
                            break
                    if added:
                        continue
                    for r_ele in entry.r_ele:
                        if search_str == r_ele[u"reb"][:prefix_len]:
                            results.append(entry)
                            break

            # Native language next:
            # WEAKNESS: if we later support searching via other
            # languages which use Chinese characters, we may end up
            # with duplicates with this code.
            for lang in n_langs:
                n_idx = self.n_ind.get(lang, {}).get(index, {}).get(key)
                if not n_idx:
                    continue
                for entry in [data[i] for i in n_idx]:
                    if n_fallback:
                        # Language fallback: NOT YET IMPLEMENTED
                        pass
                    added = False
                    for sense in entry.sense:
                        # A sense may lack glosses entirely; .get avoids
                        # the KeyError the old code raised here.
                        # (g_lang renamed so it no longer shadows lang.)
                        for gloss, g_lang, g_gend in sense.get(u"gloss", ()):
                            if search_str == gloss[:prefix_len]:
                                results.append(entry)
                                added = True
                                break
                        if added:
                            # Stop after first match to avoid appending
                            # the same entry once per matching gloss.
                            break
        elif not index:
            # Non-indexed lookup
            # WARNING: this could be VERY slow!
            for entry in data:
                # Japanese search:
                # *** TO DO ***

                # Native language search:
                added = False
                for sense in entry.sense:
                    for gloss, g_lang, g_gend in sense.get(u"gloss", ()):
                        if g_lang not in n_langs:
                            continue
                        if search_str == gloss[:prefix_len]:
                            # Was results.add() - an AttributeError,
                            # since results is a list.
                            results.append(entry)
                            added = True
                            break
                    if added:
                        break
        else:
            raise Exception(u"Unhandled index type: %s" % index)

        return results

    def create_indices(self, data, desired_indices):
        """Creates desired indices for a set of input data."""
        # Initialize (reset) indices
        self.j_ind = {}
        self.n_ind = {}

        for i, entry in enumerate(data):
            for index_name in desired_indices:
                if index_name == "starts_with":
                    # Make targets: first chars of kanji/reading blobs
                    # (Japanese) and of glosses (native, per language).
                    j_targets = set()
                    n_targets = {}
                    for k_ele in entry.k_ele:
                        j_targets.add(k_ele[u"keb"][0])
                    for r_ele in entry.r_ele:
                        j_targets.add(r_ele[u"reb"][0])
                    for sense in entry.sense:
                        # "in" instead of has_key: works on Py2 and Py3.
                        if u"gloss" not in sense:
                            continue
                        for gloss, lang, gender in sense[u"gloss"]:
                            n_targets.setdefault(lang, set()).add(gloss[0])
                    # Append to indices (indices are lists of offsets)
                    for target in j_targets:
                        self.j_ind.setdefault(index_name, {}) \
                                  .setdefault(target, []) \
                                  .append(i)
                    # Key iteration instead of iteritems(): Py2/Py3 safe.
                    for lang in n_targets:
                        for target in n_targets[lang]:
                            self.n_ind.setdefault(lang, {}) \
                                      .setdefault(index_name, {}) \
                                      .setdefault(target, []) \
                                      .append(i)
                else:
                    raise Exception(u"Unsupported index type")
495 if __name__ == "__main__":
496 import sys, os
498 if len(sys.argv) < 2:
499 print _(u"Please specify a dictionary file.")
500 exit(-1)
501 try:
502 kp = JMdictParser(sys.argv[1])
503 except Exception, e:
504 print _(u"Could not create JMdictParser: %s") % unicode(e)
505 exit(-1)
507 if len(sys.argv) < 3:
508 print _(u"Please specify a search query.")
509 exit(-1)
511 if os.name == "nt":
512 charset = "cp932"
513 else:
514 charset = "utf-8"
516 for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
517 print _(u"Entry %d: %s") % (i+1, entry.to_string())