Basic JMdict searches ("starts_with" index) should now work.
[jben2_gui.git] / parsers / jmdict.py
blobad17369dafdf90c3d9a237c3f38f0302d070d0b6
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
5 # All rights reserved.
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
9 # are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A basic parser for JMdict."""
33 # Parsing is now handled, but some things are still needed:
35 # 1. Passing in of the parser object, or some config structure, so
36 # that only desired fields are stored. (Otherwise we'll waste more
37 # memory than necessary on our cache, and on JMdict there's a lot
38 # of stuff to store.)
39 # 2. Indices
41 # HOW TO INDEX JMDICT ENTRIES
43 # JMdict is a Japanese-English dictionary file, however in practice it
44 # is used for bidirectional searches.
46 # What we have: big list of entries, japanese readings/kanji as
47 # central entries, glosses as native language entries, multiple
48 # glosses per entry.
50 # Japanese indexing, *basic*
51 # Entries to consider:
52 # reb+, keb*. Index on both for sure. Do first
53 # Indices
54 # 1. Starts-with index: {first_char: set()} or {first_char: []}
55 # - We could do secondary buckets if desired, but let's just do one
56 # to start.
57 # - Separate dicts for readings/kanji? Same? (If separate, we
58 # need to search both...)
60 # Native language indexing
61 # Entries to consider: gloss
62 # Other factors: language (default: en, others supported)
64 # - dict indexes can be made in the same way, but only one rather than
65 # the dual reading/kanji dicts for Japanese.
66 # - Should be able to create indices in separate languages
67 # - Should be able to restrict searches to a single language
68 # FORMAT:
69 # native_indices {lang: indices={}}
# (Dict based on lang, maps to other indices)
72 from xml.sax.handler import ContentHandler, DTDHandler, EntityResolver
73 from xml.sax.xmlreader import InputSource
74 import xml.sax
75 import gzip, gettext
76 gettext.install('pyjben', unicode=True)
class JMdictEntry(object):

    """JMdict entry.

    For performance/memory reasons, attributes are dynamically
    created. Safe access to attributes can be done via
    getattr(obj, key, None).

    """

    def __init__(self):
        # Core fields; additional "cur_*" scratch attributes are
        # attached dynamically while an entry is being parsed.
        self.ent_seq = None  # unique entry sequence number
        self.k_ele = []      # kanji elements (dicts keyed by node name)
        self.r_ele = []      # reading elements (dicts keyed by node name)
        self.info = None     # entry-level metadata dict, if present
        self.sense = []      # sense dicts (glosses, POS, etc.)

    def to_string(self, **kwargs):
        """A default "to-string" dump of a JMdictEntry."""
        pieces = [u"JMdictEntry %d" % self.ent_seq]
        if self.k_ele:
            kanji_blobs = [elem[u"keb"] for elem in self.k_ele]
            pieces.append(u"Kanji blobs: %s" % u",".join(kanji_blobs))
            pieces.append(u"k_ele: %s" % unicode(self.k_ele))
        if self.r_ele:
            reading_blobs = [elem[u"reb"] for elem in self.r_ele]
            pieces.append(u"Reading blobs: %s" % u",".join(reading_blobs))
            pieces.append(u"r_ele: %s" % unicode(self.r_ele))
        if self.info:
            pieces.append(u"info: %s" % unicode(self.info))
        if self.sense:
            if len(self.sense) == 1:
                pieces.append(u"Sense: %s" % unicode(self.sense))
            else:
                for idx, sense in enumerate(self.sense):
                    pieces.append(u"Sense %d: %s" % (idx + 1, unicode(sense)))
        return u"\n".join(pieces)
class JMDSAXHandler(ContentHandler):

    """SAX handler for JMdict.

    If not using caching, parsing should take a minimal amount of
    memory as only the matching results are stored and returned. A
    single non-cached search will be slightly faster than a cached one
    (over 10% on my machine). However, realistically this function
    should only be used for systems which are severely strapped for
    memory.

    Further, rather than using JMdict, why not just use classic EDICT?
    If the extra info is not really needed, it'll greatly speed things
    up to use something else.

    """

    def __init__(self, use_cache, search_str, *args, **kwargs):
        ContentHandler.__init__(self, *args, **kwargs)
        self.parsing = False       # True while inside an <entry> element
        self.entry = None          # JMdictEntry currently being populated
        self.path = []             # stack of (name, attrs) within the entry
        self.full_keys = set()
        self.data = []             # completed JMdictEntry objects
        self.node_content = ""     # character data gathered for current node

        self.use_cache = use_cache
        self.search_str = search_str

    def get_path(self):
        """Return the current element path as a slash-joined string."""
        return u"/".join([i[0] for i in self.path])

    def get_attr_str(self):
        """Return "key: value" pairs of the current element's attributes."""
        return u", ".join([u"%s: %s" % (k, v)
                           for k, v in self.path[-1][1].items()])

    def startElement(self, name, attrs):
        # <entry> toggles parsing mode; all other elements are only
        # tracked while we are inside an entry.
        if name == u"entry":
            self.parsing = True
            self.entry = JMdictEntry()
        elif self.parsing:
            self.path.append((name, attrs))
            if name in (u"k_ele", u"r_ele", u"sense", u"links", u"audit"):
                # Create a temp var for the current reading, sense, etc.
                key = u"cur_%s" % name
                setattr(self.entry, key, {})
            elif name == u"info":
                self.entry.info = {}

    def endElement(self, name):
        entry = self.entry
        if self.parsing:
            self.node_content = self.node_content.strip()
            if self.node_content:
                # Assign data as appropriate
                node, attrs = self.path[-1]
                # Unique ID for entry
                if node == u"ent_seq":
                    entry.ent_seq = int(self.node_content)
                # Kanji elements (blob, info, priority)
                elif node == u"keb":
                    entry.cur_k_ele[node] = self.node_content
                elif node[:3] == u"ke_":
                    entry.cur_k_ele.setdefault(node, []).append(
                        self.node_content)
                # Reading elements (blob, nokanji?, reading substrs, inf, pri)
                elif node == u"reb":
                    entry.cur_r_ele[node] = self.node_content
                elif node == u"re_nokanji": # special case
                    entry.cur_r_ele[node] = True
                elif node[:3] == u"re_": # reading element (all but nokanji)
                    entry.cur_r_ele.setdefault(node, []).append(
                        self.node_content)
                # Info element
                # links [], bibl [], etym [], audit []
                # links: (tag, desc, uri)
                # audit: upd_date, upd_detl)
                # bibl, etym, and all other child fields: strings
                elif node in (u"bibl", u"etym"):
                    entry.info.setdefault(node, []).append(
                        self.node_content)
                # These info nodes need to be appended on the
                # endElement event. *** TO DO ***
                elif node in (u"link_tag", u"link_desc", u"link_uri",
                              u"upd_date", u"upd_detl"):
                    setattr(entry, u"cur_%s" % node, self.node_content)
                # Sense elements (all but glosses)
                elif node in (u"stagk", u"stagr", u"pos", u"xref", u"ant",
                              u"field", u"misc", u"s_inf", u"dial",
                              u"example"):
                    entry.cur_sense.setdefault(node, []).append(
                        self.node_content)
                elif node == u"lsource":
                    # xml_lang is common
                    xml_lang = attrs.get(u"xml:lang", u"eng")
                    # ls_* seem new...
                    ls_type = attrs.get(u"ls_type", u"full")
                    ls_wasei = attrs.get(u"ls_wasei") # Flag for "waseieigo"
                    # We'll do a 4 node tuple for this entry...
                    entry.cur_sense.setdefault(node, []).append(
                        (self.node_content, xml_lang, ls_type, ls_wasei))
                # Glosses... It seems that <pri> is not yet used, so
                # glosses are pretty straightforward like the above fields.
                elif node == u"gloss":
                    xml_lang = attrs.get(u"xml:lang", u"eng")
                    g_gend = attrs.get(u"g_gend")
                    entry.cur_sense.setdefault(node, []).append(
                        (self.node_content, xml_lang, g_gend))
                elif node == u"pri":
                    print (u"DEBUG: <pri> field detected! This is a new "
                           u"field; please contact the author with the "
                           u"modification date of your copy of JMdict so he "
                           u"can update J-Ben to support it!")
                else: # Unhandled
                    print (u"DEBUG: path %s: unhandled node %s with content "
                           u"[%s]" % (self.get_path(), node,
                                      self.node_content))
                self.node_content = ""

            if self.path:
                if name != self.path[-1][0]:
                    # Shouldn't ever happen, but mistakes *can* slip in...
                    print u"Mismatch detected, path is %s, element name is %s" \
                          % (self.get_path(), name)
                else:
                    self.path.pop()

            # Handle composite values
            # First, the dict types...
            if name in (u"k_ele", u"r_ele", u"sense"):
                temp_key = u"cur_%s" % name
                obj = getattr(entry, temp_key)
                getattr(entry, name).append(obj)
                delattr(entry, temp_key)
            # Next, the two tuple types
            elif name == u"links":
                entry.info.setdefault(u"links", []).append(
                    (entry.cur_link_tag,
                     entry.cur_link_desc,
                     entry.cur_link_uri))
                delattr(entry, u"cur_link_tag")
                delattr(entry, u"cur_link_desc")
                delattr(entry, u"cur_link_uri")
            elif name == u"audit":
                entry.info.setdefault(u"audit", []).append(
                    (entry.cur_upd_date,
                     entry.cur_upd_detl))
                delattr(entry, u"cur_upd_date")
                delattr(entry, u"cur_upd_detl")

            # Handle end of entry
            elif name == u"entry":
                # Sanity check: no "cur_*" scratch attribute should
                # survive once the entry closes.
                for node in (u"k_ele", u"r_ele", u"sense",
                             u"link_tag", u"link_desc", u"link_uri",
                             u"upd_date", u"upd_detl"):
                    if hasattr(entry, u"cur_%s" % node):
                        print vars(entry)
                        print node
                        raise Exception(u"Shouldn't-Happen-Error")

                # LATER: do some optimization if doing non-cached searches.
                # (probably won't help many people though...)
                #if not self.use_cache:
                #    raise Exception(u"JMdict no-cache-mode not yet supported!")
                #else:
                #    self.data.append(entry)

                # For now: all entries go into the data list.
                self.data.append(entry)

                entry = None
                self.parsing = False

    def characters(self, content):
        # Text may arrive in several chunks per node; accumulate it.
        if self.parsing:
            self.node_content += content

    def skippedEntity(self, name):
        # 2 things need to be done here:
        # 1. JMdict entities need to be stored properly
        # 2. Standard XML entities (***IF*** they are ***ALSO*** not parsed)
        #    should be manually put into the character stream.
        if self.parsing:
            if name in (u"lt", u"amp", u"gt", u"quot", u"apos"):
                print u"Houston, we gots ourselves a BIG problem:", name
            else:
                self.node_content += name
306 from xml.sax.expatreader import ExpatParser
class ExpatParserNoEntityExp(ExpatParser):

    """An overridden Expat parser class which disables entity expansion."""

    def reset(self):
        # After the standard reset, route "default" parse events
        # (which include entity references) to a no-op handler, so
        # entities are reported via skippedEntity rather than expanded.
        ExpatParser.reset(self)
        self._parser.DefaultHandler = self.dummy_handler

    def dummy_handler(self, *args, **kwargs):
        # Deliberately ignore all default-handled parse events.
        pass
class JMdictParser(object):

    """Loads JMdict entries via SAX and supports indexed searches."""

    def __init__(self, filename, use_cache=True, encoding="utf-8"):
        """Initializer for JMdictParser.

        filename: path to the JMdict file (plain XML or gzipped).
        use_cache: JMdict is a large, heavy to parse file.  Although
            it takes a large amount of memory, it is ideal to retain
            it in memory to increase the speed of subsequent searches.
        encoding: stored for callers; the SAX input source itself is
            always fed as UTF-8.

        """
        self.filename = filename
        self.encoding = encoding
        self.cache = None
        self.use_cache = use_cache

        # All cached entries will be stored here
        self.entries = []
        self.entry_count = 0

        # Indices
        # Basic level index: key: list of entry offsets (constant order)
        self.j_ind = {}  # Japanese (ind_type: index)
        self.n_ind = {}  # Native (lang: {ind_type: index})
        self.index_list = ["starts_with"]  # List of indices to auto-create

    def load_via_sax(self, use_cache, search_str):
        """Parse the JMdict file and return the list of parsed entries."""
        if len(self.filename) >= 3 and self.filename[-3:] == ".gz":
            f = gzip.open(self.filename)
        else:
            f = open(self.filename, "rb")
        try:
            sh = JMDSAXHandler(use_cache, search_str)
            isource = InputSource()
            isource.setEncoding("utf-8")
            isource.setByteStream(f)

            # Parser: Since I wish to directly handle the "entities", we
            # need to override default behavior and cannot just use
            # xml.sax.parse.
            parser = ExpatParserNoEntityExp()
            parser.setContentHandler(sh)
            parser.parse(isource)
        finally:
            # Close the file even if parsing blows up (was leaked before).
            f.close()
        return sh.data

    def search(self, search_str, index="starts_with", n_langs=("eng",),
               n_fallback=True):
        """Search JMdict for a Japanese or native language query.

        search_str: the query
        index: index to use (valid values: starts_with, None)
        n_langs: languages to search for in the native-language pass
        n_fallback: If True, processes languages in a "fallback" fashion:
            for each entry examined, only look at the first language
            to have glosses and ignore the rest.  (Not yet implemented.)

        Returns a list of matching entries without duplicates.
        Raises Exception for an unrecognized index type.

        """
        data = None
        if self.use_cache:
            data = self.cache
        if not data:
            # SAX style loader; loads everything into memory once,
            # then indices are built over the loaded data.
            data = self.load_via_sax(self.use_cache, search_str)
            if self.use_cache:
                self.cache = data
            self.create_indices(data, self.index_list)

        results = []
        prefix_len = len(search_str)
        if index == "starts_with":
            # An empty query has no first character to index on.
            if not search_str:
                return results
            # Indexed lookup on the query's first character.
            key = search_str[0]

            # Japanese first: match against kanji and reading blobs.
            j_idx = self.j_ind.get(index, {}).get(key)
            if j_idx:
                for entry in [data[i] for i in j_idx]:
                    added = False
                    for k_ele in entry.k_ele:
                        if search_str == k_ele[u"keb"][:prefix_len]:
                            results.append(entry)
                            added = True
                            break
                    if added:
                        continue
                    for r_ele in entry.r_ele:
                        if search_str == r_ele[u"reb"][:prefix_len]:
                            results.append(entry)
                            break

            # Native language next:
            # WEAKNESS: if we later support searching via other
            # languages which use Chinese characters, we may end up
            # with duplicates with this code.
            for lang in n_langs:
                n_idx = self.n_ind.get(lang, {}).get(index, {}).get(key)
                if not n_idx:
                    continue
                for entry in [data[i] for i in n_idx]:
                    if n_fallback:
                        # Language fallback: NOT YET IMPLEMENTED
                        pass
                    added = False
                    for sense in entry.sense:
                        # A sense may lack glosses entirely; .get avoids
                        # the KeyError the old code raised here.
                        # (g_lang renamed so it no longer shadows lang.)
                        for gloss, g_lang, g_gend in sense.get(u"gloss", ()):
                            if search_str == gloss[:prefix_len]:
                                results.append(entry)
                                added = True
                                break
                        if added:
                            # Stop after first match to avoid appending
                            # the same entry once per matching gloss.
                            break
        elif not index:
            # Non-indexed lookup
            # WARNING: this could be VERY slow!
            for entry in data:
                # Japanese search:
                # *** TO DO ***

                # Native language search:
                added = False
                for sense in entry.sense:
                    for gloss, g_lang, g_gend in sense.get(u"gloss", ()):
                        if g_lang not in n_langs:
                            continue
                        if search_str == gloss[:prefix_len]:
                            # Was results.add() - an AttributeError,
                            # since results is a list.
                            results.append(entry)
                            added = True
                            break
                    if added:
                        break
        else:
            raise Exception(u"Unhandled index type: %s" % index)

        return results

    def create_indices(self, data, desired_indices):
        """Creates desired indices for a set of input data."""
        # Initialize (reset) indices
        self.j_ind = {}
        self.n_ind = {}

        for i, entry in enumerate(data):
            for index_name in desired_indices:
                if index_name == "starts_with":
                    # Make targets: first chars of kanji/reading blobs
                    # (Japanese) and of glosses (native, per language).
                    j_targets = set()
                    n_targets = {}
                    for k_ele in entry.k_ele:
                        j_targets.add(k_ele[u"keb"][0])
                    for r_ele in entry.r_ele:
                        j_targets.add(r_ele[u"reb"][0])
                    for sense in entry.sense:
                        # "in" instead of has_key: works on Py2 and Py3.
                        if u"gloss" not in sense:
                            continue
                        for gloss, lang, gender in sense[u"gloss"]:
                            n_targets.setdefault(lang, set()).add(gloss[0])
                    # Append to indices (indices are lists of offsets)
                    for target in j_targets:
                        self.j_ind.setdefault(index_name, {}) \
                                  .setdefault(target, []) \
                                  .append(i)
                    # Key iteration instead of iteritems(): Py2/Py3 safe.
                    for lang in n_targets:
                        for target in n_targets[lang]:
                            self.n_ind.setdefault(lang, {}) \
                                      .setdefault(index_name, {}) \
                                      .setdefault(target, []) \
                                      .append(i)
                else:
                    raise Exception(u"Unsupported index type")
495 if __name__ == "__main__":
496 import sys, os
498 if len(sys.argv) < 2:
499 print _(u"Please specify a dictionary file.")
500 exit(-1)
501 try:
502 kp = JMdictParser(sys.argv[1])
503 except Exception, e:
504 print _(u"Could not create JMdictParser: %s") % unicode(e)
505 exit(-1)
507 if len(sys.argv) < 3:
508 print _(u"Please specify a search query.")
509 exit(-1)
511 if os.name == "nt":
512 charset = "cp932"
513 else:
514 charset = "utf-8"
516 for i, entry in enumerate(kp.search(sys.argv[2].decode(charset))):
517 print _(u"Entry %d: %s") % (i+1, entry.to_string())