2 # -*- coding: utf-8 -*-
4 # Copyright (c) 2009, Paul Goins
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above
14 # copyright notice, this list of conditions and the following
15 # disclaimer in the documentation and/or other materials provided
16 # with the distribution.
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 # POSSIBILITY OF SUCH DAMAGE.
31 """A basic parser for JMdict."""
33 # Parsing is now handled, but some things are still needed:
35 # 1. Passing in of the parser object, or some config structure, so
36 # that only desired fields are stored. (Otherwise we'll waste more
37 # memory than necessary on our cache, and on JMdict there's a lot
41 # HOW TO INDEX JMDICT ENTRIES
43 # JMdict is a Japanese-English dictionary file, however in practice it
44 # is used for bidirectional searches.
46 # What we have: big list of entries, japanese readings/kanji as
47 # central entries, glosses as native language entries, multiple
50 # Japanese indexing, *basic*
51 # Entries to consider:
52 # reb+, keb*. Index on both for sure. Do first
54 # 1. Starts-with index: {first_char: set()} or {first_char: []}
55 # - We could do secondary buckets if desired, but let's just do one
57 # - Separate dicts for readings/kanji? Same? (If separate, we
58 # need to search both...)
60 # Native language indexing
61 # Entries to consider: gloss
62 # Other factors: language (default: en, others supported)
64 # - dict indexes can be made in the same way, but only one rather than
65 # the dual reading/kanji dicts for Japanese.
66 # - Should be able to create indices in separate languages
67 # - Should be able to restrict searches to a single language
69 # native_indices {lang: indices={}}
70 # (Dict based on lang, maps to other indices}
72 from xml
.sax
.handler
import ContentHandler
, DTDHandler
, EntityResolver
73 from xml
.sax
.xmlreader
import InputSource
76 gettext
.install('pyjben', unicode=True)
78 class JMdictEntry(object):
82 For performance/memory reasons, attributes are dynamically
83 created. Safe access to attributes can be done via
84 getattr(obj, key, None).
95 def to_string(self
, **kwargs
):
96 """A default "to-string" dump of a JMdictEntry."""
99 s
.append(u
"JMdictEntry %d" % self
.ent_seq
)
101 s
.append(u
"Kanji blobs: %s" % u
",".join(
102 [elem
[u
"keb"] for elem
in self
.k_ele
]))
103 s
.append(u
"k_ele: %s" % unicode(self
.k_ele
))
105 s
.append(u
"Reading blobs: %s" % u
",".join(
106 [elem
[u
"reb"] for elem
in self
.r_ele
]))
107 s
.append(u
"r_ele: %s" % unicode(self
.r_ele
))
108 if self
.info
: s
.append(u
"info: %s" % unicode(self
.info
))
110 if len(self
.sense
) == 1:
111 s
.append(u
"Sense: %s" % unicode(self
.sense
))
113 for i
, sense
in enumerate(self
.sense
):
114 s
.append(u
"Sense %d: %s" % (i
+1, unicode(sense
)))
117 class JMDSAXHandler(ContentHandler
):
119 """SAX handler for JMdict.
121 If not using caching, parsing should take a minimal amount of
122 memory as only the matching results are stored and returned. A
123 single non-cached search will be slightly faster than a cached one
124 (over 10% on my machine). However, realistically this function
125 should only be used for systems which are severely strapped for
128 Further, rather than using JMdict, why not just use classic EDICT?
129 If the extra info is not really needed, it'll greatly speed things
130 up to use something else.
134 def __init__(self
, use_cache
, search_str
, *args
, **kwargs
):
135 ContentHandler
.__init
__(self
, *args
, **kwargs
)
139 self
.full_keys
= set()
141 self
.node_content
= ""
143 self
.use_cache
= use_cache
144 self
.search_str
= search_str
147 return u
"/".join([i
[0] for i
in self
.path
])
149 def get_attr_str(self
):
150 return u
", ".join([u
"%s: %s" % (k
, v
)
151 for k
, v
in self
.path
[-1][1].items()])
153 def startElement(self
, name
, attrs
):
156 self
.entry
= JMdictEntry()
158 self
.path
.append((name
, attrs
))
159 if name
in (u
"k_ele", u
"r_ele", u
"sense", u
"links", u
"audit"):
160 # Create a temp var for the current reading, sense, etc.
161 key
= u
"cur_%s" % name
162 setattr(self
.entry
, key
, {})
163 elif name
== u
"info":
166 def endElement(self
, name
):
169 self
.node_content
= self
.node_content
.strip()
170 if self
.node_content
:
171 # Assign data as appropriate
172 node
, attrs
= self
.path
[-1]
173 # Unique ID for entry
174 if node
== u
"ent_seq":
175 entry
.ent_seq
= int(self
.node_content
)
176 # Kanji elements (blob, info, priority)
178 entry
.cur_k_ele
[node
] = self
.node_content
179 elif node
[:3] == u
"ke_":
180 entry
.cur_k_ele
.setdefault(node
, []).append(
182 # Reading elements (blob, nokanji?, reading substrs, inf, pri)
184 entry
.cur_r_ele
[node
] = self
.node_content
185 elif node
== u
"re_nokanji": # special case
186 entry
.cur_r_ele
[node
] = True
187 elif node
[:3] == u
"re_": # reading element (all but nokanji)
188 entry
.cur_r_ele
.setdefault(node
, []).append(
191 # links [], bibl [], etym [], audit []
192 # links: (tag, desc, uri)
193 # audit: upd_date, upd_detl)
194 # bibl, etym, and all other child fields: strings
195 elif node
in (u
"bibl", u
"etym"):
196 entry
.info
.setdefault(node
, []).append(
198 # These info nodes need to be appended on the
199 # endElement event. *** TO DO ***
200 elif node
in (u
"link_tag", u
"link_desc", u
"link_uri",
201 u
"upd_date", u
"upd_detl"):
202 setattr(entry
, u
"cur_%s" % node
, self
.node_content
)
203 # Sense elements (all but glosses)
204 elif node
in (u
"stagk", u
"stagr", u
"pos", u
"xref", u
"ant",
205 u
"field", u
"misc", u
"s_inf", u
"dial",
207 entry
.cur_sense
.setdefault(node
, []).append(
209 elif node
== u
"lsource":
211 xml_lang
= attrs
.get(u
"xml:lang", u
"eng")
213 ls_type
= attrs
.get(u
"ls_type", u
"full")
214 ls_wasei
= attrs
.get(u
"ls_wasei") # Flag for "waseieigo"
215 # We'll do a 4 node tuple for this entry...
216 entry
.cur_sense
.setdefault(node
, []).append(
217 (self
.node_content
, xml_lang
, ls_type
, ls_wasei
))
218 # Glosses... It seems that <pri> is not yet used, so
219 # glosses are pretty straightforward like the above fields.
220 elif node
== u
"gloss":
221 xml_lang
= attrs
.get(u
"xml:lang", u
"eng")
222 g_gend
= attrs
.get(u
"g_gend")
223 entry
.cur_sense
.setdefault(node
, []).append(
224 (self
.node_content
, xml_lang
, g_gend
))
226 print (u
"DEBUG: <pri> field detected! This is a new "
227 u
"field; please contact the author with the "
228 u
"modification date of your copy of JMdict so he "
229 u
"can update J-Ben to support it!")
232 print (u
"DEBUG: path %s: unhandled node %s with content "
233 u
"[%s]" % (self
.get_path(), node
,
235 self
.node_content
= ""
238 if name
!= self
.path
[-1][0]:
239 # Shouldn't ever happen, but mistakes *can* slip in...
240 print u
"Mismatch detected, path is %s, element name is %s" \
241 % (self
.get_path(), name
)
245 # Handle composite values
246 # First, the dict types...
247 if name
in (u
"k_ele", u
"r_ele", u
"sense"):
248 temp_key
= u
"cur_%s" % name
249 obj
= getattr(entry
, temp_key
)
250 getattr(entry
, name
).append(obj
)
251 delattr(entry
, temp_key
)
252 # Next, the two tuple types
253 elif name
== u
"links":
254 entry
.info
.setdefault(u
"links", []).append(
258 delattr(entry
, u
"cur_link_tag")
259 delattr(entry
, u
"cur_link_desc")
260 delattr(entry
, u
"cur_link_uri")
261 elif name
== u
"audit":
262 entry
.info
.setdefault(u
"audit", []).append(
265 delattr(entry
, u
"cur_upd_date")
266 delattr(entry
, u
"cur_upd_detl")
268 # Handle end of entry
269 elif name
== u
"entry":
270 for node
in (u
"k_ele", u
"r_ele", u
"sense",
271 u
"link_tag", u
"link_desc", u
"link_uri",
272 u
"upd_date", u
"upd_detl"):
273 if hasattr(entry
, u
"cur_%s" % node
):
276 raise Exception(u
"Shouldn't-Happen-Error")
278 # LATER: do some optimization if doing non-cached searches.
279 # (probably won't help many people though...)
280 #if not self.use_cache:
281 # raise Exception(u"JMdict no-cache-mode not yet supported!")
283 # self.data.append(entry)
285 # For now: all entries go into the data list.
286 self
.data
.append(entry
)
291 def characters(self
, content
):
293 self
.node_content
+= content
295 def skippedEntity(self
, name
):
296 # 2 things need to be done here:
297 # 1. JMdict entities need to be stored properly
298 # 2. Standard XML entities (***IF*** they are ***ALSO*** not parsed)
299 # should be manually put into the character stream.
301 if name
in (u
"lt", u
"amp", u
"gt", u
"quot", u
"apos"):
302 print u
"Houston, we gots ourselves a BIG problem:", name
304 self
.node_content
+= name
306 from xml
.sax
.expatreader
import ExpatParser
307 class ExpatParserNoEntityExp(ExpatParser
):
309 """An overridden Expat parser class which disables entity expansion."""
312 ExpatParser
.reset(self
)
313 self
._parser
.DefaultHandler
= self
.dummy_handler
315 def dummy_handler(self
, *args
, **kwargs
):
318 class JMdictParser(object):
320 def __init__(self
, filename
, use_cache
=True, encoding
="utf-8"):
321 """Initializer for JMdictParser.
323 About use_cache: JMdict is a large, heavy to parse file.
324 Although it takes a large amount of memory, it is ideal to
325 retain it in memory to increase the speed of subsequent
329 self
.filename
= filename
330 self
.encoding
= encoding
332 self
.use_cache
= use_cache
334 # All cached entries will be stored here
339 # Basic level index: key: set()
340 # Alternatively: key: list (constant order)
342 self
.j_ind
= {} # Japanese (ind_type: index)
343 self
.n_ind
= {} # Native (lang: lang_indices)
345 self
.index_list
= ["starts_with"] # List of indices to auto-create
347 def load_via_sax(self
, use_cache
, search_str
):
348 if len(self
.filename
) >= 3 and self
.filename
[-3:] == ".gz":
349 f
= gzip
.open(self
.filename
)
351 f
= open(self
.filename
, "rb")
353 sh
= JMDSAXHandler(use_cache
, search_str
)
354 isource
= InputSource()
355 isource
.setEncoding("utf-8")
356 isource
.setByteStream(f
)
358 # Parser: Since I wish to directly handle the "entities", we
359 # need to override default behavior and cannot just use
361 parser
= ExpatParserNoEntityExp()
362 parser
.setContentHandler(sh
)
364 parser
.parse(isource
)
368 def search(self
, search_str
, index
="starts_with", n_langs
=["eng"],
370 """Search JMdict for a Japanese or native language query.
372 search_str: the query
373 index: index to use (valid values: starts_with, None)
374 n_langs: list of native languages to search for
375 n_fallback: If True, processes languages in a "fallback" fashion:
376 for each entry examined, only look at the first language
377 to have glosses and ignore the rest.
380 if self
.use_cache
: data
= self
.cache
383 # Opt 1: sax... very powerful, but too much code with my impl?
384 # Opt 2: elementtree... more memory required, loads
385 # everything at once...
386 # Opt 3: sax... redo to store all vars as lists, or similar.
388 # First attempt of a SAX style loader.
389 data
= self
.load_via_sax(self
.use_cache
, search_str
)
391 if self
.use_cache
: self
.cache
= data
393 self
.create_indices(data
, self
.index_list
)
396 if index
== "starts_with":
401 idx
= self
.j_ind
.get(index
)
405 for entry
in [data
[i
] for i
in idx
]:
407 for k_ele
in entry
.k_ele
:
408 if search_str
== k_ele
[u
"keb"][:len(search_str
)]:
409 results
.append(entry
)
413 for r_ele
in entry
.r_ele
:
414 if search_str
== r_ele
[u
"reb"][:len(search_str
)]:
415 results
.append(entry
)
418 # Native language next:
419 # WEAKNESS: if we later support searching via other
420 # languages which use Chinese characters, we may end up
421 # with duplicates with this code.
424 idx
= self
.n_ind
.get(lang
)
430 for entry
in [data
[i
] for i
in idx
]:
432 # NOT YET IMPLEMENTED
436 for sense
in entry
.sense
:
437 for gloss
, lang
, gender
in sense
[u
"gloss"]:
438 if search_str
== gloss
[:len(search_str
)]:
439 results
.append(entry
)
443 # WARNING: this could be VERY slow!
448 # Native language search:
449 for sense
in entry
.sense
:
450 for gloss
, lang
, gender
in sense
[u
"gloss"]:
451 if lang
not in n_langs
:
453 if search_str
== gloss
[:len(search_str
)]:
457 raise Exception(u
"Unhandled index type: %s" % index
)
461 def create_indices(self
, data
, desired_indices
):
462 """Creates desired indices for a set of input data."""
467 for i
, entry
in enumerate(data
):
468 for index_name
in desired_indices
:
469 if index_name
== "starts_with":
473 for k_ele
in entry
.k_ele
:
474 j_targets
.add(k_ele
[u
"keb"][0])
475 for r_ele
in entry
.r_ele
:
476 j_targets
.add(r_ele
[u
"reb"][0])
477 for sense
in entry
.sense
:
478 if not sense
.has_key(u
"gloss"): continue
479 for gloss
, lang
, gender
in sense
[u
"gloss"]:
480 n_targets
.setdefault(lang
, set()).add(gloss
[0])
481 # Append to indices (assuming indices as lists)
482 for target
in j_targets
:
483 self
.j_ind
.setdefault(index_name
, {}) \
484 .setdefault(target
, []) \
486 for lang
, targ_set
in n_targets
.iteritems():
487 for target
in targ_set
:
488 self
.n_ind
.setdefault(lang
, {}) \
489 .setdefault(index_name
, {}) \
490 .setdefault(target
, []) \
493 raise Exception(u
"Unsupported index type")
495 if __name__
== "__main__":
498 if len(sys
.argv
) < 2:
499 print _(u
"Please specify a dictionary file.")
502 kp
= JMdictParser(sys
.argv
[1])
504 print _(u
"Could not create JMdictParser: %s") % unicode(e
)
507 if len(sys
.argv
) < 3:
508 print _(u
"Please specify a search query.")
516 for i
, entry
in enumerate(kp
.search(sys
.argv
[2].decode(charset
))):
517 print _(u
"Entry %d: %s") % (i
+1, entry
.to_string())