From 5c7fc089c4319dad9134cab7a52444eb1f91c65e Mon Sep 17 00:00:00 2001 From: Paul Goins Date: Thu, 30 Jul 2009 01:06:29 +0900 Subject: [PATCH] Basic JMdict searches ("starts_with" index) should now work. --- parsers/jmdict.py | 87 +++++++++++++++++++++++++++++++++++++++++++++---- parsers/tests/jmdict.py | 63 ++++++++++++++++++++++++++--------- 2 files changed, 128 insertions(+), 22 deletions(-) diff --git a/parsers/jmdict.py b/parsers/jmdict.py index 52169dc..ad17369 100644 --- a/parsers/jmdict.py +++ b/parsers/jmdict.py @@ -365,7 +365,17 @@ class JMdictParser(object): f.close() return sh.data - def search(self, search_str): + def search(self, search_str, index="starts_with", n_langs=["eng"], + n_fallback=True): + """Search JMdict for a Japanese or native language query. + + search_str: the query + index: index to use (valid values: starts_with, None) + n_langs: list of native languages to search for + n_fallback: If True, processes languages in a "fallback" fashion: + for each entry examined, only look at the first language + to have glosses and ignore the rest. + """ data = None if self.use_cache: data = self.cache if not data: @@ -382,15 +392,78 @@ class JMdictParser(object): self.create_indices(data, self.index_list) - # Add search logic here - i = 0 - for entry in data: - i += 1 - yield entry - if i >= 5: exit(0) + results = [] + if index == "starts_with": + # Indexed lookup + key = search_str[0] + + # Japanese first: + idx = self.j_ind.get(index) + if idx: + idx = idx.get(key) + if idx: + for entry in [data[i] for i in idx]: + added = False + for k_ele in entry.k_ele: + if search_str == k_ele[u"keb"][:len(search_str)]: + results.append(entry) + added = True + break + if added: continue + for r_ele in entry.r_ele: + if search_str == r_ele[u"reb"][:len(search_str)]: + results.append(entry) + break + + # Native language next: + # WEAKNESS: if we later support searching via other + # languages which use Chinese characters, we may end up + # with duplicates with this code. + for lang in n_langs: + search_keys = None + idx = self.n_ind.get(lang) + if idx: + idx = idx.get(index) + if idx: + idx = idx.get(key) + if idx: + for entry in [data[i] for i in idx]: + if n_fallback: + # NOT YET IMPLEMENTED + pass + #else: + + for sense in entry.sense: + for gloss, lang, gender in sense[u"gloss"]: + if search_str == gloss[:len(search_str)]: + results.append(entry) + continue + elif not index: + # Non-indexed lookup + # WARNING: this could be VERY slow! + for entry in data: + # Japanese search: + # *** TO DO *** + + # Native language search: + for sense in entry.sense: + for gloss, lang, gender in sense[u"gloss"]: + if lang not in n_langs: + continue + if search_str == gloss[:len(search_str)]: + results.add(entry) + break + else: + raise Exception(u"Unhandled index type: %s" % index) + + return results def create_indices(self, data, desired_indices): """Creates desired indices for a set of input data.""" + # Initialize indices + self.j_ind = {} + self.n_ind = {} + for i, entry in enumerate(data): for index_name in desired_indices: if index_name == "starts_with": diff --git a/parsers/tests/jmdict.py b/parsers/tests/jmdict.py index 7dd1687..7eeb343 100644 --- a/parsers/tests/jmdict.py +++ b/parsers/tests/jmdict.py @@ -17,14 +17,45 @@ class JMdictTest(unittest.TestCase): def test_japanese_search(self): """JMDICT: Search for Japanese word/phrase""" - query = u"日本" - l = [entry for entry in self.parser.search(query)] + parser = self.parser + desired_indices = ["starts_with"] + + data = self._parse_x_entries(SRC_NAME, 10) + parser.cache = data + parser.create_indices(data, desired_indices) + + query = u"仝" + l = parser.search(query) + #print + #print "=====" + #for entry in l: + # print entry.to_string() + self.assertTrue(len(l) > 0) + #print "=====" + query = u"おなじく" + l = parser.search(query) + #for entry in l: + # print entry.to_string() + #print "=====" self.assertTrue(len(l) > 0) def test_native_search(self): """JMDICT: Search for non-Japanese word/phrase""" - query = u"Japan" - l = [entry for entry in self.parser.search(query)] + # Let's cheat a little: reading in the whole JMdict will make + # our unit tests UNBEARABLY slow. + # + # This test will fail if "repetition mark" does not show up + # within the first 10 entries. In such a case, the test will + # need to be updated. + parser = self.parser + desired_indices = ["starts_with"] + + data = self._parse_x_entries(SRC_NAME, 10) + parser.cache = data + parser.create_indices(data, desired_indices) + + query = u"repetition mark" + l = parser.search(query) self.assertTrue(len(l) > 0) def test_unparsed(self): @@ -71,11 +102,12 @@ class JMdictTest(unittest.TestCase): print "\n\tFirst query time: %f" % first_t print "\tSecond query time: %f" % second_t - def _parse_5_entries(self, filename): - """Helper function: reads 5 entries from the JMdict file. + def _parse_x_entries(self, filename, max_entries): + """Helper function: reads max_entries entries from the JMdict file. - The text for 5 entries of JMdict are naively read in, then - converted to a file-like object which the parser will use. + The text for max_entries entries of JMdict are naively read + in, then converted to a file-like object which the parser will + use. """ # Copied from parsers.jmdict @@ -84,8 +116,8 @@ class JMdictTest(unittest.TestCase): else: f = open(filename, "rb") - # Grab just the first 5 entries, then close f and make a new - # "f" via the StringIO lib. + # Grab just the first max_entries entries, then close f and + # make a new "f" via the StringIO lib. lines = [] count = 0 while True: @@ -93,7 +125,7 @@ class JMdictTest(unittest.TestCase): lines.append(line) if "" in line: count += 1 - if count >= 5: break + if count >= max_entries: break f.close() lines.append("\n") f = StringIO("".join(lines)) @@ -116,14 +148,14 @@ class JMdictTest(unittest.TestCase): def test_limited_parse(self): """JMDICT: Parse 5 entries successfully.""" - data = self._parse_5_entries(SRC_NAME) + data = self._parse_x_entries(SRC_NAME, 5) self.assertEqual(len(data), 5) def test_indexing(self): parser = self.parser desired_indices = ["starts_with"] - data = self._parse_5_entries(SRC_NAME) + data = self._parse_x_entries(SRC_NAME, 5) print "CREATING INDICES" parser.create_indices(data, desired_indices) @@ -147,13 +179,14 @@ class JMdictTest(unittest.TestCase): def tearDown(self): self.parser = None - del test_japanese_search - del test_native_search + #del test_japanese_search + #del test_native_search del test_unparsed del test_caching del test_no_cache del test_limited_parse + del test_indexing if __name__ == "__main__": unittest.main() -- 2.11.4.GIT