From 5c7fc089c4319dad9134cab7a52444eb1f91c65e Mon Sep 17 00:00:00 2001
From: Paul Goins <general@vultaire.net>
Date: Thu, 30 Jul 2009 01:06:29 +0900
Subject: [PATCH] Basic JMdict searches ("starts_with" index) should now work.

---
 parsers/jmdict.py       | 87 +++++++++++++++++++++++++++++++++++++++++++++----
 parsers/tests/jmdict.py | 63 ++++++++++++++++++++++++++---------
 2 files changed, 128 insertions(+), 22 deletions(-)

diff --git a/parsers/jmdict.py b/parsers/jmdict.py
index 52169dc..ad17369 100644
--- a/parsers/jmdict.py
+++ b/parsers/jmdict.py
@@ -365,7 +365,17 @@ class JMdictParser(object):
         f.close()
         return sh.data
 
-    def search(self, search_str):
+    def search(self, search_str, index="starts_with", n_langs=["eng"],
+               n_fallback=True):
+        """Return a list of JMdict entries whose text starts with a query.
+
+        search_str: the query (prefix-matched against keb, reb and gloss)
+        index: "starts_with" for an indexed lookup, or None for a slow,
+               unindexed scan of glosses; other values raise an Exception
+        n_langs: list of native languages to search for
+        n_fallback: meant to stop at the first language that has glosses
+                    for an entry; not implemented yet, so it is ignored
+        """
         data = None
         if self.use_cache: data = self.cache
         if not data:
@@ -382,15 +392,78 @@ class JMdictParser(object):
 
             self.create_indices(data, self.index_list)
 
-        # Add search logic here
-        i = 0
-        for entry in data:
-            i += 1
-            yield entry
-            if i >= 5: exit(0)
+        results = []
+        if index == "starts_with":
+            # Indexed lookup; [:1] keeps an empty query from raising.
+            key = search_str[:1]
+
+            # Japanese first:
+            idx = self.j_ind.get(index)
+            if idx:
+                idx = idx.get(key)
+            if idx:
+                for entry in [data[i] for i in idx]:
+                    added = False
+                    for k_ele in entry.k_ele:
+                        if search_str == k_ele[u"keb"][:len(search_str)]:
+                            results.append(entry)
+                            added = True
+                            break
+                    if added: continue
+                    for r_ele in entry.r_ele:
+                        if search_str == r_ele[u"reb"][:len(search_str)]:
+                            results.append(entry)
+                            break
+
+            # Native language next:
+            # WEAKNESS: if we later support searching via other
+            # languages which use Chinese characters, we may end up
+            # with duplicates with this code.
+            for lang in n_langs:
+                idx = self.n_ind.get(lang)
+                if idx:
+                    idx = idx.get(index)
+                if idx:
+                    idx = idx.get(key)
+                if idx:
+                    for entry in [data[i] for i in idx]:
+                        if n_fallback:
+                            # NOT YET IMPLEMENTED
+                            pass
+                        #else:
+
+                        added = False
+                        for sense in entry.sense:
+                            for gloss, g_lang, gender in sense[u"gloss"]:
+                                if search_str == gloss[:len(search_str)]:
+                                    added = True
+                        if added: results.append(entry)  # once per entry
+        elif not index:
+            # Non-indexed lookup
+            # WARNING: this could be VERY slow!
+            for entry in data:
+                # Japanese search:
+                # *** TO DO ***
+
+                # Native language search:
+                added = False
+                for sense in entry.sense:
+                    for gloss, lang, gender in sense[u"gloss"]:
+                        if (lang in n_langs and
+                            search_str == gloss[:len(search_str)]):
+                            added = True
+                if added: results.append(entry)  # once per entry
+        else:
+            raise Exception(u"Unhandled index type: %s" % index)
+
+        return results
 
     def create_indices(self, data, desired_indices):
         """Creates desired indices for a set of input data."""
+        # Initialize indices
+        self.j_ind = {}
+        self.n_ind = {}
+
         for i, entry in enumerate(data):
             for index_name in desired_indices:
                 if index_name == "starts_with":
diff --git a/parsers/tests/jmdict.py b/parsers/tests/jmdict.py
index 7dd1687..7eeb343 100644
--- a/parsers/tests/jmdict.py
+++ b/parsers/tests/jmdict.py
@@ -17,14 +17,45 @@ class JMdictTest(unittest.TestCase):
 
     def test_japanese_search(self):
         """JMDICT: Search for Japanese word/phrase"""
-        query = u"日本"
-        l = [entry for entry in self.parser.search(query)]
+        parser = self.parser
+        desired_indices = ["starts_with"]
+
+        data = self._parse_x_entries(SRC_NAME, 10)
+        parser.cache = data
+        parser.create_indices(data, desired_indices)
+
+        query = u"仝"
+        l = parser.search(query)
+        # First query.  This only passes if a matching entry is among
+        # the 10 entries parsed above.
+        # NOTE(review): presumably matched through a kanji element
+        # (keb) -- confirm.  To inspect: print entry.to_string()
+        self.assertTrue(len(l) > 0)
+        # Second query, written in kana; same 10-entry caveat applies.
+        query = u"おなじく"
+        l = parser.search(query)
+        # NOTE(review): presumably matched through a reading element
+        # (reb) -- confirm.  To inspect the hits, print
+        # entry.to_string() for each entry in l.
         self.assertTrue(len(l) > 0)
 
     def test_native_search(self):
         """JMDICT: Search for non-Japanese word/phrase"""
-        query = u"Japan"
-        l = [entry for entry in self.parser.search(query)]
+        # Let's cheat a little: reading in the whole JMdict will make
+        # our unit tests UNBEARABLY slow, so only the first 10 entries
+        # are parsed and handed to the parser as its cache.
+        #
+        # This test will fail if no gloss starting with "repetition
+        # mark" occurs in those 10 entries; update it if that happens.
+        parser = self.parser
+        desired_indices = ["starts_with"]
+
+        data = self._parse_x_entries(SRC_NAME, 10)
+        parser.cache = data
+        parser.create_indices(data, desired_indices)
+
+        query = u"repetition mark"
+        l = parser.search(query)
         self.assertTrue(len(l) > 0)
 
     def test_unparsed(self):
@@ -71,11 +102,12 @@ class JMdictTest(unittest.TestCase):
         print "\n\tFirst query time:  %f" % first_t
         print "\tSecond query time: %f" % second_t
 
-    def _parse_5_entries(self, filename):
-        """Helper function: reads 5 entries from the JMdict file.
+    def _parse_x_entries(self, filename, max_entries):
+        """Helper function: reads max_entries entries from the JMdict file.
 
-        The text for 5 entries of JMdict are naively read in, then
-        converted to a file-like object which the parser will use.
+        The text for max_entries entries of JMdict are naively read
+        in, then converted to a file-like object which the parser will
+        use.
 
         """
         # Copied from parsers.jmdict
@@ -84,8 +116,8 @@ class JMdictTest(unittest.TestCase):
         else:
             f = open(filename, "rb")
 
-        # Grab just the first 5 entries, then close f and make a new
-        # "f" via the StringIO lib.
+        # Grab just the first max_entries entries, then close f and
+        # make a new "f" via the StringIO lib.
         lines = []
         count = 0
         while True:
@@ -93,7 +125,7 @@ class JMdictTest(unittest.TestCase):
             lines.append(line)
             if "</entry>" in line:
                 count += 1
-                if count >= 5: break
+                if count >= max_entries: break
         f.close()
         lines.append("</JMdict>\n")
         f = StringIO("".join(lines))
@@ -116,14 +148,14 @@ class JMdictTest(unittest.TestCase):
 
     def test_limited_parse(self):
         """JMDICT: Parse 5 entries successfully."""
-        data = self._parse_5_entries(SRC_NAME)
+        data = self._parse_x_entries(SRC_NAME, 5)
         self.assertEqual(len(data), 5)
 
     def test_indexing(self):
         parser = self.parser
         desired_indices = ["starts_with"]
 
-        data = self._parse_5_entries(SRC_NAME)
+        data = self._parse_x_entries(SRC_NAME, 5)
 
         print "CREATING INDICES"
         parser.create_indices(data, desired_indices)
@@ -147,13 +179,14 @@ class JMdictTest(unittest.TestCase):
     def tearDown(self):
         self.parser = None
 
-    del test_japanese_search
-    del test_native_search
+    #del test_japanese_search
+    #del test_native_search
     del test_unparsed
     del test_caching
     del test_no_cache
 
     del test_limited_parse
+    del test_indexing
 
 if __name__ == "__main__":
     unittest.main()
-- 
2.11.4.GIT