jbparse/test/jmdict.py

   1 # -*- coding: utf-8 -*-
   2
   3 from __future__ import absolute_import
   4
   5 import unittest, time
   6 from cStringIO import StringIO
   7 from jbparse import jmdict
   8 from xml.sax.xmlreader import InputSource
   9
  10 SRC_NAME = "jmdict"
  11 SRC_DIR = "../../dicts"
  12
  13 SRC_NAME = "/".join((SRC_DIR, SRC_NAME))
  14
  15 class JMdictTest(unittest.TestCase):
  16
  17     def setUp(self):
  18         self.parser = jmdict.JMdictParser(SRC_NAME)
  19
  20     def test_japanese_search(self):
  21         """JMDICT: Search for Japanese word/phrase"""
  22         parser = self.parser
  23         desired_indices = ["starts_with"]
  24
  25         data = self._parse_x_entries(SRC_NAME, 10)
  26         parser.cache = data
  27         parser.create_indices(data, desired_indices)
  28
  29         query = u"仝"
  30         l = parser.search(query)
  31         #print
  32         #print "====="
  33         #for entry in l:
  34         #    print entry.to_string()
  35         self.assertTrue(len(l) > 0)
  36         #print "====="
  37         query = u"おなじく"
  38         l = parser.search(query)
  39         #for entry in l:
  40         #    print entry.to_string()
  41         #print "====="
  42         self.assertTrue(len(l) > 0)
  43
  44     def test_native_search(self):
  45         """JMDICT: Search for non-Japanese word/phrase"""
  46         # Let's cheat a little: reading in the whole JMdict will make
  47         # our unit tests UNBEARABLY slow.
  48         #
  49         # This test will fail if "repetition mark" does not show up
  50         # within the first 10 entries.  In such a case, the test will
  51         # need to be updated.
  52         parser = self.parser
  53         desired_indices = ["starts_with"]
  54
  55         data = self._parse_x_entries(SRC_NAME, 10)
  56         parser.cache = data
  57         parser.create_indices(data, desired_indices)
  58
  59         query = u"repetition mark"
  60         l = parser.search(query)
  61         self.assertTrue(len(l) > 0)
  62
  63     def test_unparsed(self):
  64         """JMDICT: Check for unhandled JMDICT fields"""
  65         l = [k for k in self.parser.search(u"")]
  66         unparsed = set()
  67         for key, entry in self.parser.cache.iteritems():
  68             for item in entry.unparsed: unparsed.add(item)
  69         if unparsed:
  70             #self.fail(u"Unhandled fields found: %s" % u", ".join(unparsed))
  71             print u"\n\tWARNING: Unhandled fields found: %s" \
  72                   % u", ".join(unparsed)
  73
  74     def test_caching(self):
  75         """JMDICT: Check that caching is working"""
  76         self.assertFalse(self.parser.cache)
  77         t = time.time()
  78         self.test_japanese_search()
  79         first_t = time.time() - t
  80
  81         self.assertTrue(self.parser.cache)
  82         t = time.time()
  83         self.test_japanese_search()
  84         second_t = time.time() - t
  85
  86         print "\n\tFirst query time:  %f" % first_t
  87         print "\tSecond query time: %f" % second_t
  88         self.assertTrue(second_t <= first_t)
  89
  90     def test_no_cache(self):
  91         """JMDICT: Check that parser works without caching."""
  92         self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)
  93
  94         self.assertFalse(self.parser.cache)
  95         t = time.time()
  96         self.test_japanese_search()
  97         first_t = time.time() - t
  98
  99         self.assertFalse(self.parser.cache)
 100         t = time.time()
 101         self.test_japanese_search()
 102         second_t = time.time() - t
 103
 104         print "\n\tFirst query time:  %f" % first_t
 105         print "\tSecond query time: %f" % second_t
 106
 107     def _parse_x_entries(self, filename, max_entries):
 108         """Helper function: reads max_entries entries from the JMdict file.
 109
 110         The text for max_entries entries of JMdict are naively read
 111         in, then converted to a file-like object which the parser will
 112         use.
 113
 114         """
 115         # Copied from parsers.jmdict
 116         if len(filename) >= 3 and filename[-3:] == ".gz":
 117             f = gzip.open(filename)
 118         else:
 119             f = open(filename, "rb")
 120
 121         # Grab just the first max_entries entries, then close f and
 122         # make a new "f" via the StringIO lib.
 123         lines = []
 124         count = 0
 125         while True:
 126             line = f.readline()
 127             lines.append(line)
 128             if "</entry>" in line:
 129                 count += 1
 130                 if count >= max_entries: break
 131         f.close()
 132         lines.append("</JMdict>\n")
 133         f = StringIO("".join(lines))
 134
 135         # Continuing from parsers.jmdict...
 136         sh = jmdict.JMDSAXHandler(True, "beef")
 137         isource = InputSource()
 138         isource.setEncoding("utf-8")
 139         isource.setByteStream(f)
 140
 141         # Parser: Since I wish to directly handle the "entities", we
 142         # need to override default behavior and cannot just use
 143         # xml.sax.parse.
 144         parser = jmdict.ExpatParserNoEntityExp()
 145         parser.setContentHandler(sh)
 146
 147         parser.parse(isource)
 148         f.close()
 149         return sh.data
 150
 151     def test_limited_parse(self):
 152         """JMDICT: Parse 5 entries successfully."""
 153         data = self._parse_x_entries(SRC_NAME, 5)
 154         self.assertEqual(len(data), 5)
 155
 156     def test_indexing(self):
 157         parser = self.parser
 158         desired_indices = ["starts_with"]
 159
 160         data = self._parse_x_entries(SRC_NAME, 5)
 161
 162         print "CREATING INDICES"
 163         parser.create_indices(data, desired_indices)
 164
 165         print "j_ind keys:", parser.j_ind.keys()
 166         for name, index in parser.j_ind.iteritems():
 167             #print "\t%s: %s" % (name, str(index))
 168             print "\t%s keys: %s" % (name, str(index.keys()))
 169             for k, s in index.iteritems():
 170                 print "\t\t%s: %s" % (k, str(s))
 171         print "n_ind keys:", parser.n_ind.keys()
 172         for name, index in parser.n_ind.iteritems():
 173             print "\t%s keys: %s" % (name, str(index.keys()))
 174             for name2, index2 in index.iteritems():
 175                 #print "\t\t%s: %s" % (name2, str(index2))
 176                 print "\t\t%s keys: %s" % (name2, str(index2.keys()))
 177                 for k, s in index2.iteritems():
 178                     print "\t\t\t%s: %s" % (k, str(s))
 179
 180
 181     def tearDown(self):
 182         self.parser = None
 183
 184     #del test_japanese_search
 185     #del test_native_search
 186     del test_unparsed
 187     del test_caching
 188     del test_no_cache
 189
 190     del test_limited_parse
 191     del test_indexing
 192
# Run the test cases in this module when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()