jben/parsers/tests/jmdict.py

   1 # -*- coding: utf-8 -*-
   2
import gzip
import time
import unittest
from cStringIO import StringIO
from xml.sax.xmlreader import InputSource

from parsers import jmdict
   7
   8 SRC_NAME = "jmdict"
   9 SRC_DIR = "../dicts"
  10
  11 SRC_NAME = "/".join((SRC_DIR, SRC_NAME))
  12
  13 class JMdictTest(unittest.TestCase):
  14
  15     def setUp(self):
  16         self.parser = jmdict.JMdictParser(SRC_NAME)
  17
  18     def test_japanese_search(self):
  19         """JMDICT: Search for Japanese word/phrase"""
  20         parser = self.parser
  21         desired_indices = ["starts_with"]
  22
  23         data = self._parse_x_entries(SRC_NAME, 10)
  24         parser.cache = data
  25         parser.create_indices(data, desired_indices)
  26
  27         query = u"仝"
  28         l = parser.search(query)
  29         #print
  30         #print "====="
  31         #for entry in l:
  32         #    print entry.to_string()
  33         self.assertTrue(len(l) > 0)
  34         #print "====="
  35         query = u"おなじく"
  36         l = parser.search(query)
  37         #for entry in l:
  38         #    print entry.to_string()
  39         #print "====="
  40         self.assertTrue(len(l) > 0)
  41
  42     def test_native_search(self):
  43         """JMDICT: Search for non-Japanese word/phrase"""
  44         # Let's cheat a little: reading in the whole JMdict will make
  45         # our unit tests UNBEARABLY slow.
  46         #
  47         # This test will fail if "repetition mark" does not show up
  48         # within the first 10 entries.  In such a case, the test will
  49         # need to be updated.
  50         parser = self.parser
  51         desired_indices = ["starts_with"]
  52
  53         data = self._parse_x_entries(SRC_NAME, 10)
  54         parser.cache = data
  55         parser.create_indices(data, desired_indices)
  56
  57         query = u"repetition mark"
  58         l = parser.search(query)
  59         self.assertTrue(len(l) > 0)
  60
  61     def test_unparsed(self):
  62         """JMDICT: Check for unhandled JMDICT fields"""
  63         l = [k for k in self.parser.search(u"")]
  64         unparsed = set()
  65         for key, entry in self.parser.cache.iteritems():
  66             for item in entry.unparsed: unparsed.add(item)
  67         if unparsed:
  68             #self.fail(u"Unhandled fields found: %s" % u", ".join(unparsed))
  69             print u"\n\tWARNING: Unhandled fields found: %s" \
  70                   % u", ".join(unparsed)
  71
  72     def test_caching(self):
  73         """JMDICT: Check that caching is working"""
  74         self.assertFalse(self.parser.cache)
  75         t = time.time()
  76         self.test_japanese_search()
  77         first_t = time.time() - t
  78
  79         self.assertTrue(self.parser.cache)
  80         t = time.time()
  81         self.test_japanese_search()
  82         second_t = time.time() - t
  83
  84         print "\n\tFirst query time:  %f" % first_t
  85         print "\tSecond query time: %f" % second_t
  86         self.assertTrue(second_t <= first_t)
  87
  88     def test_no_cache(self):
  89         """JMDICT: Check that parser works without caching."""
  90         self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)
  91
  92         self.assertFalse(self.parser.cache)
  93         t = time.time()
  94         self.test_japanese_search()
  95         first_t = time.time() - t
  96
  97         self.assertFalse(self.parser.cache)
  98         t = time.time()
  99         self.test_japanese_search()
 100         second_t = time.time() - t
 101
 102         print "\n\tFirst query time:  %f" % first_t
 103         print "\tSecond query time: %f" % second_t
 104
 105     def _parse_x_entries(self, filename, max_entries):
 106         """Helper function: reads max_entries entries from the JMdict file.
 107
 108         The text for max_entries entries of JMdict are naively read
 109         in, then converted to a file-like object which the parser will
 110         use.
 111
 112         """
 113         # Copied from parsers.jmdict
 114         if len(filename) >= 3 and filename[-3:] == ".gz":
 115             f = gzip.open(filename)
 116         else:
 117             f = open(filename, "rb")
 118
 119         # Grab just the first max_entries entries, then close f and
 120         # make a new "f" via the StringIO lib.
 121         lines = []
 122         count = 0
 123         while True:
 124             line = f.readline()
 125             lines.append(line)
 126             if "</entry>" in line:
 127                 count += 1
 128                 if count >= max_entries: break
 129         f.close()
 130         lines.append("</JMdict>\n")
 131         f = StringIO("".join(lines))
 132
 133         # Continuing from parsers.jmdict...
 134         sh = jmdict.JMDSAXHandler(True, "beef")
 135         isource = InputSource()
 136         isource.setEncoding("utf-8")
 137         isource.setByteStream(f)
 138
 139         # Parser: Since I wish to directly handle the "entities", we
 140         # need to override default behavior and cannot just use
 141         # xml.sax.parse.
 142         parser = jmdict.ExpatParserNoEntityExp()
 143         parser.setContentHandler(sh)
 144
 145         parser.parse(isource)
 146         f.close()
 147         return sh.data
 148
 149     def test_limited_parse(self):
 150         """JMDICT: Parse 5 entries successfully."""
 151         data = self._parse_x_entries(SRC_NAME, 5)
 152         self.assertEqual(len(data), 5)
 153
 154     def test_indexing(self):
 155         parser = self.parser
 156         desired_indices = ["starts_with"]
 157
 158         data = self._parse_x_entries(SRC_NAME, 5)
 159
 160         print "CREATING INDICES"
 161         parser.create_indices(data, desired_indices)
 162
 163         print "j_ind keys:", parser.j_ind.keys()
 164         for name, index in parser.j_ind.iteritems():
 165             #print "\t%s: %s" % (name, str(index))
 166             print "\t%s keys: %s" % (name, str(index.keys()))
 167             for k, s in index.iteritems():
 168                 print "\t\t%s: %s" % (k, str(s))
 169         print "n_ind keys:", parser.n_ind.keys()
 170         for name, index in parser.n_ind.iteritems():
 171             print "\t%s keys: %s" % (name, str(index.keys()))
 172             for name2, index2 in index.iteritems():
 173                 #print "\t\t%s: %s" % (name2, str(index2))
 174                 print "\t\t%s keys: %s" % (name2, str(index2.keys()))
 175                 for k, s in index2.iteritems():
 176                     print "\t\t\t%s: %s" % (k, str(s))
 177
 178
 179     def tearDown(self):
 180         self.parser = None
 181
 182     #del test_japanese_search
 183     #del test_native_search
 184     del test_unparsed
 185     del test_caching
 186     del test_no_cache
 187
 188     del test_limited_parse
 189     del test_indexing
 190
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()