Added MANIFEST.in files, updated setup.py files.
[jben2_gui.git] / jbparse / test / jmdict.py
blob3d20d837acd60a4dc9d30d176b6a706a924ab7bc
1 # -*- coding: utf-8 -*-
3 from __future__ import absolute_import
5 import unittest, time
6 from cStringIO import StringIO
7 from jbparse import jmdict
8 from xml.sax.xmlreader import InputSource
# Location of the JMdict data file, relative to the directory the tests
# are run from.  SRC_NAME ends up holding the full relative path.
SRC_DIR = "../../dicts"
SRC_NAME = "%s/%s" % (SRC_DIR, "jmdict")
class JMdictTest(unittest.TestCase):
    """Tests for jmdict.JMdictParser, run against a slice of the dictionary.

    Reading the whole JMdict file would make the suite unbearably slow, so
    the search and indexing tests parse only the first few entries (see
    _parse_x_entries) and hand that data to the parser directly.
    """

    def setUp(self):
        # A fresh parser for every test, pointed at the dictionary file.
        self.parser = jmdict.JMdictParser(SRC_NAME)

    def tearDown(self):
        self.parser = None

    def _index_first_entries(self, max_entries):
        """Parse the first max_entries entries and load them into self.parser.

        The parsed data is stored as the parser's cache and a "starts_with"
        index is built over it.  Returns the parser for convenience.
        """
        parser = self.parser
        desired_indices = ["starts_with"]

        data = self._parse_x_entries(SRC_NAME, max_entries)
        parser.cache = data
        parser.create_indices(data, desired_indices)
        return parser

    def _time_japanese_search(self):
        """Run test_japanese_search once; return the elapsed time in seconds."""
        started = time.time()
        self.test_japanese_search()
        return time.time() - started

    def test_japanese_search(self):
        """JMDICT: Search for Japanese word/phrase"""
        parser = self._index_first_entries(10)

        # Both queries are expected to match something within the first
        # 10 entries of the dictionary.
        for query in (u"仝", u"おなじく"):
            results = parser.search(query)
            self.assertTrue(len(results) > 0)

    def test_native_search(self):
        """JMDICT: Search for non-Japanese word/phrase"""
        # Let's cheat a little: reading in the whole JMdict will make
        # our unit tests UNBEARABLY slow.

        # This test will fail if "repetition mark" does not show up
        # within the first 10 entries.  In such a case, the test will
        # need to be updated.
        parser = self._index_first_entries(10)

        results = parser.search(u"repetition mark")
        self.assertTrue(len(results) > 0)

    def test_unparsed(self):
        """JMDICT: Check for unhandled JMDICT fields"""
        # Exhaust an empty-string search first; the result itself is not
        # used.  NOTE(review): presumably this is what populates
        # parser.cache with every entry -- confirm against JMdictParser.
        list(self.parser.search(u""))

        unparsed = set()
        for key, entry in self.parser.cache.iteritems():
            unparsed.update(entry.unparsed)
        if unparsed:
            # Deliberately only a warning, not a failure.
            print(u"\n\tWARNING: Unhandled fields found: %s"
                  % u", ".join(unparsed))

    def test_caching(self):
        """JMDICT: Check that caching is working"""
        self.assertFalse(self.parser.cache)
        first_t = self._time_japanese_search()

        self.assertTrue(self.parser.cache)
        second_t = self._time_japanese_search()

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)
        self.assertTrue(second_t <= first_t)

    def test_no_cache(self):
        """JMDICT: Check that parser works without caching."""
        self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)

        self.assertFalse(self.parser.cache)
        first_t = self._time_japanese_search()

        self.assertFalse(self.parser.cache)
        second_t = self._time_japanese_search()

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)

    def _read_x_entries(self, filename, max_entries):
        """Return the raw text of the first max_entries entries of filename.

        filename is opened with gzip if it ends in ".gz", otherwise as a
        plain binary file.  Lines are collected up to and including the
        line holding the max_entries-th "</entry>" tag, and a closing
        "</JMdict>" line is appended so the truncated text is still a
        complete document.  If the file runs out before max_entries
        entries were seen, its content is returned unchanged.
        """
        # Copied from parsers.jmdict
        if filename.endswith(".gz"):
            # Imported here because this module does not import gzip at
            # file level.
            import gzip
            f = gzip.open(filename)
        else:
            f = open(filename, "rb")

        lines = []
        count = 0
        truncated = False
        try:
            while True:
                line = f.readline()
                if not line:
                    # EOF: readline() keeps returning an empty string, so
                    # without this check the loop would never terminate.
                    break
                lines.append(line)
                if "</entry>" in line:
                    count += 1
                    if count >= max_entries:
                        truncated = True
                        break
        finally:
            f.close()

        if truncated:
            # We stopped before the file's own closing tag; supply one.
            lines.append("</JMdict>\n")
        return "".join(lines)

    def _parse_x_entries(self, filename, max_entries):
        """Helper function: reads max_entries entries from the JMdict file.

        The text for max_entries entries of JMdict are naively read
        in, then converted to a file-like object which the parser will
        use.

        Returns the ``data`` attribute of the SAX handler after parsing.
        """
        # Grab just the first max_entries entries and make a new
        # file-like object from them via the StringIO lib.
        f = StringIO(self._read_x_entries(filename, max_entries))
        try:
            # Continuing from parsers.jmdict...
            # NOTE(review): the meaning of the (True, "beef") arguments is
            # defined by jmdict.JMDSAXHandler -- confirm there.
            sh = jmdict.JMDSAXHandler(True, "beef")
            isource = InputSource()
            isource.setEncoding("utf-8")
            isource.setByteStream(f)

            # Parser: Since I wish to directly handle the "entities", we
            # need to override default behavior and cannot just use
            # xml.sax.parse.
            parser = jmdict.ExpatParserNoEntityExp()
            parser.setContentHandler(sh)

            parser.parse(isource)
        finally:
            f.close()
        return sh.data

    def test_limited_parse(self):
        """JMDICT: Parse 5 entries successfully."""
        data = self._parse_x_entries(SRC_NAME, 5)
        self.assertEqual(len(data), 5)

    def test_indexing(self):
        # Debugging aid: builds the indices over 5 entries and dumps their
        # structure to stdout.  It makes no assertions.
        parser = self.parser
        desired_indices = ["starts_with"]

        data = self._parse_x_entries(SRC_NAME, 5)

        print("CREATING INDICES")
        parser.create_indices(data, desired_indices)

        # j_ind is dumped as two levels: index name -> key -> value.
        print("j_ind keys: %s" % (parser.j_ind.keys(),))
        for name, index in parser.j_ind.iteritems():
            print("\t%s keys: %s" % (name, str(index.keys())))
            for k, s in index.iteritems():
                print("\t\t%s: %s" % (k, str(s)))

        # n_ind has one more level of nesting than j_ind.
        print("n_ind keys: %s" % (parser.n_ind.keys(),))
        for name, index in parser.n_ind.iteritems():
            print("\t%s keys: %s" % (name, str(index.keys())))
            for name2, index2 in index.iteritems():
                print("\t\t%s keys: %s" % (name2, str(index2.keys())))
                for k, s in index2.iteritems():
                    print("\t\t\t%s: %s" % (k, str(s)))

    # Tests are switched off by deleting them from the class namespace
    # before unittest gets to collect them.  Comment out a line to
    # re-enable the corresponding test.
    #del test_japanese_search
    #del test_native_search
    del test_unparsed
    del test_caching
    del test_no_cache

    del test_limited_parse
    del test_indexing
if __name__ == "__main__":
    # Executed as a script: discover and run the test cases in this module.
    unittest.main()