Basic JMdict searches ("starts_with" index) should now work.
[jben2_gui.git] / parsers / tests / jmdict.py
blob7eeb3432f3963bafa9a83de5e12a6cfdbff99376
1 # -*- coding: utf-8 -*-
3 import unittest, time
4 from cStringIO import StringIO
5 from parsers import jmdict
6 from xml.sax.xmlreader import InputSource
# Location of the JMdict source used by the tests below: the directory
# first, then the full path of the dictionary file inside it.
SRC_DIR = "../dicts"
SRC_NAME = SRC_DIR + "/" + "jmdict"
class JMdictTest(unittest.TestCase):
    """Tests for jmdict.JMdictParser, driven by a truncated JMdict file.

    Reading the whole dictionary would make the tests very slow, so the
    tests parse only the first few entries (see _parse_x_entries) and
    hand the result to the parser directly.

    Only test_japanese_search and test_native_search are currently
    active; the other tests are removed from the class by the ``del``
    statements at the end of the class body.
    """

    def setUp(self):
        """Create a fresh parser for every test."""
        self.parser = jmdict.JMdictParser(SRC_NAME)

    def tearDown(self):
        """Drop the parser created in setUp."""
        self.parser = None

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _prepare_parser(self, entry_count):
        """Load the first entry_count entries into self.parser and index them.

        The parsed data is stored as the parser's cache and the
        "starts_with" index is built from it.  Returns the parser.
        """
        parser = self.parser
        data = self._parse_x_entries(SRC_NAME, entry_count)
        parser.cache = data
        parser.create_indices(data, ["starts_with"])
        return parser

    @staticmethod
    def _read_entry_lines(f, max_entries):
        """Read lines from f until max_entries "</entry>" tags were seen.

        f is any object with a readline() method.  Reading also stops at
        end of file, so a source holding fewer than max_entries entries
        yields everything it contains instead of looping forever.
        max_entries is expected to be a positive count.

        Returns the list of lines read, in order.
        """
        lines = []
        count = 0
        while True:
            line = f.readline()
            if not line:
                # readline() returns an empty string at end of file.
                break
            lines.append(line)
            if "</entry>" in line:
                count += 1
                if count >= max_entries:
                    break
        return lines

    def _parse_x_entries(self, filename, max_entries):
        """Helper function: parse the first max_entries entries of JMdict.

        The raw text for up to max_entries entries is naively read from
        filename (opened through gzip when the name ends in ".gz"), the
        document is closed with a "</JMdict>" tag if the text read does
        not already contain one, and the result is handed to the SAX
        parser as an in-memory file-like object.

        Returns the ``data`` attribute of the JMDSAXHandler after
        parsing.
        """
        # Copied from parsers.jmdict
        if filename.endswith(".gz"):
            # Imported here because the module header does not import gzip.
            import gzip
            f = gzip.open(filename)
        else:
            f = open(filename, "rb")

        # Grab just the first max_entries entries, then close f and
        # make a new "f" via the StringIO lib.
        try:
            lines = self._read_entry_lines(f, max_entries)
        finally:
            f.close()

        # When the read stopped early the root element is still open and
        # must be closed by hand; when the whole file was read, its own
        # closing tag is already present and must not be duplicated.
        if not any("</JMdict>" in line for line in lines):
            lines.append("</JMdict>\n")
        f = StringIO("".join(lines))

        try:
            # Continuing from parsers.jmdict...
            # NOTE(review): the handler arguments are copied verbatim; their
            # meaning is defined by jmdict.JMDSAXHandler -- confirm there.
            sh = jmdict.JMDSAXHandler(True, "beef")
            isource = InputSource()
            isource.setEncoding("utf-8")
            isource.setByteStream(f)

            # Parser: Since I wish to directly handle the "entities", we
            # need to override default behavior and cannot just use
            # xml.sax.parse.
            parser = jmdict.ExpatParserNoEntityExp()
            parser.setContentHandler(sh)

            parser.parse(isource)
        finally:
            f.close()
        return sh.data

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_japanese_search(self):
        """JMDICT: Search for Japanese word/phrase"""
        parser = self._prepare_parser(10)

        results = parser.search(u"仝")
        self.assertTrue(len(results) > 0)

        results = parser.search(u"おなじく")
        self.assertTrue(len(results) > 0)

    def test_native_search(self):
        """JMDICT: Search for non-Japanese word/phrase"""
        # Let's cheat a little: reading in the whole JMdict will make
        # our unit tests UNBEARABLY slow.

        # This test will fail if "repetition mark" does not show up
        # within the first 10 entries.  In such a case, the test will
        # need to be updated.
        parser = self._prepare_parser(10)

        results = parser.search(u"repetition mark")
        self.assertTrue(len(results) > 0)

    def test_unparsed(self):
        """JMDICT: Check for unhandled JMDICT fields"""
        # The result itself is not needed, but it is fully consumed.
        # NOTE(review): presumably this search is what fills
        # parser.cache -- confirm against parsers.jmdict.
        list(self.parser.search(u""))

        unparsed = set()
        for _key, entry in self.parser.cache.iteritems():
            unparsed.update(entry.unparsed)
        if unparsed:
            # Reported as a warning only; the test does not fail.
            print(u"\n\tWARNING: Unhandled fields found: %s"
                  % u", ".join(unparsed))

    def test_caching(self):
        """JMDICT: Check that caching is working"""
        self.assertFalse(self.parser.cache)
        t = time.time()
        self.test_japanese_search()
        first_t = time.time() - t

        self.assertTrue(self.parser.cache)
        t = time.time()
        self.test_japanese_search()
        second_t = time.time() - t

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)
        self.assertTrue(second_t <= first_t)

    def test_no_cache(self):
        """JMDICT: Check that parser works without caching."""
        self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)

        self.assertFalse(self.parser.cache)
        t = time.time()
        self.test_japanese_search()
        first_t = time.time() - t

        self.assertFalse(self.parser.cache)
        t = time.time()
        self.test_japanese_search()
        second_t = time.time() - t

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)

    def test_limited_parse(self):
        """JMDICT: Parse 5 entries successfully."""
        data = self._parse_x_entries(SRC_NAME, 5)
        self.assertEqual(len(data), 5)

    def test_indexing(self):
        """JMDICT: Build the "starts_with" index and dump its contents."""
        parser = self.parser
        desired_indices = ["starts_with"]

        data = self._parse_x_entries(SRC_NAME, 5)

        print("CREATING INDICES")
        parser.create_indices(data, desired_indices)

        # j_ind is walked as: index name -> {key: value}.
        print("j_ind keys: %s" % (parser.j_ind.keys(),))
        for name, index in parser.j_ind.iteritems():
            print("\t%s keys: %s" % (name, str(index.keys())))
            for k, s in index.iteritems():
                print("\t\t%s: %s" % (k, str(s)))

        # n_ind has one more nesting level than j_ind.
        print("n_ind keys: %s" % (parser.n_ind.keys(),))
        for name, index in parser.n_ind.iteritems():
            print("\t%s keys: %s" % (name, str(index.keys())))
            for name2, index2 in index.iteritems():
                print("\t\t%s keys: %s" % (name2, str(index2.keys())))
                for k, s in index2.iteritems():
                    print("\t\t\t%s: %s" % (k, str(s)))

    # Deleting a test method from the class namespace keeps unittest from
    # collecting it.  test_japanese_search and test_native_search are the
    # only tests left enabled.
    del test_unparsed
    del test_caching
    del test_no_cache

    del test_limited_parse
    del test_indexing
if __name__ == "__main__":
    # Executed as a script: hand control to unittest, which runs the
    # test cases defined in this module.
    unittest.main()