1 # -*- coding: utf-8 -*-
4 from cStringIO
import StringIO
5 from parsers
import jmdict
6 from xml
.sax
.xmlreader
import InputSource
# Prefix SRC_NAME with SRC_DIR, using "/" as the separator.  From this
# point on SRC_NAME holds the combined path rather than the original value.
SRC_NAME = SRC_DIR + "/" + SRC_NAME
13 class JMdictTest(unittest
.TestCase
):
16 self
.parser
= jmdict
.JMdictParser(SRC_NAME
)
18 def test_japanese_search(self
):
19 """JMDICT: Search for Japanese word/phrase"""
21 desired_indices
= ["starts_with"]
23 data
= self
._parse
_x
_entries
(SRC_NAME
, 10)
25 parser
.create_indices(data
, desired_indices
)
28 l
= parser
.search(query
)
32 # print entry.to_string()
33 self
.assertTrue(len(l
) > 0)
36 l
= parser
.search(query
)
38 # print entry.to_string()
40 self
.assertTrue(len(l
) > 0)
def test_native_search(self):
    """JMDICT: Search for non-Japanese word/phrase"""
    # Let's cheat a little: reading in the whole JMdict would make
    # our unit tests UNBEARABLY slow, so only the first few entries
    # are parsed and indexed.
    #
    # This test will fail if "repetition mark" does not show up
    # within the first 10 entries.  In such a case, raise the entry
    # limit below or pick a query that does occur in that range.
    desired_indices = ["starts_with"]
    entry_limit = 10

    data = self._parse_x_entries(SRC_NAME, entry_limit)

    # Index and query through the test case's own parser instance;
    # without this binding the calls below raise NameError.
    parser = self.parser
    parser.create_indices(data, desired_indices)

    query = u"repetition mark"
    results = parser.search(query)

    # At least one entry must match the query.
    self.assertTrue(len(results) > 0)
61 def test_unparsed(self
):
62 """JMDICT: Check for unhandled JMDICT fields"""
63 l
= [k
for k
in self
.parser
.search(u
"")]
65 for key
, entry
in self
.parser
.cache
.iteritems():
66 for item
in entry
.unparsed
: unparsed
.add(item
)
68 #self.fail(u"Unhandled fields found: %s" % u", ".join(unparsed))
69 print u
"\n\tWARNING: Unhandled fields found: %s" \
70 % u
", ".join(unparsed
)
72 def test_caching(self
):
73 """JMDICT: Check that caching is working"""
74 self
.assertFalse(self
.parser
.cache
)
76 self
.test_japanese_search()
77 first_t
= time
.time() - t
79 self
.assertTrue(self
.parser
.cache
)
81 self
.test_japanese_search()
82 second_t
= time
.time() - t
84 print "\n\tFirst query time: %f" % first_t
85 print "\tSecond query time: %f" % second_t
86 self
.assertTrue(second_t
<= first_t
)
def test_no_cache(self):
    """JMDICT: Check that parser works without caching."""
    # Replace the test case's parser with one created with caching off.
    self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)

    # Nothing may be cached before the first query...
    self.assertFalse(self.parser.cache)

    # Take the start timestamp right before each search so the elapsed
    # time covers only that call.
    start = time.time()
    self.test_japanese_search()
    first_t = time.time() - start

    # ...and the cache must still be empty after it.
    self.assertFalse(self.parser.cache)

    start = time.time()
    self.test_japanese_search()
    second_t = time.time() - start

    # Timings are reported for information only; no ordering between
    # the two runs is asserted here.  Single-argument parenthesised
    # print behaves the same as the print statement on Python 2.
    print("\n\tFirst query time: %f" % first_t)
    print("\tSecond query time: %f" % second_t)
105 def _parse_x_entries(self
, filename
, max_entries
):
106 """Helper function: reads max_entries entries from the JMdict file.
108 The text for max_entries entries of JMdict are naively read
109 in, then converted to a file-like object which the parser will
113 # Copied from parsers.jmdict
114 if len(filename
) >= 3 and filename
[-3:] == ".gz":
115 f
= gzip
.open(filename
)
117 f
= open(filename
, "rb")
119 # Grab just the first max_entries entries, then close f and
120 # make a new "f" via the StringIO lib.
126 if "</entry>" in line
:
128 if count
>= max_entries
: break
130 lines
.append("</JMdict>\n")
131 f
= StringIO("".join(lines
))
133 # Continuing from parsers.jmdict...
134 sh
= jmdict
.JMDSAXHandler(True, "beef")
135 isource
= InputSource()
136 isource
.setEncoding("utf-8")
137 isource
.setByteStream(f
)
139 # Parser: Since I wish to directly handle the "entities", we
140 # need to override default behavior and cannot just use
142 parser
= jmdict
.ExpatParserNoEntityExp()
143 parser
.setContentHandler(sh
)
145 parser
.parse(isource
)
def test_limited_parse(self):
    """JMDICT: Parse 5 entries successfully."""
    # Ask the helper for a fixed number of entries and check that
    # exactly that many come back.
    wanted = 5
    parsed = self._parse_x_entries(SRC_NAME, wanted)
    self.assertEqual(len(parsed), wanted)
def test_indexing(self):
    """JMDICT: Build indices for 5 entries and print them."""
    desired_indices = ["starts_with"]

    data = self._parse_x_entries(SRC_NAME, 5)

    # Index through the test case's own parser instance; without this
    # binding every use of `parser` below raises NameError.
    parser = self.parser

    print("CREATING INDICES")
    parser.create_indices(data, desired_indices)

    # j_ind is walked as two levels: index name -> {key: value}.
    # Formatting the key list with %s gives the same text as the
    # two-argument print statement while staying a single expression.
    print("j_ind keys: %s" % (parser.j_ind.keys(),))
    for name, index in parser.j_ind.iteritems():
        print("\t%s keys: %s" % (name, str(index.keys())))
        for k, s in index.iteritems():
            print("\t\t%s: %s" % (k, str(s)))

    # n_ind is walked one level deeper: name -> name2 -> {key: value}.
    print("n_ind keys: %s" % (parser.n_ind.keys(),))
    for name, index in parser.n_ind.iteritems():
        print("\t%s keys: %s" % (name, str(index.keys())))
        for name2, index2 in index.iteritems():
            print("\t\t%s keys: %s" % (name2, str(index2.keys())))
            for k, s in index2.iteritems():
                print("\t\t\t%s: %s" % (k, str(s)))
182 #del test_japanese_search
183 #del test_native_search
188 del test_limited_parse
191 if __name__
== "__main__":