1 # -*- coding: utf-8 -*-
from __future__ import absolute_import

import gzip
import time
import unittest
from cStringIO import StringIO
from xml.sax.xmlreader import InputSource

from jbparse import jmdict
# Directory holding the dictionary files (a relative path).
SRC_DIR = "../../dicts"
# Rebind SRC_NAME to the dictionary file's path underneath SRC_DIR.
# NOTE(review): SRC_NAME is read here before any assignment visible in this
# view; the bare file name is presumably assigned just above -- confirm.
SRC_NAME = "/".join((SRC_DIR, SRC_NAME))
class JMdictTest(unittest.TestCase):
    """Tests for jmdict.JMdictParser, run against the dictionary at SRC_NAME.

    Most tests parse only the first few entries of the dictionary (see
    _parse_x_entries) and index those, because reading the whole file makes
    the suite far too slow.
    """

    # NOTE(review): the original query literals used by test_japanese_search
    # were not recoverable from this view of the file.  These are stand-ins:
    # U+30FD (katakana iteration mark) and U+3005 (ideographic iteration
    # mark), presumably headwords of the "repetition mark" entries that
    # test_native_search expects among the first 10 entries -- confirm
    # against the dictionary before relying on them.
    _JAPANESE_QUERIES = (u"\u30fd", u"\u3005")

    def setUp(self):
        """Create a fresh parser for every test.

        test_caching asserts that the parser's cache is empty when it starts,
        so the parser must not be shared between tests.
        """
        self.parser = jmdict.JMdictParser(SRC_NAME)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _index_first_entries(self, max_entries):
        """Parse the first max_entries entries and index them on self.parser.

        Returns self.parser so callers can search it directly.
        """
        desired_indices = ["starts_with"]
        data = self._parse_x_entries(SRC_NAME, max_entries)
        self.parser.create_indices(data, desired_indices)
        return self.parser

    def _time_japanese_search(self):
        """Run test_japanese_search once and return its wall-clock duration."""
        started = time.time()
        self.test_japanese_search()
        return time.time() - started

    @staticmethod
    def _read_entry_lines(filename, max_entries):
        """Return the raw lines of filename up to the max_entries-th entry.

        Files whose name ends in ".gz" are read through gzip, anything else
        as a plain binary file.  Reading stops after the line containing the
        max_entries-th "</entry>" tag; a closing "</JMdict>" tag is then
        appended so that the truncated text is still a complete document.
        """
        # Copied from parsers.jmdict
        if filename.endswith(".gz"):
            f = gzip.open(filename)
        else:
            f = open(filename, "rb")

        lines = []
        count = 0
        try:
            for line in f:
                lines.append(line)
                if "</entry>" in line:
                    count += 1
                    if count >= max_entries:
                        break
        finally:
            # Always release the handle, even if reading fails part-way.
            f.close()

        lines.append("</JMdict>\n")
        return lines

    def _parse_x_entries(self, filename, max_entries):
        """Helper function: reads max_entries entries from the JMdict file.

        The text for max_entries entries of JMdict is naively read in, then
        converted to a file-like object which the parser consumes in place of
        the real file.

        Returns the parsed entries collected by the SAX handler.
        """
        # Grab just the first max_entries entries, then wrap them in a new
        # file-like object via StringIO.
        f = StringIO("".join(self._read_entry_lines(filename, max_entries)))

        # Continuing from parsers.jmdict...
        # The handler arguments are passed through exactly as in the original.
        sh = jmdict.JMDSAXHandler(True, "beef")
        isource = InputSource()
        isource.setEncoding("utf-8")
        isource.setByteStream(f)

        # Parser: the XML "entities" must be handled directly, so the default
        # entity-expanding behaviour is overridden by using jmdict's own
        # parser class instead of the stock one.
        parser = jmdict.ExpatParserNoEntityExp()
        parser.setContentHandler(sh)
        parser.parse(isource)

        # NOTE(review): the original return statement was not visible; callers
        # take len() of the result and pass it to create_indices().  The
        # handler is assumed to expose its entries as `data` -- confirm.
        return sh.data

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_japanese_search(self):
        """JMDICT: Search for Japanese word/phrase"""
        parser = self._index_first_entries(10)
        for query in self._JAPANESE_QUERIES:
            l = parser.search(query)
            self.assertTrue(len(l) > 0)

    def test_native_search(self):
        """JMDICT: Search for non-Japanese word/phrase"""
        # Let's cheat a little: reading in the whole JMdict will make
        # our unit tests UNBEARABLY slow.
        #
        # This test will fail if "repetition mark" does not show up
        # within the first 10 entries.  In such a case, the query (or the
        # number of entries read) needs to be adjusted.
        parser = self._index_first_entries(10)

        query = u"repetition mark"
        l = parser.search(query)
        self.assertTrue(len(l) > 0)

    def test_unparsed(self):
        """JMDICT: Check for unhandled JMDICT fields"""
        # Exhaust a search for the empty string; the results themselves are
        # not needed, only the entries the parser leaves in its cache.
        for _entry in self.parser.search(u""):
            pass

        unparsed = set()
        for _key, entry in self.parser.cache.iteritems():
            unparsed.update(entry.unparsed)

        # Unhandled fields only produce a warning, not a failure.
        #self.fail(u"Unhandled fields found: %s" % u", ".join(unparsed))
        if unparsed:
            print(u"\n\tWARNING: Unhandled fields found: %s"
                  % u", ".join(unparsed))

    def test_caching(self):
        """JMDICT: Check that caching is working"""
        self.assertFalse(self.parser.cache)

        first_t = self._time_japanese_search()

        self.assertTrue(self.parser.cache)

        second_t = self._time_japanese_search()

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)
        self.assertTrue(second_t <= first_t)

    def test_no_cache(self):
        """JMDICT: Check that parser works without caching."""
        self.parser = jmdict.JMdictParser(SRC_NAME, use_cache=False)

        self.assertFalse(self.parser.cache)

        first_t = self._time_japanese_search()

        self.assertFalse(self.parser.cache)

        second_t = self._time_japanese_search()

        print("\n\tFirst query time: %f" % first_t)
        print("\tSecond query time: %f" % second_t)

    def test_limited_parse(self):
        """JMDICT: Parse 5 entries successfully."""
        data = self._parse_x_entries(SRC_NAME, 5)
        self.assertEqual(len(data), 5)

    def test_indexing(self):
        """JMDICT: Build indices for 5 entries and print their layout."""
        desired_indices = ["starts_with"]

        data = self._parse_x_entries(SRC_NAME, 5)
        parser = self.parser

        # Each print takes a single parenthesised argument, so the output is
        # the same as the equivalent Python 2 print statement.
        print("CREATING INDICES")
        parser.create_indices(data, desired_indices)

        print("j_ind keys: %s" % (parser.j_ind.keys(),))
        for name, index in parser.j_ind.iteritems():
            #print "\t%s: %s" % (name, str(index))
            print("\t%s keys: %s" % (name, str(index.keys())))
            for k, s in index.iteritems():
                print("\t\t%s: %s" % (k, str(s)))

        # n_ind is nested one level deeper than j_ind.
        print("n_ind keys: %s" % (parser.n_ind.keys(),))
        for name, index in parser.n_ind.iteritems():
            print("\t%s keys: %s" % (name, str(index.keys())))
            for name2, index2 in index.iteritems():
                #print "\t\t%s: %s" % (name2, str(index2))
                print("\t\t%s keys: %s" % (name2, str(index2.keys())))
                for k, s in index2.iteritems():
                    print("\t\t\t%s: %s" % (k, str(s)))

    # Individual tests are switched off by deleting them from the class
    # namespace, so unittest never collects them.
    #del test_japanese_search
    #del test_native_search
    del test_limited_parse
193 if __name__
== "__main__":