move sections
[python/dscho.git] / Lib / test / test_unicodedata.py
blob4904f70b3a87983195d990e7f64e5601d4911fdf
1 """ Test script for the unicodedata module.
3 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7 """
9 import sys
10 import unittest
11 import hashlib
12 import subprocess
13 import test.test_support
15 encoding = 'utf-8'
18 ### Run tests
class UnicodeMethodsTest(unittest.TestCase):
    """Checksum-based regression test for the unicode string methods."""

    # update this, if the database changes
    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'

    def test_method_checksum(self):
        # Each entry pairs a predicate method name with the suffix that is
        # appended to the character for the multi-character variant.  The
        # order of the entries is part of the checksum and must not change.
        predicates = (
            ('isalnum', u'abc'),
            ('isalpha', u'abc'),
            ('isdecimal', u'123'),
            ('isdigit', u'123'),
            ('islower', u'abc'),
            ('isnumeric', u'123'),
            ('isspace', u' \t'),
            ('istitle', u'abc'),
            ('isupper', u'ABC'),
        )

        digest = hashlib.sha1()
        for codepoint in range(0x10000):
            char = unichr(codepoint)
            fields = []

            # Predicates (single char); the boolean result indexes u"01".
            fields.extend(
                u"01"[getattr(char, name)()]
                for name, _ in predicates
            )

            # Predicates (multiple chars)
            fields.extend(
                u"01"[getattr(char + suffix, name)()]
                for name, suffix in predicates
            )

            # Mappings (single char)
            fields.append(char.lower())
            fields.append(char.upper())
            fields.append(char.title())

            # Mappings (multiple chars)
            lower_tail = char + u'abc'
            upper_tail = char + u'ABC'
            fields.append(lower_tail.lower())
            fields.append(upper_tail.upper())
            fields.append(lower_tail.title())
            fields.append(upper_tail.title())

            digest.update(u''.join(fields).encode(encoding))

        self.assertEqual(digest.hexdigest(), self.expectedchecksum)
class UnicodeDatabaseTest(unittest.TestCase):
    """Base fixture that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # The import happens here rather than at module level: if
        # unicodedata is not available this raises an ImportError, but the
        # other test cases will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the reference installed by setUp().
        delattr(self, 'db')
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the property lookup functions of the unicodedata module.

    Every test reaches the module through ``self.db``, which the base
    class installs in setUp().
    """

    # update this, if the database changes
    expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'

    def test_function_checksum(self):
        # Hash the properties of every code point below 0x10000 and compare
        # the digest with the recorded value.
        h = hashlib.sha1()

        for i in range(0x10000):
            char = unichr(i)
            data = [
                # Properties
                str(self.db.digit(char, -1)),
                str(self.db.numeric(char, -1)),
                str(self.db.decimal(char, -1)),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
            ]
            h.update(''.join(data))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    def test_digit(self):
        self.assertEqual(self.db.digit(u'A', None), None)
        self.assertEqual(self.db.digit(u'9'), 9)
        self.assertEqual(self.db.digit(u'\u215b', None), None)
        self.assertEqual(self.db.digit(u'\u2468'), 9)
        self.assertEqual(self.db.digit(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, u'xx')
        self.assertRaises(ValueError, self.db.digit, u'x')

    def test_numeric(self):
        self.assertEqual(self.db.numeric(u'A',None), None)
        self.assertEqual(self.db.numeric(u'9'), 9)
        self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
        self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
        self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
        self.assertEqual(self.db.numeric(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, u'xx')
        self.assertRaises(ValueError, self.db.numeric, u'x')

    def test_decimal(self):
        self.assertEqual(self.db.decimal(u'A',None), None)
        self.assertEqual(self.db.decimal(u'9'), 9)
        self.assertEqual(self.db.decimal(u'\u215b', None), None)
        self.assertEqual(self.db.decimal(u'\u2468', None), None)
        self.assertEqual(self.db.decimal(u'\U00020000', None), None)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, u'xx')
        self.assertRaises(ValueError, self.db.decimal, u'x')

    def test_category(self):
        self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
        self.assertEqual(self.db.category(u'a'), 'Ll')
        self.assertEqual(self.db.category(u'A'), 'Lu')
        self.assertEqual(self.db.category(u'\U00020000'), 'Lo')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, u'xx')

    def test_bidirectional(self):
        self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(u' '), 'WS')
        self.assertEqual(self.db.bidirectional(u'A'), 'L')
        self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, u'xx')

    def test_decomposition(self):
        self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
        self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, u'xx')

    def test_mirrored(self):
        self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
        self.assertEqual(self.db.mirrored(u'a'), 0)
        self.assertEqual(self.db.mirrored(u'\u2201'), 1)
        self.assertEqual(self.db.mirrored(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, u'xx')

    def test_combining(self):
        self.assertEqual(self.db.combining(u'\uFFFE'), 0)
        self.assertEqual(self.db.combining(u'a'), 0)
        self.assertEqual(self.db.combining(u'\u20e1'), 230)
        self.assertEqual(self.db.combining(u'\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, u'xx')

    def test_normalize(self):
        self.assertRaises(TypeError, self.db.normalize)
        self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
        self.assertEqual(self.db.normalize('NFKC', u''), u'')
        # The rest can be found in test_normalization.py
        # which requires an external file.

    def test_pr29(self):
        # http://www.unicode.org/review/pr-29.html
        # NFC normalization must leave these sequences unchanged.
        for text in (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161"):
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_east_asian_width(self):
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, 'a')
        self.assertRaises(TypeError, eaw, u'')
        self.assertRaises(TypeError, eaw, u'ra')
        self.assertEqual(eaw(u'\x1e'), 'N')
        self.assertEqual(eaw(u'\x20'), 'Na')
        self.assertEqual(eaw(u'\uC894'), 'W')
        self.assertEqual(eaw(u'\uFF66'), 'H')
        self.assertEqual(eaw(u'\uFF1F'), 'F')
        self.assertEqual(eaw(u'\u2010'), 'A')
        self.assertEqual(eaw(u'\U00020000'), 'W')
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for individual unicodedata and unicode-method bugs.

    The unicodedata module is reached through ``self.db``, which the base
    class installs in setUp().
    """

    def test_failed_import_during_compiling(self):
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        # The backslash is doubled so that the child is handed a literal
        # backslash-N sequence instead of relying on this module's own
        # literal passing an unrecognised escape through unchanged.
        code = ("import sys;"
                "sys.modules['unicodedata'] = None;"
                """eval("u'\\N{SOFT HYPHEN}'")""")
        args = [sys.executable, "-c", code]
        # We use a subprocess because the unicodedata module may already have
        # been loaded in this process.
        popen = subprocess.Popen(args, stderr=subprocess.PIPE)
        # communicate() reads stderr while waiting for the child to exit, so
        # a full pipe buffer cannot block the child the way wait() followed
        # by read() can; it also closes the pipe afterwards.
        stderr = popen.communicate()[1]
        self.assertEqual(popen.returncode, 1)
        error = ("SyntaxError: (unicode error) \\N escapes not supported "
                 "(can't load unicodedata module)")
        self.assertIn(error, stderr)

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')

    def test_ucd_510(self):
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored(u"\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored(u"\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual(u"a".upper(), u'A')
        self.assertEqual(u"\u1d79".upper(), u'\ua77d')
        self.assertEqual(u".".upper(), u".")

    def test_bug_5828(self):
        self.assertEqual(u"\u1d79".lower(), u"\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual(u"\u01c4".title(), u"\u01c5")
        self.assertEqual(u"\u01c5".title(), u"\u01c5")
        self.assertEqual(u"\u01c6".title(), u"\u01c5")

    def test_linebreak_7643(self):
        # A line-breaking character followed by u'A' splits into two lines;
        # every other character must leave the string as a single line.
        for i in range(0x10000):
            lines = (unichr(i) + u'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
def test_main():
    """Run this module's test case classes via test.test_support.run_unittest."""
    cases = (UnicodeMiscTest, UnicodeMethodsTest, UnicodeFunctionsTest)
    test.test_support.run_unittest(*cases)


if __name__ == "__main__":
    test_main()