Lib/test/test_unicodedata.py

   1 """ Test script for the unicodedata module.
   2
   3     Written by Marc-Andre Lemburg (mal@lemburg.com).
   4
   5     (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   6
   7 """
   8
   9 import sys
  10 import unittest
  11 import hashlib
  12 import subprocess
  13 import test.test_support
  14
   # Codec used to encode each code point's joined unicode results into bytes
   # before they are fed to the SHA-1 checksum in UnicodeMethodsTest.
   encoding = 'utf-8'
  16
  17
  18 ### Run tests
  19
  20 class UnicodeMethodsTest(unittest.TestCase):
  21
  22     # update this, if the database changes
  23     expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
  24
  25     def test_method_checksum(self):
  26         h = hashlib.sha1()
  27         for i in range(0x10000):
  28             char = unichr(i)
  29             data = [
  30                 # Predicates (single char)
  31                 u"01"[char.isalnum()],
  32                 u"01"[char.isalpha()],
  33                 u"01"[char.isdecimal()],
  34                 u"01"[char.isdigit()],
  35                 u"01"[char.islower()],
  36                 u"01"[char.isnumeric()],
  37                 u"01"[char.isspace()],
  38                 u"01"[char.istitle()],
  39                 u"01"[char.isupper()],
  40
  41                 # Predicates (multiple chars)
  42                 u"01"[(char + u'abc').isalnum()],
  43                 u"01"[(char + u'abc').isalpha()],
  44                 u"01"[(char + u'123').isdecimal()],
  45                 u"01"[(char + u'123').isdigit()],
  46                 u"01"[(char + u'abc').islower()],
  47                 u"01"[(char + u'123').isnumeric()],
  48                 u"01"[(char + u' \t').isspace()],
  49                 u"01"[(char + u'abc').istitle()],
  50                 u"01"[(char + u'ABC').isupper()],
  51
  52                 # Mappings (single char)
  53                 char.lower(),
  54                 char.upper(),
  55                 char.title(),
  56
  57                 # Mappings (multiple chars)
  58                 (char + u'abc').lower(),
  59                 (char + u'ABC').upper(),
  60                 (char + u'abc').title(),
  61                 (char + u'ABC').title(),
  62
  63                 ]
  64             h.update(u''.join(data).encode(encoding))
  65         result = h.hexdigest()
  66         self.assertEqual(result, self.expectedchecksum)
  67
  68 class UnicodeDatabaseTest(unittest.TestCase):
  69
  70     def setUp(self):
  71         # In case unicodedata is not available, this will raise an ImportError,
  72         # but the other test cases will still be run
  73         import unicodedata
  74         self.db = unicodedata
  75
  76     def tearDown(self):
  77         del self.db
  78
  79 class UnicodeFunctionsTest(UnicodeDatabaseTest):
  80
  81     # update this, if the database changes
  82     expectedchecksum = '6ccf1b1a36460d2694f9b0b0f0324942fe70ede6'
  83
  84     def test_function_checksum(self):
  85         data = []
  86         h = hashlib.sha1()
  87
  88         for i in range(0x10000):
  89             char = unichr(i)
  90             data = [
  91                 # Properties
  92                 str(self.db.digit(char, -1)),
  93                 str(self.db.numeric(char, -1)),
  94                 str(self.db.decimal(char, -1)),
  95                 self.db.category(char),
  96                 self.db.bidirectional(char),
  97                 self.db.decomposition(char),
  98                 str(self.db.mirrored(char)),
  99                 str(self.db.combining(char)),
 100             ]
 101             h.update(''.join(data))
 102         result = h.hexdigest()
 103         self.assertEqual(result, self.expectedchecksum)
 104
 105     def test_digit(self):
 106         self.assertEqual(self.db.digit(u'A', None), None)
 107         self.assertEqual(self.db.digit(u'9'), 9)
 108         self.assertEqual(self.db.digit(u'\u215b', None), None)
 109         self.assertEqual(self.db.digit(u'\u2468'), 9)
 110         self.assertEqual(self.db.digit(u'\U00020000', None), None)
 111
 112         self.assertRaises(TypeError, self.db.digit)
 113         self.assertRaises(TypeError, self.db.digit, u'xx')
 114         self.assertRaises(ValueError, self.db.digit, u'x')
 115
 116     def test_numeric(self):
 117         self.assertEqual(self.db.numeric(u'A',None), None)
 118         self.assertEqual(self.db.numeric(u'9'), 9)
 119         self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
 120         self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
 121         self.assertEqual(self.db.numeric(u'\ua627'), 7.0)
 122         self.assertEqual(self.db.numeric(u'\U00020000', None), None)
 123
 124         self.assertRaises(TypeError, self.db.numeric)
 125         self.assertRaises(TypeError, self.db.numeric, u'xx')
 126         self.assertRaises(ValueError, self.db.numeric, u'x')
 127
 128     def test_decimal(self):
 129         self.assertEqual(self.db.decimal(u'A',None), None)
 130         self.assertEqual(self.db.decimal(u'9'), 9)
 131         self.assertEqual(self.db.decimal(u'\u215b', None), None)
 132         self.assertEqual(self.db.decimal(u'\u2468', None), None)
 133         self.assertEqual(self.db.decimal(u'\U00020000', None), None)
 134
 135         self.assertRaises(TypeError, self.db.decimal)
 136         self.assertRaises(TypeError, self.db.decimal, u'xx')
 137         self.assertRaises(ValueError, self.db.decimal, u'x')
 138
 139     def test_category(self):
 140         self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
 141         self.assertEqual(self.db.category(u'a'), 'Ll')
 142         self.assertEqual(self.db.category(u'A'), 'Lu')
 143         self.assertEqual(self.db.category(u'\U00020000'), 'Lo')
 144
 145         self.assertRaises(TypeError, self.db.category)
 146         self.assertRaises(TypeError, self.db.category, u'xx')
 147
 148     def test_bidirectional(self):
 149         self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
 150         self.assertEqual(self.db.bidirectional(u' '), 'WS')
 151         self.assertEqual(self.db.bidirectional(u'A'), 'L')
 152         self.assertEqual(self.db.bidirectional(u'\U00020000'), 'L')
 153
 154         self.assertRaises(TypeError, self.db.bidirectional)
 155         self.assertRaises(TypeError, self.db.bidirectional, u'xx')
 156
 157     def test_decomposition(self):
 158         self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
 159         self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')
 160
 161         self.assertRaises(TypeError, self.db.decomposition)
 162         self.assertRaises(TypeError, self.db.decomposition, u'xx')
 163
 164     def test_mirrored(self):
 165         self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
 166         self.assertEqual(self.db.mirrored(u'a'), 0)
 167         self.assertEqual(self.db.mirrored(u'\u2201'), 1)
 168         self.assertEqual(self.db.mirrored(u'\U00020000'), 0)
 169
 170         self.assertRaises(TypeError, self.db.mirrored)
 171         self.assertRaises(TypeError, self.db.mirrored, u'xx')
 172
 173     def test_combining(self):
 174         self.assertEqual(self.db.combining(u'\uFFFE'), 0)
 175         self.assertEqual(self.db.combining(u'a'), 0)
 176         self.assertEqual(self.db.combining(u'\u20e1'), 230)
 177         self.assertEqual(self.db.combining(u'\U00020000'), 0)
 178
 179         self.assertRaises(TypeError, self.db.combining)
 180         self.assertRaises(TypeError, self.db.combining, u'xx')
 181
 182     def test_normalize(self):
 183         self.assertRaises(TypeError, self.db.normalize)
 184         self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
 185         self.assertEqual(self.db.normalize('NFKC', u''), u'')
 186         # The rest can be found in test_normalization.py
 187         # which requires an external file.
 188
 189     def test_pr29(self):
 190         # http://www.unicode.org/review/pr-29.html
 191         for text in (u"\u0b47\u0300\u0b3e", u"\u1100\u0300\u1161"):
 192             self.assertEqual(self.db.normalize('NFC', text), text)
 193
 194     def test_east_asian_width(self):
 195         eaw = self.db.east_asian_width
 196         self.assertRaises(TypeError, eaw, 'a')
 197         self.assertRaises(TypeError, eaw, u'')
 198         self.assertRaises(TypeError, eaw, u'ra')
 199         self.assertEqual(eaw(u'\x1e'), 'N')
 200         self.assertEqual(eaw(u'\x20'), 'Na')
 201         self.assertEqual(eaw(u'\uC894'), 'W')
 202         self.assertEqual(eaw(u'\uFF66'), 'H')
 203         self.assertEqual(eaw(u'\uFF1F'), 'F')
 204         self.assertEqual(eaw(u'\u2010'), 'A')
 205         self.assertEqual(eaw(u'\U00020000'), 'W')
 206
 207 class UnicodeMiscTest(UnicodeDatabaseTest):
 208
 209     def test_failed_import_during_compiling(self):
 210         # Issue 4367
 211         # Decoding \N escapes requires the unicodedata module. If it can't be
 212         # imported, we shouldn't segfault.
 213
 214         # This program should raise a SyntaxError in the eval.
 215         code = "import sys;" \
 216             "sys.modules['unicodedata'] = None;" \
 217             """eval("u'\N{SOFT HYPHEN}'")"""
 218         args = [sys.executable, "-c", code]
 219         # We use a subprocess because the unicodedata module may already have
 220         # been loaded in this process.
 221         popen = subprocess.Popen(args, stderr=subprocess.PIPE)
 222         popen.wait()
 223         self.assertEqual(popen.returncode, 1)
 224         error = "SyntaxError: (unicode error) \N escapes not supported " \
 225             "(can't load unicodedata module)"
 226         self.assertIn(error, popen.stderr.read())
 227
 228     def test_decimal_numeric_consistent(self):
 229         # Test that decimal and numeric are consistent,
 230         # i.e. if a character has a decimal value,
 231         # its numeric value should be the same.
 232         count = 0
 233         for i in xrange(0x10000):
 234             c = unichr(i)
 235             dec = self.db.decimal(c, -1)
 236             if dec != -1:
 237                 self.assertEqual(dec, self.db.numeric(c))
 238                 count += 1
 239         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
 240
 241     def test_digit_numeric_consistent(self):
 242         # Test that digit and numeric are consistent,
 243         # i.e. if a character has a digit value,
 244         # its numeric value should be the same.
 245         count = 0
 246         for i in xrange(0x10000):
 247             c = unichr(i)
 248             dec = self.db.digit(c, -1)
 249             if dec != -1:
 250                 self.assertEqual(dec, self.db.numeric(c))
 251                 count += 1
 252         self.assertTrue(count >= 10) # should have tested at least the ASCII digits
 253
 254     def test_bug_1704793(self):
 255         self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
 256
 257     def test_ucd_510(self):
 258         import unicodedata
 259         # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
 260         self.assertTrue(unicodedata.mirrored(u"\u0f3a"))
 261         self.assertTrue(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a"))
 262         # Also, we now have two ways of representing
 263         # the upper-case mapping: as delta, or as absolute value
 264         self.assertTrue(u"a".upper()==u'A')
 265         self.assertTrue(u"\u1d79".upper()==u'\ua77d')
 266         self.assertTrue(u".".upper()==u".")
 267
 268     def test_bug_5828(self):
 269         self.assertEqual(u"\u1d79".lower(), u"\u1d79")
 270         # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
 271         self.assertEqual(
 272             [
 273                 c for c in range(sys.maxunicode+1)
 274                 if u"\x00" in unichr(c).lower()+unichr(c).upper()+unichr(c).title()
 275             ],
 276             [0]
 277         )
 278
 279     def test_bug_4971(self):
 280         # LETTER DZ WITH CARON: DZ, Dz, dz
 281         self.assertEqual(u"\u01c4".title(), u"\u01c5")
 282         self.assertEqual(u"\u01c5".title(), u"\u01c5")
 283         self.assertEqual(u"\u01c6".title(), u"\u01c5")
 284
 285     def test_linebreak_7643(self):
 286         for i in range(0x10000):
 287             lines = (unichr(i) + u'A').splitlines()
 288             if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
 289                      0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
 290                 self.assertEqual(len(lines), 2,
 291                                  r"\u%.4x should be a linebreak" % i)
 292             else:
 293                 self.assertEqual(len(lines), 1,
 294                                  r"\u%.4x should not be a linebreak" % i)
 295
 296 def test_main():
 297     test.test_support.run_unittest(
 298         UnicodeMiscTest,
 299         UnicodeMethodsTest,
 300         UnicodeFunctionsTest
 301     )
 302
  # Run the whole suite when this file is executed directly as a script.
  if __name__ == "__main__":
      test_main()