Lib/test/test_normalization.py

   1 from test.test_support import run_unittest, open_urlresource
   2 import unittest
   3
   4 import sys
   5 import os
   6 from unicodedata import normalize, unidata_version
   7
   8 TESTDATAFILE = "NormalizationTest" + os.extsep + "txt"
   9 TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE
  10
  11 if os.path.exists(TESTDATAFILE):
  12     f = open(TESTDATAFILE)
  13     l = f.readline()
  14     f.close()
  15     if not unidata_version in l:
  16         os.unlink(TESTDATAFILE)
  17
  18 class RangeError(Exception):
  19     pass
  20
  21 def NFC(str):
  22     return normalize("NFC", str)
  23
  24 def NFKC(str):
  25     return normalize("NFKC", str)
  26
  27 def NFD(str):
  28     return normalize("NFD", str)
  29
  30 def NFKD(str):
  31     return normalize("NFKD", str)
  32
  33 def unistr(data):
  34     data = [int(x, 16) for x in data.split(" ")]
  35     for x in data:
  36         if x > sys.maxunicode:
  37             raise RangeError
  38     return u"".join([unichr(x) for x in data])
  39
  40 class NormalizationTest(unittest.TestCase):
  41     def test_main(self):
  42         part1_data = {}
  43         # Hit the exception early
  44         try:
  45             open_urlresource(TESTDATAURL)
  46         except IOError:
  47             self.skipTest("Could not retrieve " + TESTDATAURL)
  48         for line in open_urlresource(TESTDATAURL):
  49             if '#' in line:
  50                 line = line.split('#')[0]
  51             line = line.strip()
  52             if not line:
  53                 continue
  54             if line.startswith("@Part"):
  55                 part = line.split()[0]
  56                 continue
  57             if part == "@Part3":
  58                 # XXX we don't support PRI #29 yet, so skip these tests for now
  59                 continue
  60             try:
  61                 c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
  62             except RangeError:
  63                 # Skip unsupported characters;
  64                 # try atleast adding c1 if we are in part1
  65                 if part == "@Part1":
  66                     try:
  67                         c1 = unistr(line.split(';')[0])
  68                     except RangeError:
  69                         pass
  70                     else:
  71                         part1_data[c1] = 1
  72                 continue
  73
  74             # Perform tests
  75             self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
  76             self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
  77             self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
  78             self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
  79             self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
  80                             NFKC(c3) == NFKC(c4) == NFKC(c5),
  81                             line)
  82             self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
  83                             NFKD(c3) == NFKD(c4) == NFKD(c5),
  84                             line)
  85
  86             # Record part 1 data
  87             if part == "@Part1":
  88                 part1_data[c1] = 1
  89
  90         # Perform tests for all other data
  91         for c in range(sys.maxunicode+1):
  92             X = unichr(c)
  93             if X in part1_data:
  94                 continue
  95             self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
  96
  97     def test_bug_834676(self):
  98         # Check for bug 834676
  99         normalize('NFC', u'\ud55c\uae00')
 100
 101
 102 def test_main():
 103     run_unittest(NormalizationTest)
 104
 105 if __name__ == "__main__":
 106     test_main()