Lib/test/test_normalization.py

   1 from test.support import run_unittest, open_urlresource
   2 import unittest
   3
   4 import sys
   5 import os
   6 from unicodedata import normalize, unidata_version
   7
   8 TESTDATAFILE = "NormalizationTest.txt"
   9 TESTDATAURL = "http://www.unicode.org/Public/" + unidata_version + "/ucd/" + TESTDATAFILE
  10
  11 if os.path.exists(TESTDATAFILE):
  12     f = open(TESTDATAFILE, encoding='utf-8')
  13     l = f.readline()
  14     f.close()
  15     if not unidata_version in l:
  16         os.unlink(TESTDATAFILE)
  17
  18 class RangeError(Exception):
  19     pass
  20
  21 def NFC(str):
  22     return normalize("NFC", str)
  23
  24 def NFKC(str):
  25     return normalize("NFKC", str)
  26
  27 def NFD(str):
  28     return normalize("NFD", str)
  29
  30 def NFKD(str):
  31     return normalize("NFKD", str)
  32
  33 def unistr(data):
  34     data = [int(x, 16) for x in data.split(" ")]
  35     for x in data:
  36         if x > sys.maxunicode:
  37             raise RangeError
  38     return "".join([chr(x) for x in data])
  39
  40 class NormalizationTest(unittest.TestCase):
  41     def test_main(self):
  42         part1_data = {}
  43         for line in open_urlresource(TESTDATAURL, encoding="utf-8"):
  44             if '#' in line:
  45                 line = line.split('#')[0]
  46             line = line.strip()
  47             if not line:
  48                 continue
  49             if line.startswith("@Part"):
  50                 part = line.split()[0]
  51                 continue
  52             if part == "@Part3":
  53                 # XXX we don't support PRI #29 yet, so skip these tests for now
  54                 continue
  55             try:
  56                 c1,c2,c3,c4,c5 = [unistr(x) for x in line.split(';')[:-1]]
  57             except RangeError:
  58                 # Skip unsupported characters;
  59                 # try atleast adding c1 if we are in part1
  60                 if part == "@Part1":
  61                     try:
  62                         c1 = unistr(line.split(';')[0])
  63                     except RangeError:
  64                         pass
  65                     else:
  66                         part1_data[c1] = 1
  67                 continue
  68
  69             # Perform tests
  70             self.failUnless(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
  71             self.failUnless(c4 ==  NFC(c4) ==  NFC(c5), line)
  72             self.failUnless(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
  73             self.failUnless(c5 ==  NFD(c4) ==  NFD(c5), line)
  74             self.failUnless(c4 == NFKC(c1) == NFKC(c2) == \
  75                             NFKC(c3) == NFKC(c4) == NFKC(c5),
  76                             line)
  77             self.failUnless(c5 == NFKD(c1) == NFKD(c2) == \
  78                             NFKD(c3) == NFKD(c4) == NFKD(c5),
  79                             line)
  80
  81             # Record part 1 data
  82             if part == "@Part1":
  83                 part1_data[c1] = 1
  84
  85         # Perform tests for all other data
  86         for c in range(sys.maxunicode+1):
  87             X = chr(c)
  88             if X in part1_data:
  89                 continue
  90             self.failUnless(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
  91
  92     def test_bug_834676(self):
  93         # Check for bug 834676
  94         normalize('NFC', '\ud55c\uae00')
  95
  96
  97 def test_main():
  98     # Hit the exception early
  99     open_urlresource(TESTDATAURL)
 100     run_unittest(NormalizationTest)
 101
# Run the tests when this file is executed directly as a script.
if __name__ == "__main__":
    test_main()