tests/gen-casemap-txt.py

   1 #!/usr/bin/env python3
   2 # Copyright (C) 1998, 1999 Tom Tromey
   3 # Copyright (C) 2001 Red Hat Software
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2, or (at your option)
   8 # any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, see <http://www.gnu.org/licenses/>.
  17
  18 """
  19 gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
  20 See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

I consider the output of this program to be unrestricted.
Use it as you will.
  24 """
  25
  26 import sys
  27 import argparse
  28
  29
  30 def main(argv):
  31     parser = argparse.ArgumentParser(
  32         description="Generate test cases for case mapping from Unicode data")
  33     parser.add_argument("UNICODE-VERSION")
  34     parser.add_argument("UnicodeData.txt")
  35     parser.add_argument("SpecialCasing.txt")
  36     args = parser.parse_args(argv[1:])
  37     version = getattr(args, "UNICODE-VERSION")
  38     filename_udata = getattr(args, "UnicodeData.txt")
  39     filename_casing = getattr(args, "SpecialCasing.txt")
  40
  41     # Names of fields in Unicode data table.
  42     CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
  43         DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
  44         COMMENT, UPPER, LOWER, TITLE = range(15)
  45
  46     # Names of fields in the SpecialCasing table
  47     CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
  48
  49     upper = {}
  50     title = {}
  51     lower = {}
  52
  53     def make_hex(codes):
  54         """Converts a string of white space separated code points encoded as
  55         hex values to a Unicode string. Any extra white space is ignored.
  56         """
  57         return "".join([chr(int(c, 16)) for c in codes.split()])
  58
  59     def process_one(code, fields):
  60         type_ = fields[CATEGORY]
  61         if type_ == "Ll":
  62             upper[code] = make_hex(fields[UPPER])
  63             lower[code] = chr(code)
  64             title[code] = make_hex(fields[TITLE])
  65         elif type_ == "Lu":
  66             lower[code] = make_hex(fields[LOWER])
  67             upper[code] = chr(code)
  68             title[code] = make_hex(fields[TITLE])
  69         elif type_ == "Lt":
  70             upper[code] = make_hex(fields[UPPER])
  71             lower[code] = make_hex(fields[LOWER])
  72             title[code] = make_hex(fields[LOWER])
  73
  74     with open(filename_udata, encoding="utf-8") as fileobj:
  75         last_code = -1
  76         for line in fileobj:
  77             line = line.strip()
  78             fields = [f.strip() for f in line.split(";")]
  79             if len(fields) != 15:
  80                 raise SystemExit(
  81                     "Entry for %s has wrong number of fields (%d)" % (
  82                         fields[CODE], len(fields)))
  83
  84             code = int(fields[CODE], 16)
  85
  86             if code > last_code + 1:
  87                 # Found a gap
  88                 if fields[NAME].endswith("Last>"):
  89                     # Fill the gap with the last character read,
  90                     # since this was a range specified in the char database
  91                     gfields = fields
  92                 else:
  93                     # The gap represents undefined characters.  Only the type
  94                     # matters.
  95                     gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '',
  96                                '', '', '', '']
  97
  98                 last_code += 1
  99                 while last_code < code:
 100                     gfields[CODE] = "%04x" % last_code
 101                     process_one(last_code, gfields)
 102                     last_code += 1
 103
 104             process_one(code, fields)
 105             last_code = code
 106
 107     with open(filename_casing, encoding="utf-8") as fileobj:
 108         last_code = -1
 109         for line in fileobj:
 110             # strip comments and skip empty lines
 111             line = line.split("#", 1)[0].strip()
 112             if not line:
 113                 continue
 114
 115             # all lines end with ";" so just remove it
 116             line = line.rstrip(";").rstrip()
 117             fields = [f.strip() for f in line.split(";")]
 118             if len(fields) not in (4, 5):
 119                 raise SystemExit(
 120                     "Entry for %s has wrong number of fields (%d)" % (
 121                         fields[CASE_CODE], len(fields)))
 122
 123             if len(fields) == 5:
 124                 # Ignore conditional special cases - we'll handle them manually
 125                 continue
 126
 127             code = int(fields[CASE_CODE], 16)
 128
 129             upper[code] = make_hex(fields[CASE_UPPER])
 130             lower[code] = make_hex(fields[CASE_LOWER])
 131             title[code] = make_hex(fields[CASE_TITLE])
 132
 133     print_tests(version, upper, title, lower)
 134
 135
 136 def print_tests(version, upper, title, lower):
 137     print("""\
 138 # Test cases generated from Unicode {} data
 139 # by gen-casemap-txt.py. Do not edit.
 140 #
 141 # Some special hand crafted tests
 142 #
 143 tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 144 tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
 145 tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
 146 tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
 147 tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
 148 tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
 149 # Test reordering of YPOGEGRAMMENI across other accents
 150 \t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
 151 \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
 152 # Handling of final and nonfinal sigma
 153 \tΜΆΙΟΣ    μάιος      Μάιος      ΜΆΙΟΣ
 154 \tΜΆΙΟΣ    μάιος      Μάιος      ΜΆΙΟΣ
 155 \tΣΙΓΜΑ    σιγμα      Σιγμα      ΣΙΓΜΑ
 156 # Lithuanian rule of i followed by letter with dot. Not at all sure
 157 # about the titlecase part here
 158 lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
 159 lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
 160 lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
 161 lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
 162 lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
 163 lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
 164 lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
 165 lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
 166 lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 167 lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
 168 lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
 169 lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
 170 lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
 171 lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
 172 lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
 173 lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
 174 lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
 175 lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
 176 lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
 177 lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
 178 lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
 179 lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
 180 # Special case not at initial position
 181 \ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
 182 #
 183 # Now the automatic tests
 184 #""".format(version))
 185
 186     for i in range(0x10ffff):
 187         if i == 0x3A3:
 188             # Greek sigma needs special tests
 189             continue
 190
 191         up = upper.get(i, "")
 192         lo = lower.get(i, "")
 193         ti = title.get(i, "")
 194
 195         if any([up, lo, ti]):
 196             print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
 197
 198
 199 if __name__ == "__main__":
 200     sys.exit(main(sys.argv))