Merge branch 'source-get-id-docs' into 'master'
[glib.git] / tests / gen-casemap-txt.py
blob98f6bc9698b0694f3b75b73f071730a187bcf3b4
1 #!/usr/bin/env python3
2 # Copyright (C) 1998, 1999 Tom Tromey
3 # Copyright (C) 2001 Red Hat Software
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2, or (at your option)
8 # any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, see <http://www.gnu.org/licenses/>.
18 """
19 gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
20 See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
21 Usage: gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt
22 I consider the output of this program to be unrestricted.
23 Use it as you will.
24 """
26 import sys
27 import argparse
def main(argv):
    """Read the Unicode data files and print case-mapping test cases.

    Simple mappings are collected from UnicodeData.txt, then overridden by
    the unconditional entries of SpecialCasing.txt, and the resulting
    tables are handed to print_tests().

    Args:
        argv: Full argument vector with the program name at index 0,
            followed by UNICODE-VERSION, the path to UnicodeData.txt and
            the path to SpecialCasing.txt.

    Raises:
        SystemExit: If a line of either data file has an unexpected number
            of fields.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data")
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The positional names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    # Names of fields in Unicode data table.
    CODE, NAME, CATEGORY, COMBINING_CLASSES, BIDI_CATEGORY, DECOMPOSITION, \
        DECIMAL_VALUE, DIGIT_VALUE, NUMERIC_VALUE, MIRRORED, OLD_NAME, \
        COMMENT, UPPER, LOWER, TITLE = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Code point (int) -> mapped string, one table per case.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        """
        return "".join([chr(int(c, 16)) for c in codes.split()])

    def process_one(code, fields):
        """Record the mappings of one code point; only the general
        categories Ll, Lu and Lt contribute entries.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            # NOTE(review): the title mapping is taken from the LOWER field
            # here, unlike the other branches - confirm this is intended.
            title[code] = make_hex(fields[LOWER])

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            fields = [f.strip() for f in line.strip().split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CODE], len(fields)))

            code = int(fields[CODE], 16)

            if code > last_code + 1:
                # Found a gap
                if fields[NAME].endswith("Last>"):
                    # Fill the gap with the last character read,
                    # since this was a range specified in the char database.
                    # Work on a copy so the parsed record stays untouched.
                    gfields = list(fields)
                else:
                    # The gap represents undefined characters. Only the type
                    # matters.
                    gfields = ['', '', 'Cn', '0', '', '', '', '', '', '', '',
                               '', '', '', '']

                for gap_code in range(last_code + 1, code):
                    gfields[CODE] = "%04x" % gap_code
                    process_one(gap_code, gfields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)" % (
                        fields[CASE_CODE], len(fields)))

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # Unconditional special casings replace the simple mappings.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
def print_tests(version, upper, title, lower):
    """Print the case-mapping test file to standard output.

    A fixed header with hand-written locale-specific tests is printed
    first, followed by one tab-separated row per code point that has at
    least one non-empty mapping: the character, its lower, title and upper
    mapping, and the code point in hex as a trailing comment.

    Args:
        version: Unicode version string inserted into the header.
        upper: Dict mapping code point (int) to its uppercase string.
        title: Dict mapping code point (int) to its titlecase string.
        lower: Dict mapping code point (int) to its lowercase string.
    """
    # NOTE(review): the three sigma rows below are space-separated in this
    # literal while every other row uses tab escapes - confirm the
    # separators against the generated test file.
    print("""\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
# Some special hand crafted tests
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
# Now the automatic tests
#""".format(version))

    # Only code points present in at least one table can yield a row, so
    # walk those keys in ascending order instead of probing every code
    # point. The valid range is 0..0x10FFFF inclusive.
    candidates = sorted(set(upper) | set(lower) | set(title))
    for code in candidates:
        if not 0 <= code <= 0x10FFFF:
            continue
        if code == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(code, "")
        lo = lower.get(code, "")
        ti = title.get(code, "")

        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(code), lo, ti, up, code))
if __name__ == "__main__":
    # Run as a script: pass the full argv to main() and use whatever it
    # returns as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)