malloc/Makefile: Split and sort tests
[glibc.git] / localedata / unicode-gen / gen_unicode_ctype.py
blob75e4e1334a87b74e904d39b30c9c2e4ac41bb776
1 #!/usr/bin/python3
3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
7 # The GNU C Library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # The GNU C Library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with the GNU C Library; if not, see
19 # <https://www.gnu.org/licenses/>.
21 '''
22 Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
23 DerivedCoreProperties.txt files.
25 To see how this script is used, call it with the “-h” option:
27 $ ./gen_unicode_ctype.py -h
28 … prints usage message …
29 '''
31 import argparse
32 import time
33 import re
34 import unicode_utils
def code_point_ranges(is_class_function, code_points=None):
    '''Return a list of ranges of code points for which is_class_function
    returns True.

    is_class_function: predicate taking a code point (int) and returning
        True when that code point belongs to the character class.
    code_points: optional iterable of code points to test.  When None
        (the default), all code points in unicode_utils.UNICODE_ATTRIBUTES
        are used, matching the historical behaviour.

    Each returned range is a list with one element [cp] for an isolated
    code point, or two elements [first, last] for an inclusive run of
    consecutive code points.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        code_points = unicode_utils.UNICODE_ATTRIBUTES
    cp_ranges = []
    for code_point in sorted(code_points):
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                # Consecutive with the previous matching code point:
                # grow a 1-element range into [first, last], or move
                # the end of an existing [first, last] range.
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                # Gap (or first match): start a new range.
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Write one LC_CTYPE character class section to i18n_file.

    Emits the class name followed by all code point ranges for which
    is_class_function returns True, wrapped with trailing '/'
    continuation markers.  Nothing is written when the class is empty.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    ranges = code_point_ranges(is_class_function)
    if not ranges:
        return
    i18n_file.write('%s /\n' %class_name)
    max_column = 75
    prefix = ' '
    line = prefix
    for cp_range in ranges:
        # Separate entries with ';' (but not at the start of a line).
        if line.strip():
            line += ';'
        if len(cp_range) > 1:
            symbol = unicode_utils.ucs_symbol_range(
                cp_range[0], cp_range[-1])
        else:
            symbol = unicode_utils.ucs_symbol(cp_range[0])
        # Wrap before the entry that would overflow the column limit;
        # the separator stays on the previous line, before the '/'.
        if len(line) + len(symbol) > max_column:
            i18n_file.write(line+'/\n')
            line = prefix
        line += symbol
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Write one LC_CTYPE character map section to i18n_file.

    Emits the map name followed by one (source,target) pair for every
    code point that map_function maps to a different code point,
    wrapped with trailing '/' continuation markers.

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = ' '
    line = prefix
    # The section header is written unconditionally, even for an
    # identity map (matches the historical output).
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if mapped == code_point:
            continue
        # Separate entries with ';' (but not at the start of a line).
        if line.strip():
            line += ';'
        pair = ('('
                + unicode_utils.ucs_symbol(code_point)
                + ','
                + unicode_utils.ucs_symbol(mapped)
                + ')')
        # Wrap before the pair that would overflow the column limit.
        if len(line) + len(pair) > max_column:
            i18n_file.write(line+'/\n')
            line = prefix
        line += pair
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Read the original glibc i18n file; return a (head, tail) pair.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    head: everything up to and including the “LC_CTYPE” line, with the
        LC_IDENTIFICATION date stamp replaced by today's date.
    tail: everything from the “translit_start” line to end of file,
        or '' if the file has no translit section.
    '''
    head = tail = ''
    # The i18n file may contain non-ASCII text; read it as UTF-8
    # explicitly instead of relying on the ambient locale encoding.
    with open(filename, mode='r', encoding='utf-8') as i18n_file:
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                # Refresh the LC_IDENTIFICATION date stamp.
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old LC_CTYPE contents up to the translit section.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        # Keep everything from translit_start onwards verbatim.
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    When an original i18n file was given on the command line (module
    global ARGS) and its head was read, that head is copied verbatim;
    otherwise a minimal LC_IDENTIFICATION section is generated.
    '''
    if ARGS.input_file and head:
        # Reuse the head of the original file (its date stamp was
        # already refreshed by read_input_file).
        i18n_file.write(head)
        return
    header_lines = (
        'escape_char /',
        'comment_char %',
        '',
        '% Generated automatically by gen_unicode_ctype.py '
        + 'for Unicode {:s}.'.format(unicode_version),
        '',
        'LC_IDENTIFICATION',
        'title "Unicode {:s} FDCC-set"'.format(unicode_version),
        'source "UnicodeData.txt, DerivedCoreProperties.txt"',
        'address ""',
        'contact ""',
        'email "bug-glibc-locales@gnu.org"',
        'tel ""',
        'fax ""',
        'language ""',
        'territory "Earth"',
        'revision "{:s}"'.format(unicode_version),
        'date "{:s}"'.format(time.strftime('%Y-%m-%d')),
        'category "i18n:2012";LC_CTYPE',
        'END LC_IDENTIFICATION',
        '',
        'LC_CTYPE',
    )
    for header_line in header_lines:
        i18n_file.write(header_line + '\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    When an original i18n file was given on the command line (module
    global ARGS) and its tail was read, that tail is copied verbatim;
    otherwise a plain section terminator is written.
    '''
    use_original_tail = bool(ARGS.input_file) and bool(tail)
    i18n_file.write(tail if use_original_tail else 'END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file.

    i18n_file: writable text file receiving the generated sections.
    unicode_version: Unicode version string quoted in the comments.
    turkish: when True, use the Turkish case conversions from
        unicode_utils for toupper/tolower instead of the defaults.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # "outdigit" is deliberately not generated: localedef defaults it
    # to "0".."9", and emitting it here would prevent locales that copy
    # this file from defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    # Case maps: Turkish locales map i<->İ and ı<->I differently from
    # the default Unicode case conversions.
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
262 if __name__ == "__main__":
263 PARSER = argparse.ArgumentParser(
264 description='''
265 Generate a Unicode conforming LC_CTYPE category from
266 UnicodeData.txt and DerivedCoreProperties.txt files.
267 ''')
268 PARSER.add_argument(
269 '-u', '--unicode_data_file',
270 nargs='?',
271 type=str,
272 default='UnicodeData.txt',
273 help=('The UnicodeData.txt file to read, '
274 + 'default: %(default)s'))
275 PARSER.add_argument(
276 '-d', '--derived_core_properties_file',
277 nargs='?',
278 type=str,
279 default='DerivedCoreProperties.txt',
280 help=('The DerivedCoreProperties.txt file to read, '
281 + 'default: %(default)s'))
282 PARSER.add_argument(
283 '-i', '--input_file',
284 nargs='?',
285 type=str,
286 help='''The original glibc/localedata/locales/i18n file.''')
287 PARSER.add_argument(
288 '-o', '--output_file',
289 nargs='?',
290 type=str,
291 default='i18n.new',
292 help='''The file which shall contain the generated LC_CTYPE category,
293 default: %(default)s. If the original
294 glibc/localedata/locales/i18n has been given
295 as an option, all data from the original file
296 except the newly generated LC_CTYPE character
297 classes and the date stamp in
298 LC_IDENTIFICATION will be copied unchanged
299 into the output file. ''')
300 PARSER.add_argument(
301 '--unicode_version',
302 nargs='?',
303 required=True,
304 type=str,
305 help='The Unicode version of the input files used.')
306 PARSER.add_argument(
307 '--turkish',
308 action='store_true',
309 help='Use Turkish case conversions.')
310 ARGS = PARSER.parse_args()
312 unicode_utils.fill_attributes(
313 ARGS.unicode_data_file)
314 unicode_utils.fill_derived_core_properties(
315 ARGS.derived_core_properties_file)
316 unicode_utils.verifications()
317 HEAD = TAIL = ''
318 if ARGS.input_file:
319 (HEAD, TAIL) = read_input_file(ARGS.input_file)
320 with open(ARGS.output_file, mode='w') as I18N_FILE:
321 output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
322 output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
323 output_tail(I18N_FILE, tail=TAIL)