#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# Copyright The GNU Toolchain Authors.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py --unicode_version VERSION
           [-u UnicodeData.txt] [-d DerivedCoreProperties.txt]
           [-e EastAsianWidth.txt] [-k HangulSyllableType.txt]

It writes its output to a file named UTF-8 in the current directory.
'''
import argparse
import re

import unicode_utils
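
# unicode_utils.ucs_symbol() formats a code point as the symbol notation
# used in the charmap, e.g. '<U0010>' or '<U0010FFC0>' (see the sample
# output lines in process_charmap() below).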

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #   ranges, so they become printable and carry a width. Comment
        #   out surrogate ranges. Add a WIDTH table.
        #
        # So we expand the Hangul Syllables here:
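        # For example, U+AC01: 0xAC01 - 0xAC00 == 1, divmod(1, 28) == (0, 1)
        # and divmod(0, 21) == (0, 0), selecting initial 'G', medial 'A' and
        # final 'G': 'HANGUL SYLLABLE GAG'.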
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + JAMO_INITIAL_SHORT_NAME[index1] \
                + JAMO_MEDIAL_SHORT_NAME[index2] \
                + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
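        # Relevant fields (see UAX #44): fields[0] is the code point,
        # fields[1] the name, fields[2] the general category, fields[4]
        # the bidi class and fields[10] the Unicode 1.0 name.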
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting UTF-8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table.
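    # Only the six surrogate code points that occur in UnicodeData.txt,
    # i.e. the First/Last boundaries of the surrogate ranges, are needed.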
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
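
# Example: convert_to_hex(0x20AC) returns '/xe2/x82/xac' (EURO SIGN);
# convert_to_hex(0xD800) uses the surrogate table above, since
# chr(0xD800).encode('UTF-8') would raise UnicodeEncodeError.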

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
196 outfile.write("<mb_cur_max> 6\n\n")
197 outfile.write("% CHARMAP generated using utf8_gen.py\n")
198 outfile.write("% alias ISO-10646/UTF-8\n")
199 outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
    outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
    outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
    outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
    outfile.write('%   This character stands in for an intentionally omitted leading consonant\n')
    outfile.write('%   in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
    outfile.write('%   of visible display to ensure that the complete block has the correct width.\n')
    outfile.write('%   (See below for more information on Hangul syllables.)\n')
    outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
    outfile.write('%   "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
    outfile.write('%   One composed Hangul "syllable block" like 퓛 is made up of\n')
    outfile.write('%   two to three individual component characters called "jamo".\n')
    outfile.write('%   The complete block must have total width 2;\n')
    outfile.write('%   to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
    outfile.write('%   and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
    outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
    outfile.write('%   "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
    outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
    outfile.write('%   "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
    outfile.write('% - Default width for all other characters is 1.\n')
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, dlines, elines, klines):
    '''ulines are lines from UnicodeData.txt.
    dlines are lines from DerivedCoreProperties.txt which contain
    characters with the property “Default_Ignorable_Code_Point”.
    elines are lines from EastAsianWidth.txt containing characters with
    width “W” or “F”.
    klines are lines from HangulSyllableType.txt which contain characters
    with syllable type “V” or “T”.
    '''
    # Wide and fullwidth characters have width 2.
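    # Each elines entry gives a code point or range before the first ';',
    # e.g. '231A;W ...' or '3400..4DB5;W ...'.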
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    # Nonspacing and enclosing marks have width 0.
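    # fields[4] is the bidi class (NSM) and fields[2] the general
    # category (Mn or Me).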
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    # Conjoining vowel and trailing jamo have width 0.
    for line in klines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 0

    # “Default_Ignorable_Code_Point”s have width 0.
    for line in dlines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 0

    # Special case: U+00AD SOFT HYPHEN
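    # Removing the entry makes it fall back to the default width of 1.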
    del width_dict[0x00AD]

    # Special case: U+115F HANGUL CHOSEONG FILLER
    width_dict[0x115F] = 2

    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
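    # U+4DC0..U+4DFF: Yijing Hexagram Symbols, also assigned width 2 here.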
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2
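
    # Merge runs of consecutive code points that share the same width,
    # e.g. widths {0x1100: 2, 0x1101: 2, 0x1160: 0} give the runs
    # [0x1100, 0x1101] and [0x1160].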
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
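
    # Note: ranges in the WIDTH section are written with three dots
    # (<U...>...<U...>), unlike the two dots used in the CHARMAP section.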
    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt,
        DerivedCoreProperties.txt, EastAsianWidth.txt, and
        HangulSyllableType.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-k', '--hangul_syllable_type_file',
        nargs='?',
        type=str,
        default='HangulSyllableType.txt',
        help=('The HangulSyllableType.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.derived_core_properties_file, mode='r') \
            as DERIVED_CORE_PROPERTIES_FILE:
        DERIVED_CORE_PROPERTIES_LINES = []
        for LINE in DERIVED_CORE_PROPERTIES_FILE:
            # If code points from reserved ranges (i.e. code points
            # which are not yet assigned) are added to the WIDTH section
            # of the UTF-8 file, then “make check” produces “Unknown
            # Character” errors for them, because such unassigned code
            # points are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points.
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
                DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.hangul_syllable_type_file, mode='r') \
            as HANGUL_SYLLABLE_TYPE_FILE:
        HANGUL_SYLLABLE_TYPE_LINES = []
        for LINE in HANGUL_SYLLABLE_TYPE_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[VT]', LINE):
                HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # of the UTF-8 file.
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process the width data and write the WIDTH section
        # of the UTF-8 file.
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      DERIVED_CORE_PROPERTIES_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      HANGUL_SYLLABLE_TYPE_LINES)
        OUTFILE.write("END WIDTH\n")