#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt

It will output a UTF-8 file.
'''

import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
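
# A Hangul syllable code point decomposes as (this is just the inverse
# of the divmod arithmetic in process_range below):
#
#     code_point = 0xAC00 + (index1 * 21 + index2) * 28 + index3
#
# where the indices select entries from the three tables above.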

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #     * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #     so they become printable and carry a width. Comment out surrogate
        #     ranges. Add a WIDTH table.
        #
        # So we expand the Hangul Syllables here:
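        # For example, i = 0xAC00 gives divmod(0, 28) == (0, 0) and then
        # divmod(0, 21) == (0, 0), selecting 'G', 'A' and '', i.e. the
        # name “HANGUL SYLLABLE GA”.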
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
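    # Note how the loop below cuts the last chunk short at the range
    # end: for end = 0x4DB5, the chunk starting at i = 0x4D80 satisfies
    # i > 0x4DB5 - 64, so it is written as <U4D80>..<U4DB5> and the
    # loop stops.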
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    the UnicodeData.txt file and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10         DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>    /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>    /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
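        # For example, the U+0010 sample line in the docstring above
        # carries “DATA LINK ESCAPE” in that field, so that string
        # becomes the character name in the CHARMAP output.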
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting UTF-8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table:
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
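
# A usage sketch (the values follow from the code above; the
# non-surrogate branch simply spells out Python's own UTF-8 encoding):
#
#     convert_to_hex(0x20AC)  ->  '/xe2/x82/xac'  (EURO SIGN)
#     convert_to_hex(0xD800)  ->  '/xed/xa0/x80'  (from the surrogate table)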

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode 10.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
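    # Each elines entry keeps its original EastAsianWidth.txt form,
    # e.g. “3400..4DB5;W ...” or “231A;W ...”; only the field before
    # the first “;” is used here, and a single code point is treated
    # as a one-element range.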
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # Handle special cases for compatibility:
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
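    # For example, with 0x1100..0x115F at width 2 (they are “W” in
    # EastAsianWidth.txt) and 0x1160..0x11FF forced to width 0 above,
    # the first run ends at 0x115F: the code points are consecutive,
    # but the width changes.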

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    if len(sys.argv) < 4:
        # Three input file arguments are required (sys.argv[1..3]).
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
    else:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # If characters from EastAsianWidth.txt which are from
                # reserved ranges (i.e. not yet assigned code points)
                # are added to the WIDTH section of the UTF-8 file, then
                # “make check” produces “Unknown Character” errors for
                # these code points because such unassigned code points
                # are not in the CHARMAP section of the UTF-8 file.
                #
                # Therefore, we skip all reserved code points when reading
                # the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                    continue
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
            PROP_LIST_LINES = []
            for LINE in PROP_LIST_FILE:
                if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                    PROP_LIST_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Process UnicodeData.txt and write the CHARMAP section
            # to the UTF-8 file:
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # Process EastAsianWidth.txt and write the WIDTH section
            # to the UTF-8 file:
            write_header_width(OUTFILE)
            process_width(OUTFILE,
                          UNICODE_DATA_LINES,
                          EAST_ASIAN_WIDTH_LINES,
                          PROP_LIST_LINES)
            OUTFILE.write("END WIDTH\n")
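
# A typical invocation, assuming the three Unicode data files have been
# downloaded into the current directory (the script writes its output
# to a file named “UTF-8” there):
#
#     python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt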