# -*- coding: utf-8 -*-
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# Copyright The GNU Toolchain Authors.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file.

Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt

It will output a file named UTF-8 in the current directory.
'''

import argparse
import re

import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
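
# Note (added for clarity): the tables above have 19 initial, 21 medial and
# 28 final entries; the constants 28 and 21 in the divmod() calls in
# process_range() below depend on exactly these table sizes.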

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #  ranges, so they become printable and carry a width. Comment out
        #  surrogate ranges. Add a WIDTH table.
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16) + 1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
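            # Worked example (added for illustration): for i == 0xAC01,
            # divmod(0xAC01 - 0xAC00, 28) gives (0, 1) and divmod(0, 21)
            # gives (0, 0), so the name is 'HANGUL SYLLABLE ' + 'G' + 'A'
            # + 'G' == 'HANGUL SYLLABLE GAG'.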
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # ...
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16) - 64):
            # Write the final, possibly shorter, chunk up to “end”.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i + 63),
            convert_to_hex(i),
            name))
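    # Illustrative trace (added): with start='3400' and end='4DB5' the loop
    # above writes <U3400>..<U343F>, <U3440>..<U347F>, and so on; once i
    # reaches 0x4D80 (> 0x4DB5 - 64) it writes the final partial range
    # <U4D80>..<U4DB5> and breaks.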

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    ...
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7] + '>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # Getting UTF-8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table:
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
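
# Examples (added for illustration):
#   convert_to_hex(0x0041) == '/x41'
#   convert_to_hex(0x20AC) == '/xe2/x82/xac'
#   convert_to_hex(0xD800) == '/xed/xa0/x80'  (taken from the surrogate table)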

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
    outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
    outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
    outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
    outfile.write('% This character stands in for an intentionally omitted leading consonant\n')
    outfile.write('% in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
    outfile.write('% of visible display to ensure that the complete block has the correct width.\n')
    outfile.write('% (See below for more information on Hangul syllables.)\n')
    outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
    outfile.write('% One composed Hangul "syllable block" like 퓛 is made up of\n')
    outfile.write('% two to three individual component characters called "jamo".\n')
    outfile.write('% The complete block must have total width 2;\n')
    outfile.write('% to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
    outfile.write('% and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
    outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
    outfile.write('% "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
    outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
    outfile.write('% - Default width for all other characters is 1.\n')
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, dlines, elines, klines):
    '''ulines are lines from UnicodeData.txt.
    elines are lines from EastAsianWidth.txt containing characters with width
    “W” or “F”.
    dlines are lines from DerivedCoreProperties.txt which contain
    characters with the property “Default_Ignorable_Code_Point”.
    klines are lines from HangulSyllableType.txt which contain characters
    with syllable type “V” or “T”.
    '''
    # Wide and fullwidth characters have width 2
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = 2
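    # Illustrative example (added): a filtered EastAsianWidth.txt line such
    # as '1100..115F;W' sets width_dict[key] = 2 for every key in
    # range(0x1100, 0x115F + 1).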

    # Nonspacing and enclosing marks have width 0
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    # Conjoining vowel (jungseong) and trailing (jongseong) jamo have width 0
    for line in klines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = 0

    # “Default_Ignorable_Code_Point”s have width 0
    for line in dlines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16) + 1):
            width_dict[key] = 0

    # Special case: U+00AD SOFT HYPHEN; removing it from width_dict
    # gives it the default width of 1.
    del width_dict[0x00AD]

    # Special case: U+115F HANGUL CHOSEONG FILLER has width 2
    width_dict[0x115F] = 2

    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
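    # Illustrative example (added): if width_dict were
    # {0x300: 0, 0x301: 0, 0x303: 0}, the grouping above would yield
    # [[0x300, 0x301], [0x303]], i.e. runs of consecutive code points
    # that share one width.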

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
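    # Illustrative output (added): a run of equal-width code points prints
    # as '<U0300>...<U036F>' + tab + '0', a single code point as
    # '<UXXXX>' + tab + its width.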

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-k', '--hangul_syllable_type_file',
        nargs='?',
        type=str,
        default='HangulSyllableType.txt',
        help=('The HangulSyllableType.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE:
        DERIVED_CORE_PROPERTIES_LINES = []
        for LINE in DERIVED_CORE_PROPERTIES_FILE:
            # If characters which are from reserved ranges
            # (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points.
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
                DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE:
        HANGUL_SYLLABLE_TYPE_LINES = []
        for LINE in HANGUL_SYLLABLE_TYPE_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[VT]', LINE):
                HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # to the UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process the width data and write the WIDTH section
        # to the UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      DERIVED_CORE_PROPERTIES_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      HANGUL_SYLLABLE_TYPE_LINES)
        OUTFILE.write("END WIDTH\n")