2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2018 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from UnicodeData.txt and EastAsianWidth.txt files.

Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt

It will output UTF-8 file
'''
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.  A Hangul syllable name is the concatenation of
# one short name from each table (19 * 21 * 28 = 11172 syllables).

# 19 leading consonants (choseong); the empty entry is the filler.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# 21 medial vowels (jungseong).
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# 28 trailing consonants (jongseong); the empty first entry means
# "no final consonant".
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end: hexadecimal strings for the first and the last code
                point of the range (as found in UnicodeData.txt)
    outfile:    open text file the CHARMAP lines are written to
    name:       the character name shared by the whole range
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #  ranges, so they become printable and carry a width. Comment out
        #  surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1):
            # Decompose the syllable offset from U+AC00 into the three
            # jamo indices: 28 finals per medial, 21 medials per initial.
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        # Hangul syllables are fully expanded above; do not also emit
        # the generic 64-code-point range lines below.
        return
    # UnicodeData.txt file has contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # ...
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            # Last chunk: may be shorter than 64 code points, so it
            # ends at the real end of the range.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    ...
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            # Remember the “First” record; the matching “Last” record
            # below completes the range.
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # Strip the “, Last>” suffix to get the plain range name.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: integer Unicode code point.
    Returns the glibc charmap byte-sequence string for it.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    # for the surrogate boundary code points that appear in
    # UnicodeData.txt range records.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    # Maximum of 6 bytes per character kept for historical compatibility
    # with the existing glibc UTF-8 charmap.
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode 10.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed covered by Cf
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines, plines):
    '''Writes the WIDTH section of the UTF-8 file.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.
    '''
    # Maps code point (int) -> width (0 or 2); code points absent from
    # the dict get the default width 1 and are not written out.
    width_dict = {}
    # East Asian wide (“W”) and fullwidth (“F”) characters get width 2.
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    # Non-spacing marks and format/enclosing/combining characters
    # (general categories Cf, Me, Mn) get width 0.
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        # Conjoining Hangul jamo vowels/finals: zero width.
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        # Yijing hexagram symbols: treated as wide.
        width_dict[key] = 2

    # Compress runs of consecutive code points with equal width into
    # single WIDTH entries.
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
if __name__ == "__main__":
    # Script-local imports: sys for argv handling, re for the line
    # filters below.  (The charmap/width helpers additionally rely on
    # glibc's unicode_utils helper being importable — TODO confirm the
    # module-level import survives in the full file.)
    import re
    import sys

    # Three input files are required (UnicodeData.txt, EastAsianWidth.txt,
    # PropList.txt) — sys.argv[3] is opened below, so require len >= 4.
    if len(sys.argv) < 4:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
        sys.exit(1)

    with open(sys.argv[1], mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            # Keep only wide (“W”) and fullwidth (“F”) characters.
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")