2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2016 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from UnicodeData.txt and EastAsianWidth.txt files.

Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt

It will output UTF-8 file
'''
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.
#
# A Hangul syllable code point is decomposed as
#   index = code_point - 0xAC00
#   initial = index // (21 * 28); medial = (index // 28) % 21; final = index % 28
# so the tables below must have exactly 19, 21 and 28 entries respectively.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end: hexadecimal code point strings (e.g. '3400', '4DB5');
    outfile:    open file object for the generated UTF-8 charmap;
    name:       the character name shared by the whole range.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1):
            # Decompose the syllable into (initial, medial, final) jamo
            # indices, see the Unicode book sections 3.11 and 4.4.
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        # Hangul Syllables are fully expanded; do not also emit
        # 64-code-point range lines for them.
        return
    # UnicodeData.txt file has contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # ...
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            # Last chunk may be shorter than 64 code points:
            # end it at the range's true end, then stop.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    of UnicodeData.txt and write lines to outfile as used in the

    CHARMAP
    ...
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    # Holds the fields of the pending "..., First>" line while we wait
    # for the matching "..., Last>" line.
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # ”Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: an integer Unicode code point.
    Returns the charmap-style byte sequence string.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    # (only the surrogate boundary code points actually appear
    # as single entries in UnicodeData.txt):
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode 7.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed covered by Cf
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt

    Writes the WIDTH section entries (code points whose width differs
    from the default width 1) to outfile, sorted by code point.
    '''
    width_dict = {}
    # Non-spacing marks (NSM) and format control characters (Cf)
    # get width 0:
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] == "Cf":
            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                int(fields[0], 16)) + '\t0'

    # If an entry in EastAsianWidth.txt is found, it overrides entries in
    # UnicodeData.txt:
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            # Single code point, e.g. "3000;F":
            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                int(fields[0], 16)) + '\t2'
        else:
            # A code point range, e.g. "3400..4DB5;W": drop any
            # single-point entries it covers, then emit one
            # <Ufrom>...<Uto> range line with width 2.
            code_points = fields[0].split("..")
            for key in range(int(code_points[0], 16),
                             int(code_points[1], 16)+1):
                if key in width_dict:
                    del width_dict[key]
            width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
                unicode_utils.ucs_symbol(int(code_points[0], 16)),
                unicode_utils.ucs_symbol(int(code_points[1], 16)))

    for key in sorted(width_dict):
        outfile.write(width_dict[key]+'\n')
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # If characters from EastAasianWidth.txt which are from
                # from reserved ranges (i.e. not yet assigned code points)
                # are added to the WIDTH section of the UTF-8 file, then
                # “make check” produces “Unknown Character” errors for
                # these code points because such unassigned code points
                # are not in the CHARMAP section of the UTF-8 file.
                #
                # Therefore, we skip all reserved code points when reading
                # the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                    continue
                # Keep only Wide (W) and Fullwidth (F) entries:
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
            write_header_width(OUTFILE)
            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
            OUTFILE.write("END WIDTH\n")