Correcting language code for Bhili and Tulu locales (bug 17475)
[glibc.git] / localedata / unicode-gen / utf8_gen.py
blobf1b88f5b29410e5d85d0ef7015ed222be1b89b78
1 #!/usr/bin/python3
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2015 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
20 '''glibc/localedata/charmaps/UTF-8 file generator script
22 This script generates a glibc/localedata/charmaps/UTF-8 file
23 from Unicode data.
25 Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
27 It will output UTF-8 file
28 '''
30 import sys
31 import re
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

# Short names of the 19 leading consonants (choseong), indexed by
# initial-jamo index.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# Short names of the 21 vowels (jungseong), indexed by medial-jamo index.
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# Short names of the 28 trailing consonants (jongseong, first entry is
# “no final consonant”), indexed by final-jamo index.
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points in the BMP (below 0x10000) use the 4-digit form
    “<UXXXX>”; everything above uses the 8-digit form “<UXXXXXXXX>”.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, digits)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end: hexadecimal strings for the first and last code point
                of the range (as found in UnicodeData.txt).
    outfile:    the open output file object.
    name:       the range name with the “, First”/“, Last” part removed.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the precomposed syllable into its jamo indices:
            # 0xAC00 is the first Hangul syllable; there are 21 medials
            # and 28 finals per initial consonant (Unicode 3.0 book,
            # sections 3.11 and 4.4).
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        # Last (possibly shorter than 64 code points) chunk: close the
        # range at the real end code point and stop.
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10         DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80         <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf         <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80         <Plane 16 Private Use>
    '''
    # Holds the fields of a pending “…, First>” line until the matching
    # “…, Last>” line arrives.
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # fields[1][:-7] strips the trailing “, Last>”; re-append
            # the closing “>” to form e.g. “<CJK Ideograph Extension A>”.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: an integer Unicode code point.
    Returns the “/xNN” byte-sequence string used in glibc charmaps.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') raises UnicodeEncodeError for lone
    # surrogates (U+D800..U+DFFF). The original code used a hard-coded
    # six-entry table that only covered the exact range endpoints found
    # in UnicodeData.txt; computing the 3-byte sequence directly handles
    # every surrogate and avoids rebuilding a dict on each call. The
    # results for the six old table entries are unchanged.
    if 0xD800 <= code_point <= 0xDFFF:
        return '/x{:02x}/x{:02x}/x{:02x}'.format(
            0xE0 | (code_point >> 12),
            0x80 | ((code_point >> 6) & 0x3F),
            0x80 | (code_point & 0x3F))
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    # One write of implicitly concatenated literals instead of eight
    # separate write() calls; the emitted bytes are identical.
    outfile.write(
        "<code_set_name> UTF-8\n"
        "<comment_char> %\n"
        "<escape_char> /\n"
        "<mb_cur_min> 1\n"
        "<mb_cur_max> 6\n"
        "\n"
        "% CHARMAP generated using utf8_gen.py\n"
        "% alias ISO-10646/UTF-8\n"
        "CHARMAP\n")
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file.

    The comment lines record the grep commands whose output the WIDTH
    entries were generated from, so the table can be reproduced.
    '''
    outfile.write('% Character width according to Unicode 7.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines):
    '''Write the WIDTH section entries to the output file.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt.
    '''
    widths = {}
    # Non-spacing marks (bidi class NSM, field 4) and format control
    # characters (general category Cf, field 2) get width 0.
    for uline in ulines:
        ufields = uline.split(";")
        if ufields[4] == "NSM" or ufields[2] == "Cf":
            code = int(ufields[0], 16)
            widths[code] = ucs_symbol(code) + '\t0'

    # If an entry in EastAsianWidth.txt is found, it overrides entries in
    # UnicodeData.txt:
    for eline in elines:
        efields = eline.split(";")
        if '..' not in efields[0]:
            # Single code point.
            code = int(efields[0], 16)
            widths[code] = ucs_symbol(code) + '\t2'
        else:
            # A “XXXX..YYYY” range: drop every width-0 entry inside the
            # range, then record one range entry of width 2.
            first_hex, last_hex = efields[0].split("..")
            first = int(first_hex, 16)
            last = int(last_hex, 16)
            for code in range(first, last + 1):
                widths.pop(code, None)
            widths[first] = '{:s}...{:s}\t2'.format(
                ucs_symbol(first), ucs_symbol(last))

    for code in sorted(widths):
        outfile.write(widths[code] + '\n')
if __name__ == "__main__":
    # Command-line driver: reads UnicodeData.txt and EastAsianWidth.txt
    # and writes the generated charmap to a file named “UTF-8” in the
    # current directory.
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # If characters from EastAsianWidth.txt which are
                # from reserved ranges (i.e. not yet assigned code points)
                # are added to the WIDTH section of the UTF-8 file, then
                # “make check” produces “Unknown Character” errors for
                # these code points because such unassigned code points
                # are not in the CHARMAP section of the UTF-8 file.
                #
                # Therefore, we skip all reserved code points when reading
                # the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                    continue
                # Keep only Wide (W) and Fullwidth (F) entries; these are
                # the width-2 characters.
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
            write_header_width(OUTFILE)
            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
            OUTFILE.write("END WIDTH\n")