Amendments to Unicode 7 update.
[glibc.git] / localedata / unicode-gen / utf8_compatibility.py
blobb84a1eb3de8f182ca0e32397257113d1fb1a9871
1 #!/usr/bin/python3
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2015 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
20 '''
21 This script is useful for checking backward compatibility of newly
22 generated UTF-8 file from utf8_gen.py script
24 To see how this script is used, call it with the “-h” option:
26 $ ./utf8_compatibility.py -h
27 … prints usage message …
28 '''
30 import sys
31 import re
32 import argparse
# Dictionary holding the entire contents of the UnicodeData.txt file.
# It is filled by fill_attributes(); the keys are integer code points
# and each value is a dictionary with one entry per UnicodeData.txt
# field.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the EastAsianWidth.txt file.
# It is filled by fill_east_asian_widths(); the keys are integer code
# points and the values are the property strings read from the file.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
    '''Record one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

    code_point is the key the entry is stored under and fields is the
    list of semicolon separated fields of one UnicodeData.txt line
    (indexes 1 to 14 are read).  Fields 1 to 11 are stored unchanged
    as strings; the three case mappings in fields 12 to 14 are parsed
    as hexadecimal numbers, an empty field being stored as None.

    '''
    text_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    entry = {key: fields[index]
             for index, key in enumerate(text_keys, start=1)}
    # Uppercase, lowercase and titlecase mappings, in file order.
    for index, key in enumerate(('upper', 'lower', 'title'), start=12):
        raw_mapping = fields[index]
        entry[key] = int(raw_mapping, 16) if raw_mapping else None
    UNICODE_ATTRIBUTES[code_point] = entry
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets the attributes of the range
    with the name reduced to the part before the comma (for example
    “CJK Ideograph”).  Lines with general category “Cs” are skipped.

    The script is terminated with exit status 1 when a line does not
    have exactly 15 fields or when a “Last” line does not agree with
    the preceding “First” line.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of a pending “…, First>” line; empty when no range is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit() instead of the exit() builtin: the latter is
                # only installed by the site module.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself both lines of a
                # range must be identical.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
def fill_east_asian_widths(filename):
    '''Read the EastAsianWidth.txt file into EAST_ASIAN_WIDTHS.

    A line of that file describes either a range of code points:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or one single code point:

    A015;W           # Lm         YI SYLLABLE WU

    Each code point covered by a line is mapped to the property
    string following the semicolon.  Lines which do not have this
    shape are ignored.
    '''
    entry_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            found = entry_pattern.match(line)
            if found:
                first = found.group('codepoint1')
                # A single code point is a range ending where it starts.
                last = found.group('codepoint2') or first
                EAST_ASIAN_WIDTHS.update(dict.fromkeys(
                    range(int(first, 16), int(last, 16)+1),
                    found.group('property')))
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with 4 upper case
    hexadecimal digits, all others with 8, for example <U0041> and
    <U00010000>.
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Only the lines between a line starting with “CHARMAP” and a line
    starting with “END CHARMAP” are read; lines starting with “%” and
    lines which do not look like a CHARMAP entry are skipped.  An
    entry is either a single code point or a range written with two
    dots:

    <U0041> /x41
    <U3400>..<U343F> /xe3/x90/x80

    Returns a dictionary mapping each integer code point to the byte
    sequence string of its line (for example '/xe3/x90/x80'); all code
    points of a range share the string of that line.

    The script is terminated with exit status 1 when the file has no
    complete CHARMAP section.
    '''
    # (?:…) is a non-capturing group for the optional range end; the
    # character classes accept upper case hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(?:/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # The same file iterator continues right after that line.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    # sys.exit() instead of the exit() builtin: the latter is only
    # installed by the site module.
    sys.exit(1)
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints how many code points were removed from, changed in and
    added to the CHARMAP section of new_file_name compared to
    original_file_name.  The code points themselves are listed when
    ARGS.show_missing_characters, ARGS.show_changed_characters or
    ARGS.show_added_characters, respectively, is set.
    '''
    def character_name(key):
        # UNICODE_ATTRIBUTES need not contain the code point (it is
        # only filled from a UnicodeData.txt file).  The '{:s}' format
        # used below raises TypeError for the None object, so fall
        # back to the string 'None'.
        if key in UNICODE_ATTRIBUTES:
            return UNICODE_ATTRIBUTES[key]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    removed = sorted(set(ocharmap) - set(ncharmap))
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(removed))
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                character_name(key)))
    print('------------------------------------------------------------')
    # Code points present in both files whose byte sequence differs,
    # mapped to the (old, new) pair.
    changed_charmap = {
        key: (ocharmap[key], ncharmap[key])
        for key in set(ocharmap).intersection(set(ncharmap))
        if ocharmap[key] != ncharmap[key]}
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                character_name(key)))
    print('------------------------------------------------------------')
    added = sorted(set(ncharmap) - set(ocharmap))
    print('Total added characters in newly generated CHARMAP: %d'
          %len(added))
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                character_name(key)))
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Only the lines between a line starting with “WIDTH” and a line
    starting with “END WIDTH” are read.  An entry is either a single
    code point or a range written with three dots, followed by the
    width 0 or 2:

    <U0300> 0
    <U1100>...<U115F> 2

    Lines which do not look like this are skipped.  Returns a
    dictionary mapping each integer code point to its integer width.

    The script is terminated with exit status 1 when the file has no
    complete WIDTH section, like create_charmap_dictionary() does for
    the CHARMAP section.
    '''
    # (?:…) is a non-capturing group for the optional range end; the
    # character classes accept upper case hexadecimal digits only.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # The same file iterator continues right after that line.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    # Callers need a dictionary, so there is nothing sensible to
    # return here.
    sys.exit(1)
def _width_details(key):
    '''Return the “eaw=… category=… bidi=… name=…” part of a WIDTH
    report line for the code point key.

    EAST_ASIAN_WIDTHS and UNICODE_ATTRIBUTES need not contain the code
    point (they are only filled from the optional Unicode data files).
    The string 'None' is used for missing values because the '{:s}'
    style formats raise TypeError for the None object.
    '''
    attributes = UNICODE_ATTRIBUTES.get(key)
    if attributes is None:
        category = bidi = name = 'None'
    else:
        category = attributes['category']
        bidi = attributes['bidi']
        name = attributes['name']
    return ('eaw={:s} '.format(EAST_ASIAN_WIDTHS.get(key, 'None'))
            + 'category={:2s} '.format(category)
            + 'bidi={:3s} '.format(bidi)
            + 'name={:s}'.format(name))

def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file

    Prints how many code points were removed from, changed in and
    added to the WIDTH section of new_file_name compared to
    original_file_name.  The code points themselves are listed when
    ARGS.show_missing_characters, ARGS.show_changed_characters or
    ARGS.show_added_characters, respectively, is set.
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    removed = sorted(set(owidth) - set(nwidth))
    print('Total removed characters in newly generated WIDTH: %d'
          %len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _width_details(key))
    print('------------------------------------------------------------')
    # Code points present in both files whose width differs, mapped
    # to the (old, new) pair.
    changed_width = {
        key: (owidth[key], nwidth[key])
        for key in set(owidth).intersection(set(nwidth))
        if owidth[key] != nwidth[key]}
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + _width_details(key))
    print('------------------------------------------------------------')
    added = sorted(set(nwidth) - set(owidth))
    print('Total added characters in newly generated WIDTH: %d'
          %len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _width_details(key))
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # Options taking a file name:
    # (short flag, long flag, mandatory, help text)
    for _SHORT, _LONG, _REQUIRED, _HELP in (
            ('-o', '--old_utf8_file', True,
             'The old UTF-8 file.'),
            ('-n', '--new_utf8_file', True,
             'The new UTF-8 file.'),
            ('-u', '--unicode_data_file', False,
             'The UnicodeData.txt file to read.'),
            ('-e', '--east_asian_width_file', False,
             'The EastAsianWidth.txt file to read.')):
        PARSER.add_argument(
            _SHORT, _LONG,
            nargs='?',
            required=_REQUIRED,
            type=str,
            help=_HELP)
    # Switches selecting which code points are listed one by one:
    # (short flag, long flag, help text)
    for _SHORT, _LONG, _HELP in (
            ('-a', '--show_added_characters',
             'Show characters which were added in detail.'),
            ('-m', '--show_missing_characters',
             'Show characters which were removed in detail.'),
            ('-c', '--show_changed_characters',
             'Show characters whose width was changed in detail.')):
        PARSER.add_argument(
            _SHORT, _LONG,
            action='store_true',
            help=_HELP)
    # ARGS is read as a global by check_charmap() and check_width().
    ARGS = PARSER.parse_args()

    # The two Unicode data files are optional and are read only when
    # they were given on the command line.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    for _CHECK in (check_charmap, check_width):
        _CHECK(ARGS.old_utf8_file, ARGS.new_utf8_file)