2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2023 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <https://www.gnu.org/licenses/>.
21 This script is useful for checking the differences between
22 an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
23 new one generated by gen_unicode_ctype.py
25 To see how it is used, call it with the “-h” option:
27 $ ./ctype_compatibility.py -h
28 … prints usage message …
36 from ctype_compatibility_test_cases
import TEST_CASES
def get_lines_from_file(filename):
    '''Get all non-comment lines from an i18n file

    This is a generator which yields the logical lines of “filename”
    as strings:

    - Everything from the comment character “%” to the end of a line
      is dropped.  Lines consisting only of a comment (and empty lines)
      are yielded as empty strings.
    - Lines which are continued on the next line because they end
      in “/” are merged into a single line, the “/” is removed.

    The file is always decoded as UTF-8, so the result does not depend
    on the locale the script happens to run in.
    '''
    with open(filename, encoding='utf-8') as i18n_file:
        current_line = ''
        for line in i18n_file:
            line = line.strip('\n')
            if '%' in line:
                # A “/” at the very end of the physical line marks a
                # continuation even if it sits behind the comment
                # character, so remember it before cutting the comment.
                continued = line.endswith('/')
                line = line.partition('%')[0]
                if continued:
                    line += '/'
            line = line.strip()
            if line.endswith('/'):
                current_line += line[:-1]
            else:
                yield current_line + line
                current_line = ''
    if current_line:  # file ends with a continuation line
        yield current_line
def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    these are actually pairs of code points.

    Only character classes which occur in the file get a key in the
    returned dictionary.
    '''
    # A line belongs to a character class if it starts either with the
    # bare keyword followed by white space or with the quoted form
    # (“class "name";” or “map "name";”).  The patterns do not depend on
    # the input, so they are compiled once here and not once per line.
    class_patterns = [
        (char_class,
         re.compile(r'^('
                    + r'(?:(?:class|map)\s+")'
                    + re.escape(char_class)
                    + r'(?:";)\s+'
                    + r'|'
                    + re.escape(char_class) + r'\s+'
                    + r')'))
        for char_class in (
            'upper',
            'lower',
            'alpha',
            'digit',
            'outdigit',
            'space',
            'cntrl',
            'punct',
            'graph',
            'print',
            'xdigit',
            'blank',
            'combining',
            'combining_level3',
            'toupper',
            'tolower',
            'totitle')]
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, pattern in class_patterns:
            match = pattern.match(line)
            if match:
                # Everything behind the keyword is the list of code points.
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
    return ctype_dict
def process_chars(char_class_list, code_point_line):
    '''
    Extract Unicode values from code_point_line
    and add to the list of code points in a character class

    “code_point_line” is a “;” separated list.  Each entry must have one
    of these forms:

        <Uxxxx>                 a single code point
        <Uxxxx>..<Uxxxx>        all code points of the range, both ends
                                included
        <Uxxxx>..(2)..<Uxxxx>   every second code point of the range
        (<Uxxxx>,<Uxxxx>)       a pair of code points, stored as a tuple
                                of two integers

    The values are appended to “char_class_list” in place, nothing is
    returned.  If an entry has none of these forms, a message is written
    to stderr and the script exits with status 1.
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
        if match: # <Uxxxx>
            char_class_list.append(int(match.group('codepoint'), 16))
            continue
        match = re.match(
            r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
            + r'\.\.(?P<step>\(2\)\.\.)?'
            + r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
            code_points)
        if match: # <Uxxxx>..<Uxxxx> or <Uxxxx>..(2)..<Uxxxx>
            char_class_list.extend(range(
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16) + 1,
                2 if match.group('step') else 1))
            continue
        match = re.match(
            r'^\('
            + r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
            + r','
            + r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
            + r'\)$',
            code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16)))
            continue
        # Imported here because it is needed on the error path only.
        import sys
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit() rather than the exit() helper, which is only
        # available when the “site” module has been loaded.
        sys.exit(1)
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    Both parameters are dictionaries which use character class names as
    keys and lists of code points as values.

    First the character classes which exist in only one of the two
    dictionaries are listed.  Then, for each character class which
    exists in both, the number of characters on each side is printed and
    the differences are reported by report().
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            # Already listed above as missing from the new ctype; looking
            # it up in new_ctype_dict would raise a KeyError.
            continue
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_ctype_dict[char_class]),
            len(new_ctype_dict[char_class])))
        print("----------------------------------------------------")
        report(char_class,
               old_ctype_dict[char_class],
               new_ctype_dict[char_class])
def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.

    “code_point_list” contains either integers (code points) or pairs of
    integers (mappings like those of “toupper”).  One line is printed
    per entry, in sorted order, showing the character itself, its code
    point in hex and its Unicode name (“name unknown” if the Unicode
    database has no name for it).  For a pair, both sides are shown
    separated by “→”.

    “text” is put into each line behind the character class, the callers
    use it to tell whether the code point is missing or was added.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            source, target = code_point[0], code_point[1]
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(source),
                       'code_point0': hex(source),
                       'name0': unicodedata.name(chr(source), 'name unknown'),
                       'char1': chr(target),
                       'code_point1': hex(target),
                       'name1': unicodedata.name(chr(target), 'name unknown')
                   })
def report(char_class, old_list, new_list):
    '''Print what changed in one LC_CTYPE character class

    “old_list” holds the entries of the class in the old ctype,
    “new_list” those in the newly generated one.  The number of entries
    which are only in the old list and the number of entries which are
    only in the new list are always printed.  The entries themselves are
    listed by report_code_points() only if the command line options
    ARGS.show_missing_characters respectively ARGS.show_added_characters
    are set.
    '''
    old_set = set(old_list)
    new_set = set(new_list)

    only_in_old = list(old_set.difference(new_set))
    missing_summary = ('%(char_class)s: Missing %(number)d characters '
                       + 'of old ctype in new ctype ')
    print(missing_summary
          %{'char_class': char_class, 'number': len(only_in_old)})
    if ARGS.show_missing_characters:
        report_code_points(char_class, only_in_old, 'Missing')

    only_in_new = list(new_set.difference(old_set))
    added_summary = ('%(char_class)s: Added %(number)d characters '
                     + 'in new ctype which were not in old ctype')
    print(added_summary
          %{'char_class': char_class, 'number': len(only_in_new)})
    if ARGS.show_added_characters:
        report_code_points(char_class, only_in_new, 'Added')
def cperror(error_message, errorcounter=0):
    '''Print “error_message” and return “errorcounter” increased by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the character class membership of a list of code points

    The parameter “code_point_list_with_ranges” is a list of
    integers or pairs of integers, for example:

        [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    where the pairs of integers stand for all the code points in the range
    of the two integers given, including the two integers of the pair.

    “char_classes” is a list of tuples.  The first element of each tuple
    is the name of a character class, the second one tells whether the
    code points are expected to be in that class (True) or not (False).

    “ctype_dict” maps the names of the character classes to containers
    of code points.  A character class which has no entry in
    “ctype_dict” is treated as a class without any characters.

    For every code point and character class where the expectation is
    not met, an error message containing “reason” is printed via
    cperror().  The return value is “errorcounter” increased by the
    number of errors found.
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            code_points = [code_point_range]
        else:
            code_points = range(code_point_range[0], code_point_range[1] + 1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                members = ctype_dict.get(char_class, ())
                if (code_point in members) != in_char_class:
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             'in': not in_char_class,
                             'reason': reason},
                        errorcounter)
    return errorcounter
# Character classes which are looked up by the checks in tests().
_CLASSES_USED_BY_TESTS = (
    'upper', 'lower', 'alpha', 'digit', 'space', 'cntrl',
    'punct', 'graph', 'print', 'xdigit', 'toupper', 'tolower')

# Pairs of character classes which must not have a character in common:
#
# alpha restriction: "No character specified for the keywords cntrl,
# digit, punct or space shall be specified."
#
# space restriction: "No character specified for the keywords upper,
# lower, alpha, digit, graph or xdigit shall be specified."
# upper, lower, alpha are covered by the alpha checks.
#
# cntrl restriction: "No character specified for the keywords upper,
# lower, alpha, digit, punct, graph, print or xdigit shall be
# specified."  upper, lower, alpha are covered by the alpha checks.
# This also covers the graph and print restrictions: "No character
# specified for the keyword cntrl shall be specified."
#
# punct restriction: "No character specified for the keywords upper,
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
# be specified."  upper, lower, alpha, cntrl are covered by the checks
# before, the <space> character is checked separately in tests().
_EXCLUSIVE_CLASS_PAIRS = (
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)

def _code_point_error(template, code_point, errorcounter):
    '''Fill “template” (keys “char” and “cp”) for “code_point”, print it
    via cperror() and return the increased error counter.
    '''
    return cperror(
        template %{'char': chr(code_point), 'cp': hex(code_point)},
        errorcounter)

def tests(ctype_dict, errorcounter=0):
    '''Test a LC_CTYPE character class dictionary for known errors

    “ctype_dict” maps the names of the character classes to lists of
    code points (or lists of pairs of code points for the mappings).

    First all entries of TEST_CASES are checked with cpcheck(), then
    every code point from 0 to 0x10FFFF is checked against the
    restrictions quoted in the comments.  Character classes which are
    missing in “ctype_dict” are treated as empty.

    Each violation is printed via cperror().  The return value is
    “errorcounter” increased by the number of errors found.
    '''
    # copy the information from ctype_dict (which contains lists) in
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key, values in ctype_dict.items():
        if values and not isinstance(values[0], int):
            # key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {value[0]: value[1] for value in values}
        else:
            ctype_dict2[key] = dict.fromkeys(values, 1)
    for key in _CLASSES_USED_BY_TESTS:
        # A class which does not occur in the file has no characters.
        ctype_dict2.setdefault(key, {})

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter=errorcounter)

    # Look the classes up once, not once per code point.
    upper = ctype_dict2['upper']
    lower = ctype_dict2['lower']
    alpha = ctype_dict2['alpha']
    space = ctype_dict2['space']
    punct = ctype_dict2['punct']
    graph = ctype_dict2['graph']
    printable = ctype_dict2['print']
    case_mappings = (('toupper', ctype_dict2['toupper']),
                     ('tolower', ctype_dict2['tolower']))
    exclusive_checks = [
        (ctype_dict2[first], ctype_dict2[second],
         'error: %(char)s %(cp)s is ' + first + ' and ' + second)
        for first, second in _EXCLUSIVE_CLASS_PAIRS]

    for code_point in range(0, 0x110000):
        cased = code_point in lower or code_point in upper
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified.
        for mapping_name, mapping in case_mappings:
            if (code_point in mapping
                    and code_point != mapping[code_point]
                    and not cased):
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     + 'but %(map)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'map': mapping_name,
                         'char1': chr(code_point),
                         'cp1': hex(code_point),
                         'cp2': hex(mapping[code_point]),
                         'char2': chr(mapping[code_point])
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if cased and code_point not in alpha:
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is upper|lower but not alpha',
                code_point, errorcounter)
        for first_class, second_class, template in exclusive_checks:
            if code_point in first_class and code_point in second_class:
                errorcounter = _code_point_error(
                    template, code_point, errorcounter)
        # punct restriction: the <space> character shall not be specified.
        if code_point in punct and code_point == 0x0020:
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is punct.',
                code_point, errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in printable
                and not (code_point in graph or code_point in space)):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is print but not graph|space',
                code_point, errorcounter)
        if (code_point not in printable
                and (code_point in graph or code_point == 0x0020)):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s graph|space but not print',
                code_point, errorcounter)
    return errorcounter
493 if __name__
== "__main__":
494 PARSER
= argparse
.ArgumentParser(
496 Compare the contents of LC_CTYPE in two files and check for errors.
499 '-o', '--old_ctype_file',
503 help='The old ctype file, default: %(default)s')
505 '-n', '--new_ctype_file',
508 default
='unicode-ctype',
509 help='The new ctype file, default: %(default)s')
511 '-a', '--show_added_characters',
513 help=('Show characters which were added to each '
514 + 'character class in detail.'))
516 '-m', '--show_missing_characters',
518 help=('Show characters which were removed from each '
519 + 'character class in detail.'))
520 ARGS
= PARSER
.parse_args()
522 OLD_CTYPE_DICT
= extract_character_classes(
524 NEW_CTYPE_DICT
= extract_character_classes(
526 compare_lists(OLD_CTYPE_DICT
, NEW_CTYPE_DICT
)
527 print('============================================================')
528 print('Checking for errors in old ctype file: %s' %ARGS
.old_ctype_file
)
529 print('------------------------------------------------------------')
530 NUMBER_OF_ERRORS_IN_OLD_FILE
= tests(OLD_CTYPE_DICT
, errorcounter
= 0)
531 print('------------------------------------------------------------')
532 print('Old file = %s' %ARGS
.old_ctype_file
)
533 print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE
)
534 print('------------------------------------------------------------')
535 print('============================================================')
536 print('Checking for errors in new ctype file: %s' %ARGS
.new_ctype_file
)
537 print('------------------------------------------------------------')
538 NUMBER_OF_ERRORS_IN_NEW_FILE
= tests(NEW_CTYPE_DICT
, errorcounter
= 0)
539 print('------------------------------------------------------------')
540 print('New file = %s' %ARGS
.new_ctype_file
)
541 print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE
)
542 print('------------------------------------------------------------')
543 if NUMBER_OF_ERRORS_IN_NEW_FILE
> 0: