localedata/unicode-gen/ctype_compatibility.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2014-2023 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <https://www.gnu.org/licenses/>.
  19
  20 '''
  21 This script is useful for checking the differences between
  22 an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
  23 new one generated by gen_unicode_ctype.py
  24
  25 To see how it is used, call it with the “-h” option:
  26
  27     $ ./ctype_compatibility.py -h
  28     … prints usage message …
  29 '''
  30
  31 import sys
  32 import re
  33 import unicodedata
  34 import argparse
  35
  36 from ctype_compatibility_test_cases import TEST_CASES
  37
  38 def get_lines_from_file(filename):
  39     '''Get all non-comment lines from a i18n file
  40
  41     Also merge all lines which are continued on the next line because
  42     they end in “/” into a single line.
  43     '''
  44     with open(filename) as i18n_file:
  45         current_line = ''
  46         for line in i18n_file:
  47             line = line.strip('\n')
  48             if '%' in line:
  49                 if line.endswith('/'):
  50                     line = line[0:line.find('%')] + '/'
  51                 else:
  52                     line = line[0:line.find('%')]
  53             line = line.strip()
  54             if line.endswith('/'):
  55                 current_line += line[:-1]
  56             else:
  57                 yield current_line + line
  58                 current_line = ''
  59     if current_line: # file ends with a continuation line
  60         yield current_line
  61
  62 def extract_character_classes(filename):
  63     '''Get all Unicode code points for each character class from a file
  64
  65     Store these code points in a dictionary using the character classes
  66     as keys and the list of code points in this character class as values.
  67
  68     In case  of the character classes “toupper”, “tolower”, and “totitle”,
  69     these area actually pairs of code points
  70     '''
  71     ctype_dict = {}
  72     for line in get_lines_from_file(filename):
  73         for char_class in [
  74                 'upper',
  75                 'lower',
  76                 'alpha',
  77                 'digit',
  78                 'outdigit',
  79                 'space',
  80                 'cntrl',
  81                 'punct',
  82                 'graph',
  83                 'print',
  84                 'xdigit',
  85                 'blank',
  86                 'combining',
  87                 'combining_level3',
  88                 'toupper',
  89                 'tolower',
  90                 'totitle']:
  91             match = re.match(r'^('
  92                              r'(?:(?:class|map)\s+")'
  93                              +re.escape(char_class)+
  94                              r'(?:";)\s+'
  95                              r'|'
  96                              +re.escape(char_class)+r'\s+'+
  97                              r')', line)
  98             if match:
  99                 if char_class not in ctype_dict:
 100                     ctype_dict[char_class] = []
 101                 process_chars(
 102                     ctype_dict[char_class],
 103                     line[match.end():])
 104     return ctype_dict
 105
 106 def process_chars(char_class_list, code_point_line):
 107     '''
 108     Extract Unicode values from code_point_line
 109     and add to the list of code points in a character class
 110     '''
 111     for code_points in code_point_line.split(';'):
 112         code_points = code_points.strip()
 113         match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
 114         if match: # <Uxxxx>
 115             char_class_list.append(
 116                 int(match.group('codepoint'), 16))
 117             continue
 118         match = re.match(
 119             r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
 120             r'\.\.'
 121             r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
 122             code_points)
 123         if match: # <Uxxxx>..<Uxxxx>
 124             for codepoint in range(
 125                     int(match.group('codepoint1'), 16),
 126                     int(match.group('codepoint2'), 16) + 1):
 127                 char_class_list.append(codepoint)
 128             continue
 129         match = re.match(
 130             r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
 131             r'\.\.\(2\)\.\.'
 132             r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
 133             code_points)
 134         if match: # <Uxxxx>..(2)..<Uxxxx>
 135             for codepoint in range(
 136                     int(match.group('codepoint1'), 16),
 137                     int(match.group('codepoint2'), 16) + 1,
 138                     2):
 139                 char_class_list.append(codepoint)
 140             continue
 141         match = re.match(
 142             r'^\('
 143             r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
 144             r','
 145             r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
 146             r'\)$',
 147             code_points)
 148         if match: # (<Uxxxx>,<Uxxxx>)
 149             char_class_list.append((
 150                 int(match.group('codepoint1'), 16),
 151                 int(match.group('codepoint2'), 16)))
 152             continue
 153         sys.stderr.write(
 154             ('None of the regexps matched '
 155              + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
 156             'cp': code_points,
 157             'cpl': code_point_line
 158         })
 159         exit(1)
 160
 161 def compare_lists(old_ctype_dict, new_ctype_dict):
 162     '''Compare character classes in the old and the new LC_CTYPE'''
 163     print('****************************************************')
 164     print('Character classes which are only in the new '
 165           + 'or only in the old file:')
 166     for char_class in sorted(old_ctype_dict):
 167         if char_class not in new_ctype_dict:
 168             print('Character class %s is in old ctype but not in new ctype'
 169                   %char_class)
 170     for char_class in sorted(new_ctype_dict):
 171         if char_class not in old_ctype_dict:
 172             print('Character class %s is in new ctype but not in old ctype'
 173                   %char_class)
 174     for char_class in sorted(old_ctype_dict):
 175         print("****************************************************")
 176         print("%s: %d chars in old ctype and %d chars in new ctype" %(
 177             char_class,
 178             len(old_ctype_dict[char_class]),
 179             len(new_ctype_dict[char_class])))
 180         print("----------------------------------------------------")
 181         report(char_class,
 182                old_ctype_dict[char_class],
 183                new_ctype_dict[char_class])
 184
 185 def report_code_points(char_class, code_point_list, text=''):
 186     '''Report all code points which have been added to or removed from a
 187     character class.
 188     '''
 189     for code_point in sorted(code_point_list):
 190         if type(code_point) == type(int()):
 191             print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
 192                   %{'text': text,
 193                     'char': chr(code_point),
 194                     'char_class': char_class,
 195                     'code_point': hex(code_point),
 196                     'name': unicodedata.name(chr(code_point), 'name unknown')})
 197         else:
 198             print(('%(char_class)s: %(text)s: '
 199                    + '%(char0)s → %(char1)s '
 200                    + '%(code_point0)s → %(code_point1)s '
 201                    + '%(name0)s → %(name1)s') %{
 202                 'text': text,
 203                 'char_class': char_class,
 204                 'char0': chr(code_point[0]),
 205                 'code_point0': hex(code_point[0]),
 206                 'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
 207                 'char1': chr(code_point[1]),
 208                 'code_point1': hex(code_point[1]),
 209                 'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
 210             })
 211
 212 def report(char_class, old_list, new_list):
 213     '''Report the differences for a certain LC_CTYPE character class
 214     between the old and the newly generated state
 215     '''
 216     missing_chars = list(set(old_list)-set(new_list))
 217     print(('%(char_class)s: Missing %(number)d characters '
 218            + 'of old ctype in new ctype ')
 219           %{'char_class': char_class, 'number': len(missing_chars)})
 220     if ARGS.show_missing_characters:
 221         report_code_points(char_class, missing_chars, 'Missing')
 222     added_chars = list(set(new_list)-set(old_list))
 223     print(('%(char_class)s: Added %(number)d characters '
 224            + 'in new ctype which were not in old ctype')
 225           %{'char_class': char_class, 'number': len(added_chars)})
 226     if ARGS.show_added_characters:
 227         report_code_points(char_class, added_chars, 'Added')
 228
 229
 230 def cperror(error_message, errorcounter=0):
 231     '''Increase number of errors by one and print an error message'''
 232     print(error_message)
 233     return errorcounter + 1
 234
 235 def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
 236             errorcounter=0):
 237     '''The parameter “code_point_list_with_ranges” is a list of
 238     integers or pairs of integers, for example:
 239
 240     [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]
 241
 242     where the pairs of integers stand for all the code points in the range
 243     of the two integers given, including the two integers of the pair.
 244
 245     '''
 246     for code_point_range in code_point_list_with_ranges:
 247         for code_point in ([code_point_range]
 248                            if type(code_point_range) == type(int())
 249                            else range(code_point_range[0],
 250                                       code_point_range[1]+1)):
 251             for char_class_tuple in char_classes:
 252                 char_class = char_class_tuple[0]
 253                 in_char_class = char_class_tuple[1]
 254                 if (code_point in ctype_dict[char_class]) != in_char_class:
 255                     errorcounter = cperror(
 256                         ('error: %(code_point)s %(char)s '
 257                          + '%(char_class)s %(in)s: %(reason)s') %{
 258                              'code_point': hex(code_point),
 259                              'char': chr(code_point),
 260                              'char_class': char_class,
 261                              'in': not in_char_class,
 262                              'reason': reason},
 263                         errorcounter)
 264     return errorcounter
 265
 266 def tests(ctype_dict, errorcounter = 0):
 267     '''Test a LC_CTYPE character class dictionary for known errors'''
 268     # copy the information from ctype_dict (which contains lists) in
 269     # a new dictionary ctype_dict2 (which contains dictionaries).
 270     # The checks below are easier with that type of data structure.
 271
 272     ctype_dict2 = {}
 273     for key in ctype_dict:
 274         ctype_dict2[key] = {}
 275         if ctype_dict[key]:
 276             if type(ctype_dict[key][0]) == type(int()):
 277                 for value in ctype_dict[key]:
 278                     ctype_dict2[key][value] = 1
 279             else: # key is 'toupper', 'tolower', or 'totitle'
 280                 for value in ctype_dict[key]:
 281                     ctype_dict2[key][value[0]] = value[1]
 282
 283     for test_case in TEST_CASES:
 284         errorcounter = cpcheck(ctype_dict2,
 285                                test_case[0],
 286                                test_case[1],
 287                                test_case[2],
 288                                errorcounter = errorcounter)
 289
 290     for code_point in range(0, 0x110000):
 291         # toupper restriction: "Only characters specified for the keywords
 292         # lower and upper shall be specified.
 293         if (code_point in ctype_dict2['toupper']
 294             and code_point != ctype_dict2['toupper'][code_point]
 295             and not (code_point in ctype_dict2['lower']
 296                      or code_point in ctype_dict2['upper'])):
 297             errorcounter = cperror(
 298                 ('error: %(char1)s is not upper|lower '
 299                  + 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
 300                      'char1': chr(code_point),
 301                      'cp1': hex(code_point),
 302                      'cp2': hex(ctype_dict2['toupper'][code_point]),
 303                      'char2': chr(ctype_dict2['toupper'][code_point])
 304                  },
 305                 errorcounter)
 306         # tolower restriction: "Only characters specified for the keywords
 307         # lower and upper shall be specified.
 308         if (code_point in ctype_dict2['tolower']
 309             and code_point != ctype_dict2['tolower'][code_point]
 310             and not (code_point in ctype_dict2['lower']
 311                      or code_point in ctype_dict2['upper'])):
 312             errorcounter = cperror(
 313                 ('error: %(char1)s is not upper|lower '
 314                  + 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
 315                      'char1': chr(code_point),
 316                      'cp1': hex(code_point),
 317                      'cp2': hex(ctype_dict2['tolower'][code_point]),
 318                      'char2': chr(ctype_dict2['tolower'][code_point])
 319                  },
 320                 errorcounter)
 321         # alpha restriction: "Characters classified as either upper or lower
 322         # shall automatically belong to this class.
 323         if ((code_point in ctype_dict2['lower']
 324              or code_point in ctype_dict2['upper'])
 325             and code_point not in ctype_dict2['alpha']):
 326             errorcounter = cperror(
 327                 'error: %(char)s %(cp)s is upper|lower but not alpha' %{
 328                     'char': chr(code_point),
 329                     'cp': hex(code_point)
 330                 },
 331                 errorcounter)
 332         # alpha restriction: "No character specified for the keywords cntrl,
 333         # digit, punct or space shall be specified."
 334         if (code_point in ctype_dict2['alpha']
 335             and code_point in ctype_dict2['cntrl']):
 336             errorcounter = cperror(
 337                 'error: %(char)s %(cp)s is alpha and cntrl' %{
 338                     'char': chr(code_point),
 339                     'cp': hex(code_point)
 340                 },
 341                 errorcounter)
 342         if (code_point in ctype_dict2['alpha']
 343             and code_point in ctype_dict2['digit']):
 344             errorcounter = cperror(
 345                 'error: %(char)s %(cp)s is alpha and digit' %{
 346                     'char': chr(code_point),
 347                     'cp': hex(code_point)
 348                 },
 349                 errorcounter)
 350         if (code_point in ctype_dict2['alpha']
 351             and code_point in ctype_dict2['punct']):
 352             errorcounter = cperror(
 353                 'error: %(char)s %(cp)s is alpha and punct' %{
 354                     'char': chr(code_point),
 355                     'cp': hex(code_point)
 356                 },
 357                 errorcounter)
 358         if (code_point in ctype_dict2['alpha']
 359             and code_point in ctype_dict2['space']):
 360             errorcounter = cperror(
 361                 'error: %(char)s %(cp)s is alpha and space' %{
 362                     'char': chr(code_point),
 363                     'cp': hex(code_point)
 364                 },
 365                 errorcounter)
 366         # space restriction: "No character specified for the keywords upper,
 367         # lower, alpha, digit, graph or xdigit shall be specified."
 368         # upper, lower, alpha already checked above.
 369         if (code_point in ctype_dict2['space']
 370             and code_point in ctype_dict2['digit']):
 371             errorcounter = cperror(
 372                 'error: %(char)s %(cp)s is space and digit' %{
 373                     'char': chr(code_point),
 374                     'cp': hex(code_point)
 375                 },
 376                 errorcounter)
 377         if (code_point in ctype_dict2['space']
 378             and code_point in ctype_dict2['graph']):
 379             errorcounter = cperror(
 380                 'error: %(char)s %(cp)s is space and graph' %{
 381                     'char': chr(code_point),
 382                     'cp': hex(code_point)
 383                 },
 384                 errorcounter)
 385         if (code_point in ctype_dict2['space']
 386             and code_point in ctype_dict2['xdigit']):
 387             errorcounter = cperror(
 388                 'error: %(char)s %(cp)s is space and xdigit' %{
 389                     'char': chr(code_point),
 390                     'cp': hex(code_point)
 391                 },
 392                 errorcounter)
 393         # cntrl restriction: "No character specified for the keywords upper,
 394         # lower, alpha, digit, punct, graph, print or xdigit shall be
 395         # specified."  upper, lower, alpha already checked above.
 396         if (code_point in ctype_dict2['cntrl']
 397             and code_point in ctype_dict2['digit']):
 398             errorcounter = cperror(
 399                 'error: %(char)s %(cp)s is cntrl and digit' %{
 400                     'char': chr(code_point),
 401                     'cp': hex(code_point)
 402                 },
 403                 errorcounter)
 404         if (code_point in ctype_dict2['cntrl']
 405             and code_point in ctype_dict2['punct']):
 406             errorcounter = cperror(
 407                 'error: %(char)s %(cp)s is cntrl and punct' %{
 408                     'char': chr(code_point),
 409                     'cp': hex(code_point)
 410                 },
 411                 errorcounter)
 412         if (code_point in ctype_dict2['cntrl']
 413             and code_point in ctype_dict2['graph']):
 414             errorcounter = cperror(
 415                 'error: %(char)s %(cp)s is cntrl and graph' %{
 416                     'char': chr(code_point),
 417                     'cp': hex(code_point)
 418                 },
 419                 errorcounter)
 420         if (code_point in ctype_dict2['cntrl']
 421             and code_point in ctype_dict2['print']):
 422             errorcounter = cperror(
 423                 'error: %(char)s %(cp)s is cntrl and print' %{
 424                     'char': chr(code_point),
 425                     'cp': hex(code_point)
 426                 },
 427                 errorcounter)
 428         if (code_point in ctype_dict2['cntrl']
 429             and code_point in ctype_dict2['xdigit']):
 430             errorcounter = cperror(
 431                 'error: %(char)s %(cp)s is cntrl and xdigit' %{
 432                     'char': chr(code_point),
 433                     'cp': hex(code_point)
 434                 },
 435                 errorcounter)
 436         # punct restriction: "No character specified for the keywords upper,
 437         # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 438         # be specified."  upper, lower, alpha, cntrl already checked above.
 439         if (code_point in ctype_dict2['punct']
 440             and code_point in ctype_dict2['digit']):
 441             errorcounter = cperror(
 442                 'error: %(char)s %(cp)s is punct and digit' %{
 443                     'char': chr(code_point),
 444                     'cp': hex(code_point)
 445                 },
 446                 errorcounter)
 447         if (code_point in ctype_dict2['punct']
 448             and code_point in ctype_dict2['xdigit']):
 449             errorcounter = cperror(
 450                 'error: %(char)s %(cp)s is punct and xdigit' %{
 451                     'char': chr(code_point),
 452                     'cp': hex(code_point)
 453                 },
 454                 errorcounter)
 455         if (code_point in ctype_dict2['punct']
 456             and code_point == 0x0020):
 457             errorcounter = cperror(
 458                 'error: %(char)s %(cp)s is punct.' %{
 459                     'char': chr(code_point),
 460                     'cp': hex(code_point)
 461                 },
 462                 errorcounter)
 463         # graph restriction: "No character specified for the keyword cntrl
 464         # shall be specified."  Already checked above.
 465
 466         # print restriction: "No character specified for the keyword cntrl
 467         # shall be specified."  Already checked above.
 468
 469         # graph - print relation: differ only in the <space> character.
 470         # How is this possible if there are more than one space character?!
 471         # I think susv2/xbd/locale.html should speak of "space characters",
 472         # not "space character".
 473         if (code_point in ctype_dict2['print']
 474             and not (code_point in ctype_dict2['graph']
 475                      or code_point in ctype_dict2['space'])):
 476             errorcounter = cperror(
 477                 'error: %(char)s %(cp)s is print but not graph|space' %{
 478                     'char': chr(code_point),
 479                     'cp': hex(code_point)
 480                 },
 481                 errorcounter)
 482         if (code_point not in ctype_dict2['print']
 483             and (code_point in ctype_dict2['graph']
 484                  or code_point ==  0x0020)):
 485             errorcounter = cperror(
 486                 'error: %(char)s %(cp)s graph|space but not print' %{
 487                     'char': chr(code_point),
 488                     'cp': hex(code_point)
 489                 },
 490                 errorcounter)
 491     return errorcounter
 492
 493 if __name__ == "__main__":
 494     PARSER = argparse.ArgumentParser(
 495         description='''
 496         Compare the contents of LC_CTYPE in two files and check for errors.
 497         ''')
 498     PARSER.add_argument(
 499         '-o', '--old_ctype_file',
 500         nargs='?',
 501         type=str,
 502         default='i18n',
 503         help='The old ctype file, default: %(default)s')
 504     PARSER.add_argument(
 505         '-n', '--new_ctype_file',
 506         nargs='?',
 507         type=str,
 508         default='unicode-ctype',
 509         help='The new ctype file, default: %(default)s')
 510     PARSER.add_argument(
 511         '-a', '--show_added_characters',
 512         action='store_true',
 513         help=('Show characters which were added to each '
 514               + 'character class in detail.'))
 515     PARSER.add_argument(
 516         '-m', '--show_missing_characters',
 517         action='store_true',
 518         help=('Show characters which were removed from each '
 519               + 'character class in detail.'))
 520     ARGS = PARSER.parse_args()
 521
 522     OLD_CTYPE_DICT = extract_character_classes(
 523         ARGS.old_ctype_file)
 524     NEW_CTYPE_DICT = extract_character_classes(
 525         ARGS.new_ctype_file)
 526     compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
 527     print('============================================================')
 528     print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
 529     print('------------------------------------------------------------')
 530     NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
 531     print('------------------------------------------------------------')
 532     print('Old file = %s' %ARGS.old_ctype_file)
 533     print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
 534     print('------------------------------------------------------------')
 535     print('============================================================')
 536     print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
 537     print('------------------------------------------------------------')
 538     NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
 539     print('------------------------------------------------------------')
 540     print('New file = %s' %ARGS.new_ctype_file)
 541     print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
 542     print('------------------------------------------------------------')
 543     if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
 544         exit(1)
 545     else:
 546         exit(0)