localedata/unicode-gen/utf8_compatibility.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2014-2015 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <http://www.gnu.org/licenses/>.
  19
  20 '''
This script is useful for checking the backward compatibility of a newly
generated UTF-8 file produced by the utf8_gen.py script.
  23
  24 To see how this script is used, call it with the “-h” option:
  25
  26     $ ./utf8_compatibility.py -h
  27     … prints usage message …
  28 '''
  29
  30 import sys
  31 import re
  32 import argparse
  33
  34 # Dictionary holding the entire contents of the UnicodeData.txt file
  35 #
  36 # Contents of this dictionary look like this:
  37 #
  38 # {0: {'category': 'Cc',
  39 #      'title': None,
  40 #      'digit': '',
  41 #      'name': '<control>',
  42 #      'bidi': 'BN',
  43 #      'combining': '0',
  44 #      'comment': '',
  45 #      'oldname': 'NULL',
  46 #      'decomposition': '',
  47 #      'upper': None,
  48 #      'mirrored': 'N',
  49 #      'lower': None,
  50 #      'decdigit': '',
  51 #      'numeric': ''},
  52 #      …
  53 # }
  54 UNICODE_ATTRIBUTES = {}
  55
  56 # Dictionary holding the entire contents of the EastAsianWidths.txt file
  57 #
  58 # Contents of this dictionary look like this:
  59 #
  60 # {0: 'N', … , 45430: 'W', …}
  61 EAST_ASIAN_WIDTHS = {}
  62
  63 def fill_attribute(code_point, fields):
  64     '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
  65
  66     One entry in the UNICODE_ATTRIBUTES dictionary represents one line
  67     in the UnicodeData.txt file.
  68
  69     '''
  70     UNICODE_ATTRIBUTES[code_point] =  {
  71         'name': fields[1],          # Character name
  72         'category': fields[2],      # General category
  73         'combining': fields[3],     # Canonical combining classes
  74         'bidi': fields[4],          # Bidirectional category
  75         'decomposition': fields[5], # Character decomposition mapping
  76         'decdigit': fields[6],      # Decimal digit value
  77         'digit': fields[7],         # Digit value
  78         'numeric': fields[8],       # Numeric value
  79         'mirrored': fields[9],      # mirrored
  80         'oldname': fields[10],      # Old Unicode 1.0 name
  81         'comment': fields[11],      # comment
  82         # Uppercase mapping
  83         'upper': int(fields[12], 16) if fields[12] else None,
  84         # Lowercase mapping
  85         'lower': int(fields[13], 16) if fields[13] else None,
  86         # Titlecase mapping
  87         'title': int(fields[14], 16) if fields[14] else None,
  88     }
  89
  90 def fill_attributes(filename):
  91     '''Stores the entire contents of the UnicodeData.txt file
  92     in the UNICODE_ATTRIBUTES dictionary.
  93
  94     A typical line for a single code point in UnicodeData.txt looks
  95     like this:
  96
  97     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
  98
  99     Code point ranges are indicated by pairs of lines like this:
 100
 101     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
 102     9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
 103     '''
 104     with open(filename, mode='r') as unicode_data_file:
 105         fields_start = []
 106         for line in unicode_data_file:
 107             fields = line.strip().split(';')
 108             if len(fields) != 15:
 109                 sys.stderr.write(
 110                     'short line in file "%(f)s": %(l)s\n' %{
 111                     'f': filename, 'l': line})
 112                 exit(1)
 113             if fields[2] == 'Cs':
 114                 # Surrogates are UTF-16 artefacts,
 115                 # not real characters. Ignore them.
 116                 fields_start = []
 117                 continue
 118             if fields[1].endswith(', First>'):
 119                 fields_start = fields
 120                 fields_start[1] = fields_start[1].split(',')[0][1:]
 121                 continue
 122             if fields[1].endswith(', Last>'):
 123                 fields[1] = fields[1].split(',')[0][1:]
 124                 if fields[1:] != fields_start[1:]:
 125                     sys.stderr.write(
 126                         'broken code point range in file "%(f)s": %(l)s\n' %{
 127                             'f': filename, 'l': line})
 128                     exit(1)
 129                 for code_point in range(
 130                         int(fields_start[0], 16),
 131                         int(fields[0], 16)+1):
 132                     fill_attribute(code_point, fields)
 133                 fields_start = []
 134                 continue
 135             fill_attribute(int(fields[0], 16), fields)
 136             fields_start = []
 137
 138 def fill_east_asian_widths(filename):
 139     '''Stores the entire contents of the EastAsianWidths.txt file
 140     in the EAST_ASIAN_WIDTHS dictionary.
 141
 142     Lines in EastAsianWidths.txt are either a code point range like
 143     this:
 144
 145     9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
 146
 147     or a single code point like this:
 148
 149     A015;W           # Lm         YI SYLLABLE WU
 150     '''
 151     with open(filename, mode='r') as east_asian_widths_file:
 152         for line in east_asian_widths_file:
 153             match = re.match(
 154                 r'^(?P<codepoint1>[0-9A-F]{4,6})'
 155                 +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
 156                 +r'\s*;\s*(?P<property>[a-zA-Z]+)',
 157                 line)
 158             if not match:
 159                 continue
 160             start = match.group('codepoint1')
 161             end = match.group('codepoint2')
 162             if not end:
 163                 end = start
 164             for code_point in range(int(start, 16), int(end, 16)+1):
 165                 EAST_ASIAN_WIDTHS[code_point] = match.group('property')
 166
 167 def ucs_symbol(code_point):
 168     '''Return the UCS symbol string for a Unicode character.'''
 169     if code_point < 0x10000:
 170         return '<U{:04X}>'.format(code_point)
 171     else:
 172         return '<U{:08X}>'.format(code_point)
 173
 174 def create_charmap_dictionary(file_name):
 175     '''Create a dictionary for all code points found in the CHARMAP
 176     section of a file
 177     '''
 178     with open(file_name, mode='r') as utf8_file:
 179         charmap_dictionary = {}
 180         for line in utf8_file:
 181             if line.startswith('CHARMAP'):
 182                 break
 183         for line in utf8_file:
 184             if line.startswith('END CHARMAP'):
 185                 return charmap_dictionary
 186             if line.startswith('%'):
 187                 continue
 188             match = re.match(
 189                 r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
 190                 +r'(:?\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?'
 191                 +r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',
 192                 line)
 193             if not match:
 194                 continue
 195             codepoint1 = match.group('codepoint1')
 196             codepoint2 = match.group('codepoint2')
 197             if not codepoint2:
 198                 codepoint2 = codepoint1
 199             for i in range(int(codepoint1, 16),
 200                            int(codepoint2, 16) + 1):
 201                 charmap_dictionary[i] = match.group('hexutf8')
 202         sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
 203                          %file_name)
 204         exit(1)
 205
 206 def check_charmap(original_file_name, new_file_name):
 207     '''Report differences in the CHARMAP section between the old and the
 208     new file
 209     '''
 210     print('************************************************************')
 211     print('Report on CHARMAP:')
 212     ocharmap = create_charmap_dictionary(original_file_name)
 213     ncharmap = create_charmap_dictionary(new_file_name)
 214     print('------------------------------------------------------------')
 215     print('Total removed characters in newly generated CHARMAP: %d'
 216           %len(set(ocharmap)-set(ncharmap)))
 217     if ARGS.show_missing_characters:
 218         for key in sorted(set(ocharmap)-set(ncharmap)):
 219             print('removed: {:s}     {:s} {:s}'.format(
 220                 ucs_symbol(key),
 221                 ocharmap[key],
 222                 UNICODE_ATTRIBUTES[key]['name'] \
 223                 if key in UNICODE_ATTRIBUTES else None))
 224     print('------------------------------------------------------------')
 225     changed_charmap = {}
 226     for key in set(ocharmap).intersection(set(ncharmap)):
 227         if ocharmap[key] != ncharmap[key]:
 228             changed_charmap[key] = (ocharmap[key], ncharmap[key])
 229     print('Total changed characters in newly generated CHARMAP: %d'
 230           %len(changed_charmap))
 231     if ARGS.show_changed_characters:
 232         for key in sorted(changed_charmap):
 233             print('changed: {:s}     {:s}->{:s} {:s}'.format(
 234                 ucs_symbol(key),
 235                 changed_charmap[key][0],
 236                 changed_charmap[key][1],
 237                 UNICODE_ATTRIBUTES[key]['name'] \
 238                 if key in UNICODE_ATTRIBUTES else None))
 239     print('------------------------------------------------------------')
 240     print('Total added characters in newly generated CHARMAP: %d'
 241           %len(set(ncharmap)-set(ocharmap)))
 242     if ARGS.show_added_characters:
 243         for key in sorted(set(ncharmap)-set(ocharmap)):
 244             print('added: {:s}     {:s} {:s}'.format(
 245                 ucs_symbol(key),
 246                 ncharmap[key],
 247                 UNICODE_ATTRIBUTES[key]['name'] \
 248                 if key in UNICODE_ATTRIBUTES else None))
 249
 250 def create_width_dictionary(file_name):
 251     '''Create a dictionary for all code points found in the WIDTH
 252     section of a file
 253     '''
 254     with open(file_name, mode='r') as utf8_file:
 255         width_dictionary = {}
 256         for line in utf8_file:
 257             if line.startswith('WIDTH'):
 258                 break
 259         for line in utf8_file:
 260             if line.startswith('END WIDTH'):
 261                 return width_dictionary
 262             match = re.match(
 263                 r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
 264                 +r'(:?\.\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?'
 265                 +r'\s+(?P<width>[02])',
 266                 line)
 267             if not match:
 268                 continue
 269             codepoint1 = match.group('codepoint1')
 270             codepoint2 = match.group('codepoint2')
 271             if not codepoint2:
 272                 codepoint2 = codepoint1
 273             for i in range(int(codepoint1, 16),
 274                            int(codepoint2, 16) + 1):
 275                 width_dictionary[i] = int(match.group('width'))
 276         sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n' %file)
 277
 278 def check_width(original_file_name, new_file_name):
 279     '''Report differences in the WIDTH section between the old and the new
 280     file
 281     '''
 282     print('************************************************************')
 283     print('Report on WIDTH:')
 284     owidth = create_width_dictionary(original_file_name)
 285     nwidth = create_width_dictionary(new_file_name)
 286     print('------------------------------------------------------------')
 287     print('Total removed characters in newly generated WIDTH: %d'
 288           %len(set(owidth)-set(nwidth)))
 289     print('(Characters not in WIDTH get width 1 by default, '
 290           + 'i.e. these have width 1 now.)')
 291     if ARGS.show_missing_characters:
 292         for key in sorted(set(owidth)-set(nwidth)):
 293             print('removed: {:s} '.format(ucs_symbol(key))
 294                   + '{:d} : '.format(owidth[key])
 295                   + 'eaw={:s} '.format(
 296                       EAST_ASIAN_WIDTHS[key]
 297                       if key in EAST_ASIAN_WIDTHS else None)
 298                   + 'category={:2s} '.format(
 299                       UNICODE_ATTRIBUTES[key]['category']
 300                       if key in UNICODE_ATTRIBUTES else None)
 301                   + 'bidi={:3s} '.format(
 302                       UNICODE_ATTRIBUTES[key]['bidi']
 303                       if key in UNICODE_ATTRIBUTES else None)
 304                   + 'name={:s}'.format(
 305                       UNICODE_ATTRIBUTES[key]['name']
 306                       if key in UNICODE_ATTRIBUTES else None))
 307     print('------------------------------------------------------------')
 308     changed_width = {}
 309     for key in set(owidth).intersection(set(nwidth)):
 310         if owidth[key] != nwidth[key]:
 311             changed_width[key] = (owidth[key], nwidth[key])
 312     print('Total changed characters in newly generated WIDTH: %d'
 313           %len(changed_width))
 314     if ARGS.show_changed_characters:
 315         for key in sorted(changed_width):
 316             print('changed width: {:s} '.format(ucs_symbol(key))
 317                   + '{:d}->{:d} : '.format(changed_width[key][0],
 318                                           changed_width[key][1])
 319                   + 'eaw={:s} '.format(
 320                       EAST_ASIAN_WIDTHS[key]
 321                       if key in EAST_ASIAN_WIDTHS else None)
 322                   + 'category={:2s} '.format(
 323                       UNICODE_ATTRIBUTES[key]['category']
 324                       if key in UNICODE_ATTRIBUTES else None)
 325                   + 'bidi={:3s} '.format(
 326                       UNICODE_ATTRIBUTES[key]['bidi']
 327                       if key in UNICODE_ATTRIBUTES else None)
 328                   + 'name={:s}'.format(
 329                       UNICODE_ATTRIBUTES[key]['name']
 330                       if key in UNICODE_ATTRIBUTES else None))
 331     print('------------------------------------------------------------')
 332     print('Total added characters in newly generated WIDTH: %d'
 333           %len(set(nwidth)-set(owidth)))
 334     print('(Characters not in WIDTH get width 1 by default, '
 335           + 'i.e. these had width 1 before.)')
 336     if ARGS.show_added_characters:
 337         for key in sorted(set(nwidth)-set(owidth)):
 338             print('added: {:s} '.format(ucs_symbol(key))
 339                   + '{:d} : '.format(nwidth[key])
 340                   + 'eaw={:s} '.format(
 341                       EAST_ASIAN_WIDTHS[key]
 342                       if key in EAST_ASIAN_WIDTHS else None)
 343                   + 'category={:2s} '.format(
 344                       UNICODE_ATTRIBUTES[key]['category']
 345                       if key in UNICODE_ATTRIBUTES else None)
 346                   + 'bidi={:3s} '.format(
 347                       UNICODE_ATTRIBUTES[key]['bidi']
 348                       if key in UNICODE_ATTRIBUTES else None)
 349                   + 'name={:s}'.format(
 350                       UNICODE_ATTRIBUTES[key]['name']
 351                       if key in UNICODE_ATTRIBUTES else None))
 352
 353 if __name__ == "__main__":
 354     PARSER = argparse.ArgumentParser(
 355         description='''
 356         Compare the contents of LC_CTYPE in two files and check for errors.
 357         ''')
 358     PARSER.add_argument(
 359         '-o', '--old_utf8_file',
 360         nargs='?',
 361         required=True,
 362         type=str,
 363         help='The old UTF-8 file.')
 364     PARSER.add_argument(
 365         '-n', '--new_utf8_file',
 366         nargs='?',
 367         required=True,
 368         type=str,
 369         help='The new UTF-8 file.')
 370     PARSER.add_argument(
 371         '-u', '--unicode_data_file',
 372         nargs='?',
 373         type=str,
 374         help='The UnicodeData.txt file to read.')
 375     PARSER.add_argument(
 376         '-e', '--east_asian_width_file',
 377         nargs='?',
 378         type=str,
 379         help='The EastAsianWidth.txt file to read.')
 380     PARSER.add_argument(
 381         '-a', '--show_added_characters',
 382         action='store_true',
 383         help='Show characters which were added in detail.')
 384     PARSER.add_argument(
 385         '-m', '--show_missing_characters',
 386         action='store_true',
 387         help='Show characters which were removed in detail.')
 388     PARSER.add_argument(
 389         '-c', '--show_changed_characters',
 390         action='store_true',
 391         help='Show characters whose width was changed in detail.')
 392     ARGS = PARSER.parse_args()
 393
 394     if ARGS.unicode_data_file:
 395         fill_attributes(ARGS.unicode_data_file)
 396     if ARGS.east_asian_width_file:
 397         fill_east_asian_widths(ARGS.east_asian_width_file)
 398     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
 399     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)