malloc/Makefile: Split and sort tests
[glibc.git] / localedata / unicode-gen / utf8_compatibility.py
blob32566363c60c55c1e1caa82d6679e0dd59ea6d67
1 #!/usr/bin/python3
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <https://www.gnu.org/licenses/>.
20 '''
 21 This script is useful for checking the backward compatibility of a newly
 22 generated UTF-8 file produced by the utf8_gen.py script.
24 To see how this script is used, call it with the “-h” option:
26 $ ./utf8_compatibility.py -h
27 … prints usage message …
28 '''
30 import sys
31 import re
32 import argparse
33 import unicode_utils
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file.

    Maps each code point (int) to the UTF-8 byte sequence string as
    written in the file (e.g. '/xc3/xa9').  Exits the program with
    status 1 if the file has no complete CHARMAP section.
    '''
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to (and including) the "CHARMAP" line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            # A single code point <UXXXX> or a range <UXXXX>..<UYYYY>,
            # followed by the UTF-8 byte sequence.
            # Fixed: non-capturing group '(?:' (original typo '(:?')
            # and character class '[0-9A-F]' (original '[0-9-A-F]'
            # wrongly also matched a literal '-').
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            # A range line assigns the same byte sequence to every
            # code point in the range.
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     % file_name)
    # Fixed: use sys.exit instead of the interactive-only builtin exit.
    sys.exit(1)
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file.
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)

    def _name_of(key):
        # Character name from UnicodeData, or 'None' when unknown.
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[key]['name']
        return 'None'

    removed = set(ocharmap) - set(ncharmap)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed))
    if ARGS.show_missing_characters:
        for key in sorted(removed):
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                _name_of(key)))

    # Characters present in both files whose byte sequence differs.
    print('------------------------------------------------------------')
    changed_charmap = {
        key: (ocharmap[key], ncharmap[key])
        for key in set(ocharmap) & set(ncharmap)
        if ocharmap[key] != ncharmap[key]}
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                _name_of(key)))

    added = set(ncharmap) - set(ocharmap)
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added))
    if ARGS.show_added_characters:
        for key in sorted(added):
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                _name_of(key)))
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file.

    Maps each code point (int) to its width, 0 or 2.  Characters not
    listed in the WIDTH section have width 1 by default and do not
    appear in the dictionary.  Exits the program with status 1 if the
    file has no complete WIDTH section.
    '''
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to (and including) the "WIDTH" line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            # A single code point <UXXXX> or a range <UXXXX>...<UYYYY>
            # (three dots in the WIDTH section), followed by the width.
            # Fixed: non-capturing group '(?:' (original typo '(:?')
            # and character class '[0-9A-F]' (original '[0-9-A-F]'
            # wrongly also matched a literal '-').
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<width>[02])',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            # A range line assigns the same width to every code point
            # in the range.
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    # Fixed: the original wrote '%s' %file, a NameError ('file' is
    # undefined in Python 3), and did not exit on this error path.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     % file_name)
    sys.exit(1)
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file.
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)

    def _details(key):
        # Common "eaw= category= bidi= name=" suffix for one character,
        # using 'None' wherever the attribute is unknown.
        if key in unicode_utils.EAST_ASIAN_WIDTHS:
            eaw = unicode_utils.EAST_ASIAN_WIDTHS[key]
        else:
            eaw = 'None'
        if key in unicode_utils.UNICODE_ATTRIBUTES:
            attributes = unicode_utils.UNICODE_ATTRIBUTES[key]
            category = attributes['category']
            bidi = attributes['bidi']
            name = attributes['name']
        else:
            category = bidi = name = 'None'
        return ('eaw={:s} '.format(eaw)
                + 'category={:2s} '.format(category)
                + 'bidi={:3s} '.format(bidi)
                + 'name={:s}'.format(name))

    removed = set(owidth) - set(nwidth)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(removed):
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _details(key))

    # Characters present in both files whose width differs.
    print('------------------------------------------------------------')
    changed_width = {
        key: (owidth[key], nwidth[key])
        for key in set(owidth) & set(nwidth)
        if owidth[key] != nwidth[key]}
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            old_width, new_width = changed_width[key]
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(old_width, new_width)
                  + _details(key))

    added = set(nwidth) - set(owidth)
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          % len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(added):
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _details(key))
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # File arguments: (short option, long option, required, help text).
    for short_name, long_name, mandatory, help_text in (
            ('-o', '--old_utf8_file', True,
             'The old UTF-8 file.'),
            ('-n', '--new_utf8_file', True,
             'The new UTF-8 file.'),
            ('-u', '--unicode_data_file', False,
             'The UnicodeData.txt file to read.'),
            ('-e', '--east_asian_width_file', False,
             'The EastAsianWidth.txt file to read.')):
        PARSER.add_argument(
            short_name, long_name,
            nargs='?',
            required=mandatory,
            type=str,
            help=help_text)
    # Boolean flags controlling how verbose the report is.
    for short_name, long_name, help_text in (
            ('-a', '--show_added_characters',
             'Show characters which were added in detail.'),
            ('-m', '--show_missing_characters',
             'Show characters which were removed in detail.'),
            ('-c', '--show_changed_characters',
             'Show characters whose width was changed in detail.')):
        PARSER.add_argument(
            short_name, long_name,
            action='store_true',
            help=help_text)
    ARGS = PARSER.parse_args()

    # Optional Unicode data files enrich the report with character
    # names, categories, bidi classes, and East Asian widths.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)