malloc/Makefile: Split and sort tests
[glibc.git] / localedata / unicode-gen / gen_unicode_ctype.py
blob75e4e1334a87b74e904d39b30c9c2e4ac41bb776
1 #!/usr/bin/python3
3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
7 # The GNU C Library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # The GNU C Library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with the GNU C Library; if not, see
19 # <https://www.gnu.org/licenses/>.
21 '''
22 Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
23 DerivedCoreProperties.txt files.
25 To see how this script is used, call it with the “-h” option:
27 $ ./gen_unicode_ctype.py -h
28 … prints usage message …
29 '''
31 import argparse
32 import time
33 import re
34 import unicode_utils
def code_point_ranges(is_class_function, code_points=None):
    '''Return a list of ranges of code points for which is_class_function
    returns True.

    is_class_function: predicate taking a code point (int) and returning
        True when that code point belongs to the character class.
    code_points: optional iterable of code points to test.  When None
        (the default), all code points in unicode_utils.UNICODE_ATTRIBUTES
        are used, matching the historical behaviour.

    Each returned range is a list with one element [cp] for an isolated
    code point, or two elements [first, last] for an inclusive run of
    consecutive code points.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        code_points = unicode_utils.UNICODE_ATTRIBUTES
    cp_ranges = []
    for code_point in sorted(code_points):
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                # Consecutive with the previous matching code point:
                # grow a 1-element range into [first, last], or move
                # the end of an existing [first, last] range.
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                # Gap (or first match): start a new range.
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Write one LC_CTYPE character class section to i18n_file.

    Emits the class name followed by all code point ranges for which
    is_class_function returns True, wrapped with trailing '/'
    continuation markers.  Nothing is written when the class is empty.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    ranges = code_point_ranges(is_class_function)
    if not ranges:
        return
    i18n_file.write('%s /\n' %class_name)
    max_column = 75
    prefix = ' '
    line = prefix
    for cp_range in ranges:
        # Separate entries with ';' (but not at the start of a line).
        if line.strip():
            line += ';'
        if len(cp_range) > 1:
            symbol = unicode_utils.ucs_symbol_range(
                cp_range[0], cp_range[-1])
        else:
            symbol = unicode_utils.ucs_symbol(cp_range[0])
        # Wrap before the entry that would overflow the column limit;
        # the separator stays on the previous line, before the '/'.
        if len(line) + len(symbol) > max_column:
            i18n_file.write(line+'/\n')
            line = prefix
        line += symbol
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Write one LC_CTYPE character map section to i18n_file.

    Emits the map name followed by one (source,target) pair for every
    code point that map_function maps to a different code point,
    wrapped with trailing '/' continuation markers.

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = ' '
    line = prefix
    # The section header is written unconditionally, even for an
    # identity map (matches the historical output).
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if mapped == code_point:
            continue
        # Separate entries with ';' (but not at the start of a line).
        if line.strip():
            line += ';'
        pair = ('('
                + unicode_utils.ucs_symbol(code_point)
                + ','
                + unicode_utils.ucs_symbol(mapped)
                + ')')
        # Wrap before the pair that would overflow the column limit.
        if len(line) + len(pair) > max_column:
            i18n_file.write(line+'/\n')
            line = prefix
        line += pair
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Read the original glibc i18n file; return a (head, tail) pair.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    head: everything up to and including the “LC_CTYPE” line, with the
        LC_IDENTIFICATION date stamp replaced by today's date.
    tail: everything from the “translit_start” line to end of file,
        or '' if the file has no translit section.
    '''
    head = tail = ''
    # The i18n file may contain non-ASCII text; read it as UTF-8
    # explicitly instead of relying on the ambient locale encoding.
    with open(filename, mode='r', encoding='utf-8') as i18n_file:
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                # Refresh the LC_IDENTIFICATION date stamp.
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old LC_CTYPE contents up to the translit section.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        # Keep everything from translit_start onwards verbatim.
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    When an original i18n file was given on the command line (module
    global ARGS) and its head was read, that head is copied verbatim;
    otherwise a minimal LC_IDENTIFICATION section is generated.
    '''
    if ARGS.input_file and head:
        # Reuse the head of the original file (its date stamp was
        # already refreshed by read_input_file).
        i18n_file.write(head)
        return
    header_lines = (
        'escape_char /',
        'comment_char %',
        '',
        '% Generated automatically by gen_unicode_ctype.py '
        + 'for Unicode {:s}.'.format(unicode_version),
        '',
        'LC_IDENTIFICATION',
        'title "Unicode {:s} FDCC-set"'.format(unicode_version),
        'source "UnicodeData.txt, DerivedCoreProperties.txt"',
        'address ""',
        'contact ""',
        'email "bug-glibc-locales@gnu.org"',
        'tel ""',
        'fax ""',
        'language ""',
        'territory "Earth"',
        'revision "{:s}"'.format(unicode_version),
        'date "{:s}"'.format(time.strftime('%Y-%m-%d')),
        'category "i18n:2012";LC_CTYPE',
        'END LC_IDENTIFICATION',
        '',
        'LC_CTYPE',
    )
    for header_line in header_lines:
        i18n_file.write(header_line + '\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    When an original i18n file was given on the command line (module
    global ARGS) and its tail was read, that tail is copied verbatim;
    otherwise a plain section terminator is written.
    '''
    use_original_tail = bool(ARGS.input_file) and bool(tail)
    i18n_file.write(tail if use_original_tail else 'END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file.

    i18n_file: writable text file receiving the generated sections.
    unicode_version: Unicode version string quoted in the comments.
    turkish: when True, use the Turkish case conversions from
        unicode_utils for toupper/tolower instead of the defaults.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # "outdigit" is deliberately not generated: localedef defaults it
    # to "0".."9", and emitting it here would prevent locales that copy
    # this file from defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    # Case maps: Turkish locales map i<->İ and ı<->I differently from
    # the default Unicode case conversions.
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
262 if __name__ == "__main__":
263 PARSER = argparse.ArgumentParser(
264 description='''
265 Generate a Unicode conforming LC_CTYPE category from
266 UnicodeData.txt and DerivedCoreProperties.txt files.
267 ''')
268 PARSER.add_argument(
269 '-u', '--unicode_data_file',
270 nargs='?',
271 type=str,
272 default='UnicodeData.txt',
273 help=('The UnicodeData.txt file to read, '
274 + 'default: %(default)s'))
275 PARSER.add_argument(
276 '-d', '--derived_core_properties_file',
277 nargs='?',
278 type=str,
279 default='DerivedCoreProperties.txt',
280 help=('The DerivedCoreProperties.txt file to read, '
281 + 'default: %(default)s'))
282 PARSER.add_argument(
283 '-i', '--input_file',
284 nargs='?',
285 type=str,
286 help='''The original glibc/localedata/locales/i18n file.''')
287 PARSER.add_argument(
288 '-o', '--output_file',
289 nargs='?',
290 type=str,
291 default='i18n.new',
292 help='''The file which shall contain the generated LC_CTYPE category,
293 default: %(default)s. If the original
294 glibc/localedata/locales/i18n has been given
295 as an option, all data from the original file
296 except the newly generated LC_CTYPE character
297 classes and the date stamp in
298 LC_IDENTIFICATION will be copied unchanged
299 into the output file. ''')
300 PARSER.add_argument(
301 '--unicode_version',
302 nargs='?',
303 required=True,
304 type=str,
305 help='The Unicode version of the input files used.')
306 PARSER.add_argument(
307 '--turkish',
308 action='store_true',
309 help='Use Turkish case conversions.')
310 ARGS = PARSER.parse_args()
312 unicode_utils.fill_attributes(
313 ARGS.unicode_data_file)
314 unicode_utils.fill_derived_core_properties(
315 ARGS.derived_core_properties_file)
316 unicode_utils.verifications()
317 HEAD = TAIL = ''
318 if ARGS.input_file:
319 (HEAD, TAIL) = read_input_file(ARGS.input_file)
320 with open(ARGS.output_file, mode='w') as I18N_FILE:
321 output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
322 output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
323 output_tail(I18N_FILE, tail=TAIL)