localedata/unicode-gen/utf8_gen.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2014-2023 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <https://www.gnu.org/licenses/>.
  19
  20 '''glibc/localedata/charmaps/UTF-8 file generator script
  21
  22 This script generates a glibc/localedata/charmaps/UTF-8 file
  23 from Unicode data.
  24
  25 Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt -p PropList.txt --unicode_version VERSION
  26
  27 It writes the result to a file named UTF-8 in the current directory.
  28 '''
  29
  30 import argparse
  31 import sys
  32 import re
  33 import unicode_utils
  34
  35 # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
  36 # sections 3.11 and 4.4.
  37
  38 JAMO_INITIAL_SHORT_NAME = (
  39     'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
  40     'C', 'K', 'T', 'P', 'H'
  41 )
  42
  43 JAMO_MEDIAL_SHORT_NAME = (
  44     'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
  45     'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
  46 )
  47
  48 JAMO_FINAL_SHORT_NAME = (
  49     '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
  50     'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
  51     'P', 'H'
  52 )
  53
  54 def process_range(start, end, outfile, name):
  55     '''Writes a range of code points into the CHARMAP section of the
  56     output file
  57
  58     '''
  59     if 'Hangul Syllable' in name:
  60         # from glibc/localedata/ChangeLog:
  61         #
  62         #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
  63         #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
  64         #  so they become printable and carry a width. Comment out surrogate
  65         #  ranges. Add a WIDTH table
  66         #
  67         # So we expand the Hangul Syllables here:
  68         for i in range(int(start, 16), int(end, 16)+1 ):
  69             index2, index3 = divmod(i - 0xaC00, 28)
  70             index1, index2 = divmod(index2, 21)
  71             hangul_syllable_name = 'HANGUL SYLLABLE ' \
  72                                    + JAMO_INITIAL_SHORT_NAME[index1] \
  73                                    + JAMO_MEDIAL_SHORT_NAME[index2] \
  74                                    + JAMO_FINAL_SHORT_NAME[index3]
  75             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
  76                 unicode_utils.ucs_symbol(i), convert_to_hex(i),
  77                 hangul_syllable_name))
  78         return
  79     # UnicodeData.txt file has contains code point ranges like this:
  80     #
  81     # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  82     # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  83     #
  84     # The glibc UTF-8 file splits ranges like these into shorter
  85     # ranges of 64 code points each:
  86     #
  87     # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
  88     # …
  89     # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
  90     for i in range(int(start, 16), int(end, 16), 64 ):
  91         if i > (int(end, 16)-64):
  92             outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  93                     unicode_utils.ucs_symbol(i),
  94                     unicode_utils.ucs_symbol(int(end,16)),
  95                     convert_to_hex(i),
  96                     name))
  97             break
  98         outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  99                 unicode_utils.ucs_symbol(i),
 100                 unicode_utils.ucs_symbol(i+63),
 101                 convert_to_hex(i),
 102                 name))
 103
 104 def process_charmap(flines, outfile):
 105     '''This function takes an array which contains *all* lines of
 106     of UnicodeData.txt and write lines to outfile as used in the
 107
 108     CHARMAP
 109     …
 110     END CHARMAP
 111
 112     section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
 113
 114     Samples for input lines:
 115
 116     0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
 117     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 118     4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 119     D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
 120     DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
 121     100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
 122     10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
 123
 124     Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
 125
 126     <U0010>     /x10 DATA LINK ESCAPE
 127     <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
 128     %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
 129     %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
 130     <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
 131
 132     '''
 133     fields_start = []
 134     for line in flines:
 135         fields = line.split(";")
 136          # Some characters have “<control>” as their name. We try to
 137          # use the “Unicode 1.0 Name” (10th field in
 138          # UnicodeData.txt) for them.
 139          #
 140          # The Characters U+0080, U+0081, U+0084 and U+0099 have
 141          # “<control>” as their name but do not even have aa
 142          # ”Unicode 1.0 Name”. We could write code to take their
 143          # alternate names from NameAliases.txt.
 144         if fields[1] == "<control>" and fields[10]:
 145             fields[1] = fields[10]
 146         # Handling code point ranges like:
 147         #
 148         # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 149         # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 150         if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
 151             fields_start = fields
 152             continue
 153         if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
 154             process_range(fields_start[0], fields[0],
 155                           outfile, fields[1][:-7]+'>')
 156             fields_start = []
 157             continue
 158         fields_start = []
 159         if 'Surrogate,' in fields[1]:
 160             # Comment out the surrogates in the UTF-8 file.
 161             # One could of course skip them completely but
 162             # the original UTF-8 file in glibc had them as
 163             # comments, so we keep these comment lines.
 164             outfile.write('%')
 165         outfile.write('{:<11s} {:<12s} {:s}\n'.format(
 166                 unicode_utils.ucs_symbol(int(fields[0], 16)),
 167                 convert_to_hex(int(fields[0], 16)),
 168                 fields[1]))
 169
 170 def convert_to_hex(code_point):
 171     '''Converts a code point to a hexadecimal UTF-8 representation
 172     like /x**/x**/x**.'''
 173     # Getting UTF8 of Unicode characters.
 174     # In Python3, .encode('UTF-8') does not work for
 175     # surrogates. Therefore, we use this conversion table
 176     surrogates = {
 177         0xD800: '/xed/xa0/x80',
 178         0xDB7F: '/xed/xad/xbf',
 179         0xDB80: '/xed/xae/x80',
 180         0xDBFF: '/xed/xaf/xbf',
 181         0xDC00: '/xed/xb0/x80',
 182         0xDFFF: '/xed/xbf/xbf',
 183     }
 184     if code_point in surrogates:
 185         return surrogates[code_point]
 186     return ''.join([
 187         '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
 188     ])
 189
 190 def write_header_charmap(outfile):
 191     '''Write the header on top of the CHARMAP section to the output file'''
 192     outfile.write("<code_set_name> UTF-8\n")
 193     outfile.write("<comment_char> %\n")
 194     outfile.write("<escape_char> /\n")
 195     outfile.write("<mb_cur_min> 1\n")
 196     outfile.write("<mb_cur_max> 6\n\n")
 197     outfile.write("% CHARMAP generated using utf8_gen.py\n")
 198     outfile.write("% alias ISO-10646/UTF-8\n")
 199     outfile.write("CHARMAP\n")
 200
 201 def write_header_width(outfile, unicode_version):
 202     '''Writes the header on top of the WIDTH section to the output file'''
 203     outfile.write('% Character width according to Unicode '
 204                   + '{:s}.\n'.format(unicode_version))
 205     outfile.write('% - Default width is 1.\n')
 206     outfile.write('% - Double-width characters have width 2; generated from\n')
 207     outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
 208     outfile.write('% - Non-spacing characters have width 0; '
 209                   + 'generated from PropList.txt or\n')
 210     outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
 211                   + 'UnicodeData.txt"\n')
 212     outfile.write('% - Format control characters have width 0; '
 213                   + 'generated from\n')
 214     outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
 215 #   Not needed covered by Cf
 216 #    outfile.write("% - Zero width characters have width 0; generated from\n")
 217 #    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
 218     outfile.write("WIDTH\n")
 219
 220 def process_width(outfile, ulines, elines, plines):
 221     '''ulines are lines from UnicodeData.txt, elines are lines from
 222     EastAsianWidth.txt containing characters with width “W” or “F”,
 223     plines are lines from PropList.txt which contain characters
 224     with the property “Prepended_Concatenation_Mark”.
 225
 226     '''
 227     width_dict = {}
 228     for line in elines:
 229         fields = line.split(";")
 230         if not '..' in fields[0]:
 231             code_points = (fields[0], fields[0])
 232         else:
 233             code_points = fields[0].split("..")
 234         for key in range(int(code_points[0], 16),
 235                          int(code_points[1], 16)+1):
 236             width_dict[key] = 2
 237
 238     for line in ulines:
 239         fields = line.split(";")
 240         if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
 241             width_dict[int(fields[0], 16)] = 0
 242
 243     for line in plines:
 244         # Characters with the property “Prepended_Concatenation_Mark”
 245         # should have the width 1:
 246         fields = line.split(";")
 247         if not '..' in fields[0]:
 248             code_points = (fields[0], fields[0])
 249         else:
 250             code_points = fields[0].split("..")
 251         for key in range(int(code_points[0], 16),
 252                          int(code_points[1], 16)+1):
 253             del width_dict[key] # default width is 1
 254
 255     # handle special cases for compatibility
 256     for key in list((0x00AD,)):
 257         # https://www.cs.tut.fi/~jkorpela/shy.html
 258         if key in width_dict:
 259             del width_dict[key] # default width is 1
 260     for key in list(range(0x1160, 0x1200)):
 261         # Hangul jungseong and jongseong:
 262         if key in unicode_utils.UNICODE_ATTRIBUTES:
 263             width_dict[key] = 0
 264     for key in list(range(0xD7B0, 0xD800)):
 265         # Hangul jungseong and jongseong:
 266         if key in unicode_utils.UNICODE_ATTRIBUTES:
 267             width_dict[key] = 0
 268     for key in list(range(0x3248, 0x3250)):
 269         # These are “A” which means we can decide whether to treat them
 270         # as “W” or “N” based on context:
 271         # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
 272         # For us, “W” seems better.
 273         width_dict[key] = 2
 274     for key in list(range(0x4DC0, 0x4E00)):
 275         width_dict[key] = 2
 276
 277     same_width_lists = []
 278     current_width_list = []
 279     for key in sorted(width_dict):
 280         if not current_width_list:
 281             current_width_list = [key]
 282         elif (key == current_width_list[-1] + 1
 283               and width_dict[key] == width_dict[current_width_list[0]]):
 284             current_width_list.append(key)
 285         else:
 286             same_width_lists.append(current_width_list)
 287             current_width_list = [key]
 288     if current_width_list:
 289         same_width_lists.append(current_width_list)
 290
 291     for same_width_list in same_width_lists:
 292         if len(same_width_list) == 1:
 293             outfile.write('{:s}\t{:d}\n'.format(
 294                 unicode_utils.ucs_symbol(same_width_list[0]),
 295                 width_dict[same_width_list[0]]))
 296         else:
 297             outfile.write('{:s}...{:s}\t{:d}\n'.format(
 298                 unicode_utils.ucs_symbol(same_width_list[0]),
 299                 unicode_utils.ucs_symbol(same_width_list[-1]),
 300                 width_dict[same_width_list[0]]))
 301
 302 if __name__ == "__main__":
 303     PARSER = argparse.ArgumentParser(
 304         description='''
 305         Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
 306         ''')
 307     PARSER.add_argument(
 308         '-u', '--unicode_data_file',
 309         nargs='?',
 310         type=str,
 311         default='UnicodeData.txt',
 312         help=('The UnicodeData.txt file to read, '
 313               + 'default: %(default)s'))
 314     PARSER.add_argument(
 315         '-e', '--east_asian_with_file',
 316         nargs='?',
 317         type=str,
 318         default='EastAsianWidth.txt',
 319         help=('The EastAsianWidth.txt file to read, '
 320               + 'default: %(default)s'))
 321     PARSER.add_argument(
 322         '-p', '--prop_list_file',
 323         nargs='?',
 324         type=str,
 325         default='PropList.txt',
 326         help=('The PropList.txt file to read, '
 327               + 'default: %(default)s'))
 328     PARSER.add_argument(
 329         '--unicode_version',
 330         nargs='?',
 331         required=True,
 332         type=str,
 333         help='The Unicode version of the input files used.')
 334     ARGS = PARSER.parse_args()
 335
 336     unicode_utils.fill_attributes(ARGS.unicode_data_file)
 337     with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
 338         UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
 339     with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
 340         EAST_ASIAN_WIDTH_LINES = []
 341         for LINE in EAST_ASIAN_WIDTH_FILE:
 342             # If characters from EastAasianWidth.txt which are from
 343             # from reserved ranges (i.e. not yet assigned code points)
 344             # are added to the WIDTH section of the UTF-8 file, then
 345             # “make check” produces “Unknown Character” errors for
 346             # these code points because such unassigned code points
 347             # are not in the CHARMAP section of the UTF-8 file.
 348             #
 349             # Therefore, we skip all reserved code points when reading
 350             # the EastAsianWidth.txt file.
 351             if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
 352                 continue
 353             if re.match(r'^[^;]*;\s*[WF]\s*', LINE):
 354                 EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
 355     with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
 356         PROP_LIST_LINES = []
 357         for LINE in PROP_LIST_FILE:
 358             if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
 359                 PROP_LIST_LINES.append(LINE.strip())
 360     with open('UTF-8', mode='w') as OUTFILE:
 361         # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
 362         write_header_charmap(OUTFILE)
 363         process_charmap(UNICODE_DATA_LINES, OUTFILE)
 364         OUTFILE.write("END CHARMAP\n\n")
 365         # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
 366         write_header_width(OUTFILE, ARGS.unicode_version)
 367         process_width(OUTFILE,
 368                       UNICODE_DATA_LINES,
 369                       EAST_ASIAN_WIDTH_LINES,
 370                       PROP_LIST_LINES)
 371         OUTFILE.write("END WIDTH\n")