localedata/unicode-gen/utf8_gen.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
   4 # Copyright The GNU Toolchain Authors.
   5 # This file is part of the GNU C Library.
   6 #
   7 # The GNU C Library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # The GNU C Library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with the GNU C Library; if not, see
  19 # <https://www.gnu.org/licenses/>.
  20
  21 '''glibc/localedata/charmaps/UTF-8 file generator script
  22
  23 This script generates a glibc/localedata/charmaps/UTF-8 file
  24 from Unicode data.
  25
  26 Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
  27
  28 It will output UTF-8 file
  29 '''
  30
  31 import argparse
  32 import re
  33 import unicode_utils
  34
  35 # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
  36 # sections 3.11 and 4.4.
  37
  38 JAMO_INITIAL_SHORT_NAME = (
  39     'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
  40     'C', 'K', 'T', 'P', 'H'
  41 )
  42
  43 JAMO_MEDIAL_SHORT_NAME = (
  44     'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
  45     'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
  46 )
  47
  48 JAMO_FINAL_SHORT_NAME = (
  49     '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
  50     'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
  51     'P', 'H'
  52 )
  53
  54 def process_range(start, end, outfile, name):
  55     '''Writes a range of code points into the CHARMAP section of the
  56     output file
  57
  58     '''
  59     if 'Hangul Syllable' in name:
  60         # from glibc/localedata/ChangeLog:
  61         #
  62         #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
  63         #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
  64         #  so they become printable and carry a width. Comment out surrogate
  65         #  ranges. Add a WIDTH table
  66         #
  67         # So we expand the Hangul Syllables here:
  68         for i in range(int(start, 16), int(end, 16)+1 ):
  69             index2, index3 = divmod(i - 0xaC00, 28)
  70             index1, index2 = divmod(index2, 21)
  71             hangul_syllable_name = 'HANGUL SYLLABLE ' \
  72                                    + JAMO_INITIAL_SHORT_NAME[index1] \
  73                                    + JAMO_MEDIAL_SHORT_NAME[index2] \
  74                                    + JAMO_FINAL_SHORT_NAME[index3]
  75             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
  76                 unicode_utils.ucs_symbol(i), convert_to_hex(i),
  77                 hangul_syllable_name))
  78         return
  79     # UnicodeData.txt file has contains code point ranges like this:
  80     #
  81     # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  82     # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  83     #
  84     # The glibc UTF-8 file splits ranges like these into shorter
  85     # ranges of 64 code points each:
  86     #
  87     # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
  88     # …
  89     # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
  90     for i in range(int(start, 16), int(end, 16), 64 ):
  91         if i > (int(end, 16)-64):
  92             outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  93                     unicode_utils.ucs_symbol(i),
  94                     unicode_utils.ucs_symbol(int(end,16)),
  95                     convert_to_hex(i),
  96                     name))
  97             break
  98         outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  99                 unicode_utils.ucs_symbol(i),
 100                 unicode_utils.ucs_symbol(i+63),
 101                 convert_to_hex(i),
 102                 name))
 103
 104 def process_charmap(flines, outfile):
 105     '''This function takes an array which contains *all* lines of
 106     of UnicodeData.txt and write lines to outfile as used in the
 107
 108     CHARMAP
 109     …
 110     END CHARMAP
 111
 112     section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
 113
 114     Samples for input lines:
 115
 116     0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
 117     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 118     4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 119     D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
 120     DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
 121     100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
 122     10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
 123
 124     Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
 125
 126     <U0010>     /x10 DATA LINK ESCAPE
 127     <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
 128     %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
 129     %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
 130     <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
 131
 132     '''
 133     fields_start = []
 134     for line in flines:
 135         fields = line.split(";")
 136          # Some characters have “<control>” as their name. We try to
 137          # use the “Unicode 1.0 Name” (10th field in
 138          # UnicodeData.txt) for them.
 139          #
 140          # The Characters U+0080, U+0081, U+0084 and U+0099 have
 141          # “<control>” as their name but do not even have aa
 142          # ”Unicode 1.0 Name”. We could write code to take their
 143          # alternate names from NameAliases.txt.
 144         if fields[1] == "<control>" and fields[10]:
 145             fields[1] = fields[10]
 146         # Handling code point ranges like:
 147         #
 148         # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 149         # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 150         if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
 151             fields_start = fields
 152             continue
 153         if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
 154             process_range(fields_start[0], fields[0],
 155                           outfile, fields[1][:-7]+'>')
 156             fields_start = []
 157             continue
 158         fields_start = []
 159         if 'Surrogate,' in fields[1]:
 160             # Comment out the surrogates in the UTF-8 file.
 161             # One could of course skip them completely but
 162             # the original UTF-8 file in glibc had them as
 163             # comments, so we keep these comment lines.
 164             outfile.write('%')
 165         outfile.write('{:<11s} {:<12s} {:s}\n'.format(
 166                 unicode_utils.ucs_symbol(int(fields[0], 16)),
 167                 convert_to_hex(int(fields[0], 16)),
 168                 fields[1]))
 169
 170 def convert_to_hex(code_point):
 171     '''Converts a code point to a hexadecimal UTF-8 representation
 172     like /x**/x**/x**.'''
 173     # Getting UTF8 of Unicode characters.
 174     # In Python3, .encode('UTF-8') does not work for
 175     # surrogates. Therefore, we use this conversion table
 176     surrogates = {
 177         0xD800: '/xed/xa0/x80',
 178         0xDB7F: '/xed/xad/xbf',
 179         0xDB80: '/xed/xae/x80',
 180         0xDBFF: '/xed/xaf/xbf',
 181         0xDC00: '/xed/xb0/x80',
 182         0xDFFF: '/xed/xbf/xbf',
 183     }
 184     if code_point in surrogates:
 185         return surrogates[code_point]
 186     return ''.join([
 187         '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
 188     ])
 189
 190 def write_header_charmap(outfile):
 191     '''Write the header on top of the CHARMAP section to the output file'''
 192     outfile.write("<code_set_name> UTF-8\n")
 193     outfile.write("<comment_char> %\n")
 194     outfile.write("<escape_char> /\n")
 195     outfile.write("<mb_cur_min> 1\n")
 196     outfile.write("<mb_cur_max> 6\n\n")
 197     outfile.write("% CHARMAP generated using utf8_gen.py\n")
 198     outfile.write("% alias ISO-10646/UTF-8\n")
 199     outfile.write("CHARMAP\n")
 200
 201 def write_header_width(outfile, unicode_version):
 202     '''Writes the header on top of the WIDTH section to the output file'''
 203     outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
 204     outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
 205     outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
 206     outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
 207     outfile.write('%   This character stands in for an intentionally omitted leading consonant\n')
 208     outfile.write('%   in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
 209     outfile.write('%   of visible display to ensure that the complete block has the correct width.\n')
 210     outfile.write('%   (See below for more information on Hangul syllables.)\n')
 211     outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
 212     outfile.write('%   "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
 213     outfile.write('%   One composed Hangul "syllable block" like 퓛 is made up of\n')
 214     outfile.write('%   two to three individual component characters called "jamo".\n')
 215     outfile.write('%   The complete block must have total width 2;\n')
 216     outfile.write('%   to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
 217     outfile.write('%   and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
 218     outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
 219     outfile.write('%   "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
 220     outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
 221     outfile.write('%   "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
 222     outfile.write('% - Double-width characters have width 2; generated from\n')
 223     outfile.write('%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
 224     outfile.write('% - Default width for all other characters is 1.\n')
 225     outfile.write("WIDTH\n")
 226
 227 def process_width(outfile, ulines, dlines, elines, klines):
 228     '''ulines are lines from UnicodeData.txt.
 229     elines are lines from EastAsianWidth.txt containing characters with width
 230     “W” or “F”.
 231     dlines are lines from DerivedCoreProperties.txt which contain
 232     characters with the property “Default_Ignorable_Code_Point”.
 233     klines are lines from HangulSyllableType.txt which contain characters
 234     with syllable type “V” or “T”.
 235     '''
 236     # Wide and fullwidth characters have width 1
 237     width_dict = {}
 238     for line in elines:
 239         fields = line.split(";")
 240         if not '..' in fields[0]:
 241             code_points = (fields[0], fields[0])
 242         else:
 243             code_points = fields[0].split("..")
 244         for key in range(int(code_points[0], 16),
 245                          int(code_points[1], 16)+1):
 246             width_dict[key] = 2
 247
 248     # Nonspacing and enclosing marks have width 0
 249     for line in ulines:
 250         fields = line.split(";")
 251         if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
 252             width_dict[int(fields[0], 16)] = 0
 253
 254     # Conjoining vowel and trailing jamo have width 0
 255     for line in klines:
 256         fields = line.split(";")
 257         if not '..' in fields[0]:
 258             code_points = (fields[0], fields[0])
 259         else:
 260             code_points = fields[0].split("..")
 261         for key in range(int(code_points[0], 16),
 262                          int(code_points[1], 16)+1):
 263             width_dict[key] = 0
 264
 265     # “Default_Ignorable_Code_Point”s have width 0
 266     for line in dlines:
 267         fields = line.split(";")
 268         if not '..' in fields[0]:
 269             code_points = (fields[0], fields[0])
 270         else:
 271             code_points = fields[0].split("..")
 272         for key in range(int(code_points[0], 16),
 273                          int(code_points[1], 16)+1):
 274             width_dict[key] = 0 # default width is 1
 275
 276
 277     # Special case: U+00AD SOFT HYPHEN
 278     del width_dict[0x00AD]
 279
 280     # Special case: U+115F HANGUL CHOSEONG FILLER
 281     width_dict[0x115F] = 2
 282
 283     for key in list(range(0x3248, 0x3250)):
 284         # These are “A” which means we can decide whether to treat them
 285         # as “W” or “N” based on context:
 286         # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
 287         # For us, “W” seems better.
 288         width_dict[key] = 2
 289     for key in list(range(0x4DC0, 0x4E00)):
 290         width_dict[key] = 2
 291
 292     same_width_lists = []
 293     current_width_list = []
 294     for key in sorted(width_dict):
 295         if not current_width_list:
 296             current_width_list = [key]
 297         elif (key == current_width_list[-1] + 1
 298               and width_dict[key] == width_dict[current_width_list[0]]):
 299             current_width_list.append(key)
 300         else:
 301             same_width_lists.append(current_width_list)
 302             current_width_list = [key]
 303     if current_width_list:
 304         same_width_lists.append(current_width_list)
 305
 306     for same_width_list in same_width_lists:
 307         if len(same_width_list) == 1:
 308             outfile.write('{:s}\t{:d}\n'.format(
 309                 unicode_utils.ucs_symbol(same_width_list[0]),
 310                 width_dict[same_width_list[0]]))
 311         else:
 312             outfile.write('{:s}...{:s}\t{:d}\n'.format(
 313                 unicode_utils.ucs_symbol(same_width_list[0]),
 314                 unicode_utils.ucs_symbol(same_width_list[-1]),
 315                 width_dict[same_width_list[0]]))
 316
 317 if __name__ == "__main__":
 318     PARSER = argparse.ArgumentParser(
 319         description='''
 320         Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt, and HangulSyllableType.txt
 321         ''')
 322     PARSER.add_argument(
 323         '-u', '--unicode_data_file',
 324         nargs='?',
 325         type=str,
 326         default='UnicodeData.txt',
 327         help=('The UnicodeData.txt file to read, '
 328               + 'default: %(default)s'))
 329     PARSER.add_argument(
 330         '-d', '--derived_core_properties_file',
 331         nargs='?',
 332         type=str,
 333         default='DerivedCoreProperties.txt',
 334         help=('The DerivedCoreProperties.txt file to read, '
 335               + 'default: %(default)s'))
 336     PARSER.add_argument(
 337         '-e', '--east_asian_with_file',
 338         nargs='?',
 339         type=str,
 340         default='EastAsianWidth.txt',
 341         help=('The EastAsianWidth.txt file to read, '
 342               + 'default: %(default)s'))
 343     PARSER.add_argument(
 344         '-k', '--hangul_syllable_type_file',
 345         nargs='?',
 346         type=str,
 347         default='HangulSyllableType.txt',
 348         help=('The HangulSyllableType.txt file to read, '
 349               + 'default: %(default)s'))
 350     PARSER.add_argument(
 351         '--unicode_version',
 352         nargs='?',
 353         required=True,
 354         type=str,
 355         help='The Unicode version of the input files used.')
 356     ARGS = PARSER.parse_args()
 357
 358     unicode_utils.fill_attributes(ARGS.unicode_data_file)
 359     with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
 360         UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
 361     with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE:
 362         DERIVED_CORE_PROPERTIES_LINES = []
 363         for LINE in DERIVED_CORE_PROPERTIES_FILE:
 364             # If characters which are from reserved ranges
 365             # (i.e. not yet assigned code points)
 366             # are added to the WIDTH section of the UTF-8 file, then
 367             # “make check” produces “Unknown Character” errors for
 368             # these code points because such unassigned code points
 369             # are not in the CHARMAP section of the UTF-8 file.
 370             #
 371             # Therefore, we skip all reserved code points.
 372             if re.match(r'.*<reserved-.+>', LINE):
 373                 continue
 374             if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
 375                 DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
 376     with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
 377         EAST_ASIAN_WIDTH_LINES = []
 378         for LINE in EAST_ASIAN_WIDTH_FILE:
 379             if re.match(r'.*<reserved-.+>', LINE):
 380                 continue
 381             if re.match(r'^[^;]*;\s*[WF]', LINE):
 382                 EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
 383     with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE:
 384         HANGUL_SYLLABLE_TYPE_LINES = []
 385         for LINE in HANGUL_SYLLABLE_TYPE_FILE:
 386             if re.match(r'.*<reserved-.+>', LINE):
 387                 continue
 388             if re.match(r'^[^;]*;\s*[VT]', LINE):
 389                 HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
 390     with open('UTF-8', mode='w') as OUTFILE:
 391         # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
 392         write_header_charmap(OUTFILE)
 393         process_charmap(UNICODE_DATA_LINES, OUTFILE)
 394         OUTFILE.write("END CHARMAP\n\n")
 395         # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
 396         write_header_width(OUTFILE, ARGS.unicode_version)
 397         process_width(OUTFILE,
 398                       UNICODE_DATA_LINES,
 399                       DERIVED_CORE_PROPERTIES_LINES,
 400                       EAST_ASIAN_WIDTH_LINES,
 401                       HANGUL_SYLLABLE_TYPE_LINES)
 402         OUTFILE.write("END WIDTH\n")