2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2018 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from UnicodeData.txt and EastAsianWidth.txt files.

Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt

It will output UTF-8 file
'''
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.  A Hangul syllable name is the concatenation of
# one short name from each table (19 * 21 * 28 = 11172 syllables).

# 19 leading consonants (choseong); the empty entry is the filler.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# 21 medial vowels (jungseong).
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# 28 trailing consonants (jongseong); the empty first entry means
# "no final consonant".
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end: hexadecimal strings for the first and the last code
                point of the range (as found in UnicodeData.txt)
    outfile:    open text file the CHARMAP lines are written to
    name:       the character name shared by the whole range
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #  ranges, so they become printable and carry a width. Comment out
        #  surrogate ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1):
            # Decompose the syllable offset from U+AC00 into the three
            # jamo indices: 28 finals per medial, 21 medials per initial.
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        # Hangul syllables are fully expanded above; do not also emit
        # the generic 64-code-point range lines below.
        return
    # UnicodeData.txt file has contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # ...
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            # Last chunk: may be shorter than 64 code points, so it
            # ends at the real end of the range.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    ...
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            # Remember the “First” record; the matching “Last” record
            # below completes the range.
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # Strip the “, Last>” suffix to get the plain range name.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: integer Unicode code point.
    Returns the glibc charmap byte-sequence string for it.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    # for the surrogate boundary code points that appear in
    # UnicodeData.txt range records.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    # Maximum of 6 bytes per character kept for historical compatibility
    # with the existing glibc UTF-8 charmap.
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode 10.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed covered by Cf
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines, plines):
    '''Writes the WIDTH section of the UTF-8 file.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.
    '''
    # Maps code point (int) -> width (0 or 2); code points absent from
    # the dict get the default width 1 and are not written out.
    width_dict = {}
    # East Asian wide (“W”) and fullwidth (“F”) characters get width 2.
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    # Non-spacing marks and format/enclosing/combining characters
    # (general categories Cf, Me, Mn) get width 0.
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        # Conjoining Hangul jamo vowels/finals: zero width.
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        # Yijing hexagram symbols: treated as wide.
        width_dict[key] = 2

    # Compress runs of consecutive code points with equal width into
    # single WIDTH entries.
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))
if __name__ == "__main__":
    # Script-local imports: sys for argv handling, re for the line
    # filters below.  (The charmap/width helpers additionally rely on
    # glibc's unicode_utils helper being importable — TODO confirm the
    # module-level import survives in the full file.)
    import re
    import sys

    # Three input files are required (UnicodeData.txt, EastAsianWidth.txt,
    # PropList.txt) — sys.argv[3] is opened below, so require len >= 4.
    if len(sys.argv) < 4:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
        sys.exit(1)

    with open(sys.argv[1], mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            # Keep only wide (“W”) and fullwidth (“F”) characters.
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")