Correcting language code for Bhili and Tulu locales (bug 17475)
[glibc.git] / localedata / unicode-gen / utf8_gen.py
blobf1b88f5b29410e5d85d0ef7015ed222be1b89b78
1 #!/usr/bin/python3
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2014-2015 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <http://www.gnu.org/licenses/>.
20 '''glibc/localedata/charmaps/UTF-8 file generator script
22 This script generates a glibc/localedata/charmaps/UTF-8 file
23 from Unicode data.
25 Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
27 It will output UTF-8 file
28 '''
30 import sys
31 import re
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

# Short names of the 19 leading consonants (choseong), indexed by
# initial-jamo index.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# Short names of the 21 vowels (jungseong), indexed by medial-jamo index.
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# Short names of the 28 trailing consonants (jongseong, first entry is
# “no final consonant”), indexed by final-jamo index.
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points in the BMP (below 0x10000) use the 4-digit form
    “<UXXXX>”; everything above uses the 8-digit form “<UXXXXXXXX>”.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, digits)
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end: hexadecimal strings for the first and last code point
                of the range (as found in UnicodeData.txt).
    outfile:    the open output file object.
    name:       the range name with the “, First”/“, Last” part removed.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the precomposed syllable into its jamo indices:
            # 0xAC00 is the first Hangul syllable; there are 21 medials
            # and 28 finals per initial consonant (Unicode 3.0 book,
            # sections 3.11 and 4.4).
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        # Last (possibly shorter than 64 code points) chunk: close the
        # range at the real end code point and stop.
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10         DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80         <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf         <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80         <Plane 16 Private Use>
    '''
    # Holds the fields of a pending “…, First>” line until the matching
    # “…, Last>” line arrives.
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # fields[1][:-7] strips the trailing “, Last>”; re-append
            # the closing “>” to form e.g. “<CJK Ideograph Extension A>”.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: an integer Unicode code point.
    Returns the “/xNN” byte-sequence string used in glibc charmaps.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') raises UnicodeEncodeError for lone
    # surrogates (U+D800..U+DFFF). The original code used a hard-coded
    # six-entry table that only covered the exact range endpoints found
    # in UnicodeData.txt; computing the 3-byte sequence directly handles
    # every surrogate and avoids rebuilding a dict on each call. The
    # results for the six old table entries are unchanged.
    if 0xD800 <= code_point <= 0xDFFF:
        return '/x{:02x}/x{:02x}/x{:02x}'.format(
            0xE0 | (code_point >> 12),
            0x80 | ((code_point >> 6) & 0x3F),
            0x80 | (code_point & 0x3F))
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    # One write of implicitly concatenated literals instead of eight
    # separate write() calls; the emitted bytes are identical.
    outfile.write(
        "<code_set_name> UTF-8\n"
        "<comment_char> %\n"
        "<escape_char> /\n"
        "<mb_cur_min> 1\n"
        "<mb_cur_max> 6\n"
        "\n"
        "% CHARMAP generated using utf8_gen.py\n"
        "% alias ISO-10646/UTF-8\n"
        "CHARMAP\n")
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file.

    The comment lines record the grep commands whose output the WIDTH
    entries were generated from, so the table can be reproduced.
    '''
    outfile.write('% Character width according to Unicode 7.0.0.\n')
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
    # Not needed, covered by Cf:
    # outfile.write("% - Zero width characters have width 0; generated from\n")
    # outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")
def process_width(outfile, ulines, elines):
    '''Write the WIDTH section entries to the output file.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt.
    '''
    widths = {}
    # Non-spacing marks (bidi class NSM, field 4) and format control
    # characters (general category Cf, field 2) get width 0.
    for uline in ulines:
        ufields = uline.split(";")
        if ufields[4] == "NSM" or ufields[2] == "Cf":
            code = int(ufields[0], 16)
            widths[code] = ucs_symbol(code) + '\t0'

    # If an entry in EastAsianWidth.txt is found, it overrides entries in
    # UnicodeData.txt:
    for eline in elines:
        efields = eline.split(";")
        if '..' not in efields[0]:
            # Single code point.
            code = int(efields[0], 16)
            widths[code] = ucs_symbol(code) + '\t2'
        else:
            # A “XXXX..YYYY” range: drop every width-0 entry inside the
            # range, then record one range entry of width 2.
            first_hex, last_hex = efields[0].split("..")
            first = int(first_hex, 16)
            last = int(last_hex, 16)
            for code in range(first, last + 1):
                widths.pop(code, None)
            widths[first] = '{:s}...{:s}\t2'.format(
                ucs_symbol(first), ucs_symbol(last))

    for code in sorted(widths):
        outfile.write(widths[code] + '\n')
if __name__ == "__main__":
    # Command-line driver: reads UnicodeData.txt and EastAsianWidth.txt
    # and writes the generated charmap to a file named “UTF-8” in the
    # current directory.
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # If characters from EastAsianWidth.txt which are
                # from reserved ranges (i.e. not yet assigned code points)
                # are added to the WIDTH section of the UTF-8 file, then
                # “make check” produces “Unknown Character” errors for
                # these code points because such unassigned code points
                # are not in the CHARMAP section of the UTF-8 file.
                #
                # Therefore, we skip all reserved code points when reading
                # the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                    continue
                # Keep only Wide (W) and Fullwidth (F) entries; these are
                # the width-2 characters.
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
            write_header_width(OUTFILE)
            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
            OUTFILE.write("END WIDTH\n")