localedata/unicode-gen/gen_translit_compat.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 #
   4 # Generate a translit_compat file from a UnicodeData file.
   5 # Copyright (C) 2015-2017 Free Software Foundation, Inc.
   6 # This file is part of the GNU C Library.
   7 #
   8 # The GNU C Library is free software; you can redistribute it and/or
   9 # modify it under the terms of the GNU Lesser General Public
  10 # License as published by the Free Software Foundation; either
  11 # version 2.1 of the License, or (at your option) any later version.
  12 #
  13 # The GNU C Library is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 # Lesser General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU Lesser General Public
  19 # License along with the GNU C Library; if not, see
  20 # <http://www.gnu.org/licenses/>.
  21
  22 '''
  23 Generate a translit_compat file from UnicodeData.txt
  24
  25 To see how this script is used, call it with the “-h” option:
  26
   27     $ ./gen_translit_compat.py -h
  28     … prints usage message …
  29 '''
  30
  31 import argparse
  32 import time
  33 import unicode_utils
  34
  35 def read_input_file(filename):
  36     '''Reads the original glibc translit_compat file to get the
  37     original head and tail.
  38
  39     We want to replace only the part of the file between
  40     “translit_start” and “translit_end”
  41     '''
  42     head = tail = ''
  43     with open(filename, mode='r') as translit_file:
  44         for line in translit_file:
  45             head = head + line
  46             if line.startswith('translit_start'):
  47                 break
  48         for line in translit_file:
  49             if line.startswith('translit_end'):
  50                 tail = line
  51                 break
  52         for line in translit_file:
  53             tail = tail + line
  54     return (head, tail)
  55
  56 def output_head(translit_file, unicode_version, head=''):
  57     '''Write the header of the output file, i.e. the part of the file
  58     before the “translit_start” line.
  59     '''
  60     if ARGS.input_file and head:
  61         translit_file.write(head)
  62     else:
  63         translit_file.write('escape_char /\n')
  64         translit_file.write('comment_char %\n')
  65         translit_file.write(unicode_utils.COMMENT_HEADER)
  66         translit_file.write('\n')
  67         translit_file.write('% Transliterations of compatibility characters ')
  68         translit_file.write('and ligatures.\n')
  69         translit_file.write('% Generated automatically from UnicodeData.txt '
  70                             + 'by gen_translit_compat.py '
  71                             + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
  72                             + 'for Unicode {:s}.\n'.format(unicode_version))
  73         translit_file.write('\n')
  74         translit_file.write('LC_CTYPE\n')
  75         translit_file.write('\n')
  76         translit_file.write('translit_start\n')
  77
  78 def output_tail(translit_file, tail=''):
  79     '''Write the tail of the output file'''
  80     if ARGS.input_file and tail:
  81         translit_file.write(tail)
  82     else:
  83         translit_file.write('translit_end\n')
  84         translit_file.write('\n')
  85         translit_file.write('END LC_CTYPE\n')
  86
  87 def compatibility_decompose(code_point):
  88     '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
  89
  90     “The compatibility decomposition is formed by recursively applying
  91     the canonical and compatibility mappings, then applying the
  92     Canonical Ordering Algorithm.”
  93
  94     We don’t do the canonical decomposition here because this is
  95     done in gen_translit_combining.py to generate translit_combining.
  96
  97     And we ignore some of the possible compatibility formatting tags
  98     here. Some of them are used in other translit_* files, not
  99     translit_compat:
 100
 101     <font>:   translit_font
 102     <circle>: translit_circle
 103     <wide>:   translit_wide
 104     <narrow>: translit_narrow
 105     <square>: translit_cjk_compat
 106     <fraction>: translit_fraction
 107
 108     And we ignore
 109
 110     <noBreak>, <initial>, <medial>, <final>, <isolated>
 111
 112     because they seem to be not useful for transliteration.
 113     '''
 114     decomposition = unicode_utils.UNICODE_ATTRIBUTES[
 115         code_point]['decomposition']
 116     compatibility_tags = (
 117         '<compat>', '<super>', '<sub>', '<vertical>')
 118     for compatibility_tag in compatibility_tags:
 119         if decomposition.startswith(compatibility_tag):
 120             decomposition = decomposition[len(compatibility_tag)+1:]
 121             decomposed_code_points = [int(x, 16)
 122                                       for x in decomposition.split(' ')]
 123             if (len(decomposed_code_points) > 1
 124                     and decomposed_code_points[0] == 0x0020
 125                     and decomposed_code_points[1] >= 0x0300
 126                     and decomposed_code_points[1] <= 0x03FF):
 127                 # Decomposes into a space followed by a combining character.
 128                 # This is not useful fo transliteration.
 129                 return []
 130             else:
 131                 return_value = []
 132                 for index in range(0, len(decomposed_code_points)):
 133                     cd_code_points = compatibility_decompose(
 134                         decomposed_code_points[index])
 135                     if cd_code_points:
 136                         return_value += cd_code_points
 137                     else:
 138                         return_value += [decomposed_code_points[index]]
 139                 return return_value
 140     return []
 141
 142 def special_decompose(code_point_list):
 143     '''
 144     Decompositions which are not in UnicodeData.txt at all but which
 145     were used in the original translit_compat file in glibc and
 146     which seem to make sense.  I want to keep the update of
 147     translit_compat close to the spirit of the original file,
 148     therefore I added this special decomposition rules here.
 149     '''
 150     special_decompose_dict = {
 151         (0x03BC,): [0x0075], # μ → u
 152         (0x02BC,): [0x0027], # ʼ → '
 153     }
 154     if tuple(code_point_list) in special_decompose_dict:
 155         return special_decompose_dict[tuple(code_point_list)]
 156     else:
 157         return code_point_list
 158
 159 def special_ligature_decompose(code_point):
 160     '''
 161     Decompositions for ligatures which are not in UnicodeData.txt at
 162     all but which were used in the original translit_compat file in
 163     glibc and which seem to make sense.  I want to keep the update of
 164     translit_compat close to the spirit of the original file,
 165     therefore I added these special ligature decomposition rules here.
 166
 167     '''
 168     special_ligature_decompose_dict = {
 169         0x00E6: [0x0061, 0x0065], # æ → ae
 170         0x00C6: [0x0041, 0x0045], # Æ → AE
 171         # These following 5 special ligature decompositions were
 172         # in the original glibc/localedata/locales/translit_compat file
 173         0x0152: [0x004F, 0x0045], # Œ → OE
 174         0x0153: [0x006F, 0x0065], # œ → oe
 175         0x05F0: [0x05D5, 0x05D5], # װ → וו
 176         0x05F1: [0x05D5, 0x05D9], # ױ → וי
 177         0x05F2: [0x05D9, 0x05D9], # ײ → יי
 178         # The following special ligature decompositions were
 179         # not in the original glibc/localedata/locales/translit_compat file
 180         # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
 181         # → U+041D CYRILLIC CAPITAL LETTER EN,
 182         #   U+0413 CYRILLIC CAPITAL LETTER GHE
 183         0x04A4: [0x041D, 0x0413], # Ҥ → НГ
 184         # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
 185         # → U+043D CYRILLIC SMALL LETTER EN,
 186         #   U+0433 CYRILLIC SMALL LETTER GHE
 187         0x04A5: [0x043D, 0x0433], # ҥ → нг
 188         # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
 189         # → U+0422 CYRILLIC CAPITAL LETTER TE,
 190         #   U+0426 CYRILLIC CAPITAL LETTER TSE
 191         0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
 192         # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
 193         # → U+0442 CYRILLIC SMALL LETTER TE,
 194         #   U+0446 CYRILLIC SMALL LETTER TSE
 195         0x04B5: [0x0442, 0x0446], # ҵ → тц
 196         # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
 197         # → U+0410 CYRILLIC CAPITAL LETTER A
 198         #   U+0415;CYRILLIC CAPITAL LETTER IE
 199         0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
 200         # U+04D5 CYRILLIC SMALL LIGATURE A IE
 201         # → U+0430 CYRILLIC SMALL LETTER A,
 202         #   U+0435 CYRILLIC SMALL LETTER IE
 203         0x04D5: [0x0430, 0x0435], # ӕ → ае
 204         # I am not sure what to do with the following ligatures
 205         # maybe it makes no sense to decompose them:
 206         # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
 207         # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
 208         # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
 209         # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
 210         # U+fe20 COMBINING LIGATURE LEFT HALF
 211         # U+fe21 COMBINING LIGATURE RIGHT HALF
 212         # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
 213         # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
 214         # U+11176 MAHAJANI LIGATURE SHRI
 215         # U+1f670 SCRIPT LIGATURE ET ORNAMENT
 216         # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
 217         # U+1f672 LIGATURE OPEN ET ORNAMENT
 218         # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
 219     }
 220     if code_point in special_ligature_decompose_dict:
 221         return special_ligature_decompose_dict[code_point]
 222     else:
 223         return [code_point]
 224
 225 def output_transliteration(translit_file):
 226     '''Write the new transliteration to the output file'''
 227     translit_file.write('\n')
 228     for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
 229         name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
 230         decomposed_code_points = [compatibility_decompose(code_point)]
 231         if not decomposed_code_points[0]:
 232             if special_decompose([code_point]) != [code_point]:
 233                 decomposed_code_points[0] = special_decompose([code_point])
 234         else:
 235             special_decomposed_code_points = []
 236             while True:
 237                 special_decomposed_code_points = special_decompose(
 238                     decomposed_code_points[-1])
 239                 if (special_decomposed_code_points
 240                         != decomposed_code_points[-1]):
 241                     decomposed_code_points.append(
 242                         special_decomposed_code_points)
 243                     continue
 244                 special_decomposed_code_points = []
 245                 for decomposed_code_point in decomposed_code_points[-1]:
 246                     special_decomposed_code_points += special_decompose(
 247                         [decomposed_code_point])
 248                 if (special_decomposed_code_points
 249                         == decomposed_code_points[-1]):
 250                     break
 251                 decomposed_code_points.append(
 252                     special_decomposed_code_points)
 253         if decomposed_code_points[0]:
 254             translit_file.write('% {:s}\n'.format(name))
 255             translit_file.write('{:s} '.format(
 256                 unicode_utils.ucs_symbol(code_point)))
 257             for index in range(0, len(decomposed_code_points)):
 258                 if index > 0:
 259                     translit_file.write(';')
 260                 translit_file.write('"')
 261                 for decomposed_code_point in decomposed_code_points[index]:
 262                     translit_file.write('{:s}'.format(
 263                         unicode_utils.ucs_symbol(decomposed_code_point)))
 264                 translit_file.write('"')
 265             translit_file.write('\n')
 266         elif 'LIGATURE' in name and 'ARABIC' not in name:
 267             decomposed_code_points = special_ligature_decompose(code_point)
 268             if decomposed_code_points[0] != code_point:
 269                 translit_file.write('% {:s}\n'.format(name))
 270                 translit_file.write('{:s} '.format(
 271                     unicode_utils.ucs_symbol(code_point)))
 272                 translit_file.write('"')
 273                 for decomposed_code_point in decomposed_code_points:
 274                     translit_file.write('{:s}'.format(
 275                         unicode_utils.ucs_symbol(decomposed_code_point)))
 276                 translit_file.write('"')
 277                 translit_file.write('\n')
 278             else:
 279                 print('Warning: unhandled ligature: {:x} {:s}'.format(
 280                     code_point, name))
 281     translit_file.write('\n')
 282
 283 if __name__ == "__main__":
 284     PARSER = argparse.ArgumentParser(
 285         description='''
 286         Generate a translit_compat file from UnicodeData.txt.
 287         ''')
 288     PARSER.add_argument(
 289         '-u', '--unicode_data_file',
 290         nargs='?',
 291         type=str,
 292         default='UnicodeData.txt',
 293         help=('The UnicodeData.txt file to read, '
 294               + 'default: %(default)s'))
 295     PARSER.add_argument(
 296         '-i', '--input_file',
 297         nargs='?',
 298         type=str,
 299         help=''' The original glibc/localedata/locales/translit_compat
 300         file.''')
 301     PARSER.add_argument(
 302         '-o', '--output_file',
 303         nargs='?',
 304         type=str,
 305         default='translit_compat.new',
 306         help='''The new translit_compat file, default: %(default)s.  If the
 307         original glibc/localedata/locales/translit_compat file has
 308         been given as an option, the header up to the
 309         “translit_start” line and the tail from the “translit_end”
 310         line to the end of the file will be copied unchanged into the
 311         output file.  ''')
 312     PARSER.add_argument(
 313         '--unicode_version',
 314         nargs='?',
 315         required=True,
 316         type=str,
 317         help='The Unicode version of the input files used.')
 318     ARGS = PARSER.parse_args()
 319
 320     unicode_utils.fill_attributes(ARGS.unicode_data_file)
 321     HEAD = TAIL = ''
 322     if ARGS.input_file:
 323         (HEAD, TAIL) = read_input_file(ARGS.input_file)
 324     with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
 325         output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
 326         output_transliteration(TRANSLIT_FILE)
 327         output_tail(TRANSLIT_FILE, tail=TAIL)