tests: Use Unicode typography in gdbus-export test
[glib.git] / glib / update-gtranslit.py
blob01f7c7f84f424684d6c2e30658a44cd6ff3e06bc
1 #!/usr/bin/env python3
3 # Run this script like so:
5 # ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
7 import sys, os
# Directory containing the glibc localedata sources, taken from the
# command line (see usage comment above).
localedir = sys.argv[1]
# returns true if the name looks like a POSIX locale name
def looks_like_locale(name):
    """Return True if *name* looks like 'lang_LAND' or 'lang_LAND@variant'.

    The language part must be 2 or 3 letters and the land (territory)
    part must be 2 letters.  Note: the original expression relied on
    'or' binding looser than 'and', so a 2-letter language passed with
    *any* territory length; the parentheses restore the obvious intent.
    """
    name, _, variant = name.partition('@')

    if '_' not in name:
        return False

    lang, _, land = name.partition('_')

    return (len(lang) == 2 or len(lang) == 3) and len(land) == 2
# handles <U1234> style escapes
def unescape(string):
    """Expand each <Uxxxx> escape in *string* to the character it names.

    Literal text outside the escapes is passed through unchanged.  (The
    original implementation silently discarded any literal text that
    preceded an escape; in practice the inputs are either fully escaped
    or contain no escapes at all, but preserving it is strictly safer.)
    """
    chunks = []

    n = len(string)
    i = 0

    while i < n:
        start_escape = string.find('<', i)

        if start_escape == -1:
            # no further escapes: keep the rest of the string verbatim
            chunks.append(string[i:])
            break

        # keep any literal text preceding the escape
        chunks.append(string[i:start_escape])

        assert string[start_escape:start_escape + 2] == '<U'
        start_escape += 2

        end_escape = string.find('>', start_escape)
        assert end_escape != -1

        # the digits between '<U' and '>' are a hex code point
        chunks.append(chr(int(string[start_escape:end_escape], 16)))
        i = end_escape + 1

    return ''.join(chunks)
# Checks if a string is ascii
def is_ascii(string):
    """Return True when every character of *string* is 7-bit ASCII."""
    return not any(ord(ch) >= 0x80 for ch in string)
# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# in a file.
class Mapping:
    def __init__(self):
        # Id handed out by the serialiser; filled in lazily by serialise().
        self.serialised = None
        # origin string -> ascii replacement ('' means IGNORE)
        self.mapping = {}

    # Scans a string like
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key.  In the case of
    # IGNORE, stores the empty string.
    def consider_mapping_line(self, line):
        # Appending ' % comment' guarantees split() yields at least three
        # fields even when the source line carries no comment of its own.
        key, value, rest = (line + ' % comment').split(maxsplit=2)

        key = unescape(key)

        for alternative in value.split(';'):
            if alternative[0] == '"' and alternative[-1] == '"':
                # quoted replacement, eg. "<U0041><U0045>"
                unescaped = unescape(alternative[1:-1])
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative[0] == '<' and alternative[-1] == '>':
                # bare single-codepoint replacement, eg. <U0041>
                unescaped = unescape(alternative)
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative == 'IGNORE':
                self.mapping[key] = ''
                break

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes
    def merge_mapping(self, changes):
        for key in changes.mapping:
            if key in self.mapping:
                assert self.mapping[key] == changes.mapping[key]

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    def serialise(self, serialiser):
        # PEP 8: compare against None with 'is', not '=='
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised
# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        self.serialised = None
        self.name = name
        self.chain = []
        # number of parents referencing this chain; see get_flattened()
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    def read_from_file(self, filename):
        """Parse one glibc locale source file, appending its transliteration
        mappings and any copied/included chains to self.chain."""
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        # surrogateescape lets stray non-ASCII bytes round-trip without
        # decode errors; 'with' ensures the file handle is always closed
        # (the original leaked it).
        with open(filename, encoding='ascii', errors='surrogateescape') as fp:
            for line in fp:
                line = line.strip()

                if in_lc_ctype and line == 'END LC_CTYPE':
                    break

                if line.startswith('copy') or line.startswith('include'):
                    # flush the mapping built so far, then chain the
                    # referenced file so later entries override it
                    if current_mapping:
                        self.chain.append(current_mapping)

                    copyname = unescape(line.split('"', 3)[1])
                    copyfile = get_chain(copyname)
                    self.chain.append(copyfile)
                    copyfile.links += 1

                    current_mapping = None

                elif line == 'translit_start':
                    in_translit = True

                elif line == 'translit_end':
                    in_translit = False

                elif in_translit and line.startswith('<U'):
                    if not current_mapping:
                        current_mapping = Mapping()

                    current_mapping.consider_mapping_line(line)

                elif line == '' or line.startswith('%'):
                    pass

                elif line.startswith('default_missing'):
                    # eg: 'default_missing <U003F>' -- not used by us.
                    # NOTE: the original wrote "elif 'default_missing <U003F>':",
                    # a non-empty string literal that is always true, which
                    # silently swallowed this and every later branch.
                    pass

                elif in_translit:
                    print('unknown line:', line)
                    assert False

                elif line == 'LC_CTYPE':
                    in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly.  Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])
        else:
            return [self]

    def serialise(self, serialiser):
        # PEP 8: compare against None with 'is', not '=='
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
                    # We have two mappings in a row.  Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised
# Chain cache -- allows sharing of common chains
chains = {}

def get_chain(name):
    """Return the Chain for *name*, constructing and caching it on first use."""
    # PEP 8: 'name not in chains' reads better than 'not name in chains'
    if name not in chains:
        chains[name] = Chain(name)

    return chains[name]
# Remove the country name from a locale, preserving variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    """Strip the territory part from a locale name, keeping any '@variant'."""
    if '@' in string:
        base, variant = string.split('@', 1)
        suffix = '@' + variant
    else:
        base, suffix = string, ''
    return base.split('_', 1)[0] + suffix
def encode_range(start, end):
    """Pack the half-open range [start, end) into a 16-bit value.

    Layout: the top bit is always set, the next 3 bits hold the length
    and the low 12 bits hold the start offset.  The asserts keep the two
    fields from overlapping, so OR-ing them together is exact.
    """
    assert start <= end
    span = end - start

    assert start < 0x1000
    assert span < 0x8

    encoded = 0x8000 | (span << 12) | start

    assert encoded < 0x10000

    return encoded
def c_pair_array(array):
    """Render a list of (a, b) pairs as a C struct-array initialiser."""
    entries = ['{ %u, %u }' % (first, second) for first, second in array]
    return '{ ' + ', '.join(entries) + ' };'
class Serialiser:
    """Collects mappings and chains, assigns them small integer ids, and
    emits them as the C data tables consumed by gtranslit.c.

    Ids 0..127 name mappings; ids 128..255 name chains.
    """
    def __init__(self):
        self.mappings = []   # list of mapping dicts; list position == id
        self.chains = []     # list of id-lists; list position == id - 128
        self.locales = {}    # locale name -> item id

    def add_mapping(self, mapping):
        # Deduplicate: identical dicts share one id (linear scan is fine
        # for the small number of mappings this generator handles).
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        # mapping ids must fit in the low half of a guint8
        assert mapping_id < 128
        return mapping_id

    def add_chain(self, chain):
        # Deduplicate chains the same way as mappings.
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        # chain ids occupy the upper half of the guint8 id space
        return 128 + chain_id

    def add_locale(self, name, item_id):
        self.locales[name] = item_id

    def add_default(self, item_id):
        # Item used for the C locale and as the fallback for any locale
        # whose entry would be identical to it (see optimise_locales).
        self.default = item_id

    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [locale for locale in self.locales if remove_country(locale) == language]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                # Every region agrees: collapse them into one bare-language entry.
                self.locales[language] = item_id
                for locale in locales:
                    del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if '@' in locale):
            base, _, _ = variant.partition('@')
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]

    def to_c(self):
        """Print the generated gtranslit-data.h tables to stdout."""
        src_table = ''        # concatenated multi-char / wide source keys
        ascii_table = ''      # concatenated multi-char ascii replacements
        mappings_table = []   # (src_range, ascii_range) pairs, all mappings
        mapping_ranges = []   # (start index in mappings_table, entry count)
        chains_table = []     # flattened, 0xff-terminated chains
        chain_starts = []     # start offset of each chain in chains_table
        locale_names = ''     # nul-separated locale name strings
        locale_index = []     # (offset into locale_names, item id)
        max_lookup = 0        # longest multi-char source key seen
        max_localename = 0    # longest locale name seen

        for mapping in self.mappings:
            mapping_ranges.append ((len(mappings_table), len(mapping)))

            # sorted() so the per-mapping entries can be binary-searched
            for key in sorted(mapping):
                # Single characters below U+8000 are stored directly;
                # anything else becomes an encoded range into src_table.
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    src_range = ord(key[0])
                else:
                    # reuse an existing occurrence of the key if possible
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10ffff for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                        max_lookup = max(max_lookup, len(key))
                    else:
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                # Same scheme for the replacement: single ascii chars are
                # stored directly, longer strings go via ascii_table.
                if len(value) == 1 and ord(value[0]) < 0x80:
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append ((src_range, ascii_range))

            mapping_end = len(mappings_table)  # NOTE(review): unused

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            # reversed: later chain entries override earlier ones, so the
            # C lookup walks from the end; 0xff terminates each chain
            for item_id in reversed(chain):
                assert item_id < 0xff
                chains_table.append(item_id)
            chains_table.append(0xff)

        # sorted() so the locale index can be binary-searched by name
        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7f for c in locale)
            locale_names += (locale + '\0')

            item_id = self.locales[locale]

            assert name_offset < 256
            assert item_id < 256
            locale_index.append((name_offset, item_id))

        print('/* Generated by update-gtranslit.py */')
        print('#define MAX_KEY_SIZE', max_lookup)
        print('#define MAX_LOCALE_NAME', max_localename)
        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
        # cannot do this in plain ascii because of trigraphs... :(
        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
        print('static const struct mapping_entry mappings_table[] =', c_pair_array (mappings_table))
        print('static const struct mapping_range mapping_ranges[] =', c_pair_array (mapping_ranges))
        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
        print('static const struct locale_entry locale_index[] = ', c_pair_array (locale_index))
        print('static const guint8 default_item_id = %u;' % (self.default,))

    def dump(self):
        # debugging aid: show the raw collected state
        print(self.mappings)
        print(self.chains)
        print(self.locales)
# Build a chain for every file in the locale directory whose name looks
# like a POSIX locale, counting one link per top-level reference.
locale_chains = []
for entry in os.listdir(localedir):
    if not looks_like_locale(entry):
        continue
    locale_chain = get_chain(entry)
    locale_chain.links += 1
    locale_chains.append(locale_chain)

serialiser = Serialiser()

for locale_chain in locale_chains:
    serialiser.add_locale(locale_chain.name, locale_chain.serialise(serialiser))

# The default item (used for the C locale) chains the generic 'i18n'
# rules with 'translit_combining'.
default_ids = [get_chain('i18n').serialise(serialiser),
               get_chain('translit_combining').serialise(serialiser)]
serialiser.add_default(serialiser.add_chain(default_ids))

serialiser.optimise_locales()

serialiser.to_c()