# Run this script like so:
#
#   ./update-gtranslit.py /path/to/glibc/localedata/locales > gtranslit-data.h
import os
import sys

# Path to glibc's localedata/locales directory, given on the command line.
# All locale source files referenced below are read relative to it.
localedir = sys.argv[1]
# returns true if the name looks like a POSIX locale name
def looks_like_locale(name):
    """Return True if `name` looks like a POSIX locale name.

    A locale name is lang[_LAND][@variant], where lang is a two- or
    three-letter language code and LAND is a two-letter territory code.
    """
    # Strip any '@variant' suffix before inspecting the base name.
    name, _, variant = name.partition('@')

    # NOTE(review): original lines 14-17 are missing from this chunk; if
    # they contained an additional early-out, it should be restored.
    lang, _, land = name.partition('_')

    # 'and' binds tighter than 'or': a 3-letter lang also needs a
    # 2-letter territory to qualify.
    return len(lang) == 2 or len(lang) == 3 and len(land) == 2
# handles <U1234> style escapes
def unescape(string):
    """Expand '<UXXXX>' escapes in `string` into the characters they name.

    Text outside the escapes is passed through unchanged.
    """
    chunks = []
    i = 0

    while True:
        start_escape = string.find('<', i)

        if start_escape == -1:
            # No more escapes: keep the tail and stop.
            chunks.append(string[i:])
            break

        # Literal text between the previous escape and this one.
        chunks.append(string[i:start_escape])
        assert string[start_escape:start_escape + 2] == '<U'

        end_escape = string.find('>', start_escape)
        assert end_escape != -1

        # Skip the '<U' prefix; the digits up to '>' are the hex codepoint.
        # (The garbled source dropped the '+ 2' here, which would have made
        # int() choke on the '<U' prefix.)
        chunks.append(chr(int(string[start_escape + 2:end_escape], 16)))
        i = end_escape + 1

    return ''.join(chunks)
# Checks if a string is ascii
def is_ascii(string):
    """Return True if every character of `string` is 7-bit ASCII."""
    return all(ord(c) < 0x80 for c in string)
# A Mapping is a map from non-ascii strings to ascii strings.
#
# It corresponds to a sequence of one or more mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
class Mapping:
    def __init__(self):
        # NOTE(review): the original __init__ is missing from this garbled
        # chunk; both attributes are required by the methods below.
        self.mapping = {}       # escaped source key -> ascii replacement
        self.serialised = None  # id assigned by Serialiser.add_mapping()

    # Scans a line like:
    #
    #   <U00C4> "<U0041><U0308>";"<U0041><U0045>" % LATIN CAPITAL LETTER A WITH DIAERESIS.
    #
    # and adds the first all-ascii choice (or IGNORE) to the mapping
    # dictionary, with the origin string as the key. In the case of
    # IGNORE, stores the empty string.
    def consider_mapping_line(self, line):
        # Append a dummy comment so the whitespace split always yields
        # three fields, even when the line carries no trailing comment.
        key, value, rest = (line + ' % comment').split(maxsplit=2)

        for alternative in value.split(';'):
            if alternative.startswith('"') and alternative.endswith('"'):
                # Quoted replacement string: strip the quotes.
                unescaped = unescape(alternative[1:-1])
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    # stop at the first usable alternative (the 'break'
                    # lines were dropped from this garbled chunk but are
                    # implied by the "first all-ascii choice" contract)
                    break

            elif alternative.startswith('<') and alternative.endswith('>'):
                # Bare <Uxxxx> replacement character.
                unescaped = unescape(alternative)
                if is_ascii(unescaped):
                    self.mapping[key] = unescaped
                    break

            elif alternative == 'IGNORE':
                # IGNORE means "drop the character": store the empty string.
                self.mapping[key] = ''
                break

    # Performs a normal dictionary merge, but ensures that there are no
    # conflicting entries between the original dictionary and the requested
    # changes.
    def merge_mapping(self, changes):
        for key in changes.mapping:
            if key in self.mapping:
                assert self.mapping[key] == changes.mapping[key]

        self.mapping.update(changes.mapping)

    # Can't get much flatter...
    def get_flattened(self):
        return [self]

    def serialise(self, serialiser):
        # Serialise once; cache the id so repeated calls are free.
        if self.serialised is None:
            self.serialised = serialiser.add_mapping(self.mapping)

        return self.serialised
# A Chain is a sequence of mappings and chains.
#
# A chain contains another chain whenever "copy" or "include" is
# encountered in a source file.
#
# A chain contains a mapping whenever a sequence of mapping lines:
#
#   <U00C4> "<U0041><U0308>";"<U0041><U0045>"
#
# is encountered in a file.
#
# The order of lookup is reverse: later entries override earlier ones.
class Chain:
    def __init__(self, name):
        self.name = name
        self.chain = []         # Mapping and Chain items, in file order
        self.serialised = None  # id assigned on first serialise()
        # NOTE(review): reconstructed -- the "only one link" optimisation
        # in get_flattened() needs a reference count, which get_chain()
        # increments; verify against the upstream script.
        self.links = 0

        self.read_from_file(os.path.join(localedir, name))

    def read_from_file(self, filename):
        """Parse the LC_CTYPE translit section of a glibc locale file.

        NOTE(review): much of this parser's scaffolding (flag set-up, the
        read loop, branch guards) was missing from the garbled chunk and
        has been reconstructed around the visible statements.
        """
        current_mapping = None
        in_lc_ctype = False
        in_translit = False

        # Locale files may contain non-ascii bytes; surrogateescape keeps
        # them round-trippable without decoding errors.
        with open(filename, encoding='ascii', errors='surrogateescape') as fp:
            for line in fp:
                line = line.strip()

                if in_lc_ctype:
                    if line == 'END LC_CTYPE':
                        break

                    if line.startswith('copy') or line.startswith('include'):
                        # Flush any mapping built up so far, then link the
                        # referenced file in at this position.
                        if current_mapping:
                            self.chain.append(current_mapping)

                        copyname = unescape(line.split('"', 3)[1])
                        copyfile = get_chain(copyname)
                        self.chain.append(copyfile)

                        current_mapping = None

                    elif line == 'translit_start':
                        in_translit = True

                    elif line == 'translit_end':
                        in_translit = False

                    elif in_translit and line.startswith('<U'):
                        if not current_mapping:
                            current_mapping = Mapping()

                        current_mapping.consider_mapping_line(line)

                    elif line == '' or line.startswith('%'):
                        pass  # blank line or comment

                    elif line.startswith('default_missing'):
                        # NOTE(review): the garbled source read
                        # "elif 'default_missing <U003F>':" -- a non-empty
                        # string literal, i.e. always true, which silently
                        # swallowed every remaining line. Match the
                        # directive explicitly instead.
                        pass

                    else:
                        print('unknown line:', line)

                elif line == 'LC_CTYPE':
                    in_lc_ctype = True

        if current_mapping:
            self.chain.append(current_mapping)

    # If there is only one link to this chain, we may as well just
    # return the contents of the chain so that they can be merged into
    # our sole parent directly. Otherwise, return ourselves.
    def get_flattened(self):
        if self.links == 1:
            return sum((item.get_flattened() for item in self.chain), [])

        return [self]

    def serialise(self, serialiser):
        if self.serialised is None:
            # Before we serialise, see if we can optimise a bit
            self.chain = sum((item.get_flattened() for item in self.chain), [])

            # Merge runs of adjacent Mapping items pairwise.
            i = 0
            while i < len(self.chain) - 1:
                if isinstance(self.chain[i], Mapping) and isinstance(self.chain[i + 1], Mapping):
                    # We have two mappings in a row. Try to merge them.
                    self.chain[i].merge_mapping(self.chain[i + 1])
                    del self.chain[i + 1]
                else:
                    i += 1

            # If all that is left is one item, just serialise that directly
            if len(self.chain) == 1:
                self.serialised = self.chain[0].serialise(serialiser)
            else:
                ids = [item.serialise(serialiser) for item in self.chain]
                self.serialised = serialiser.add_chain(ids)

        return self.serialised
# Chain cache -- allows sharing of common chains
chains = {}

def get_chain(name):
    """Return the (cached) Chain for locale file `name`, bumping its link count."""
    if name not in chains:
        chains[name] = Chain(name)

    # Count how many parents reference this chain; Chain.get_flattened()
    # inlines chains that have exactly one link.
    chains[name].links += 1

    return chains[name]
# Remove the country name from a locale, preserving variant
# eg: 'sr_RS@latin' -> 'sr@latin'
def remove_country(string):
    """Drop the '_LAND' territory part of a locale name, keeping any @variant."""
    base, at, variant = string.partition('@')
    lang, _, land = base.partition('_')
    # 'at' is '@' when a variant was present, '' otherwise, so the variant
    # (and its separator) survive untouched.
    return lang + at + variant
def encode_range(start, end):
    """Pack the table span [start, end) into a 16-bit reference.

    Layout: bit 15 set marks a range (as opposed to a direct character
    value), bits 12-14 hold the length and bits 0-11 hold the start
    offset.
    """
    length = end - start

    # start must fit in 12 bits, length in 3 (the final assert below
    # would catch any overflow anyway).
    assert start < 0x1000
    assert length < 0x8

    result = 0x8000 + (length << 12) + start

    assert result < 0x10000

    return result
def c_pair_array(array):
    """Render a list of 2-tuples as the body of a C struct-initialiser array."""
    return '{ ' + ', '.join('{ %u, %u }' % pair for pair in array) + ' };'
# Collects every Mapping and Chain, deduplicates them, and emits the flat
# lookup tables that make up gtranslit-data.h.
class Serialiser:
    def __init__(self):
        # NOTE(review): the original class header and __init__ are missing
        # from this garbled chunk; these containers are required by the
        # methods below.
        self.mappings = []  # mapping dicts; list index == mapping id
        self.chains = []    # id-lists; list index + 128 == chain id
        self.locales = {}   # locale name -> item id
        # self.default is assigned later by add_default()

    def add_mapping(self, mapping):
        """Intern `mapping` and return its id (0..127)."""
        if mapping in self.mappings:
            mapping_id = self.mappings.index(mapping)
        else:
            mapping_id = len(self.mappings)
            self.mappings.append(mapping)

        assert mapping_id < 128
        return mapping_id

    def add_chain(self, chain):
        """Intern `chain` (a list of item ids) and return its id (128..255)."""
        if chain in self.chains:
            chain_id = self.chains.index(chain)
        else:
            chain_id = len(self.chains)
            self.chains.append(chain)

        assert chain_id < 128
        return 128 + chain_id

    def add_locale(self, name, item_id):
        """Record that locale `name` resolves to `item_id`."""
        self.locales[name] = item_id

    def add_default(self, item_id):
        # Item used for locales with no entry of their own (the C locale).
        self.default = item_id

    def optimise_locales(self):
        # Check if all regions of a language/variant agree
        languages = list(set(remove_country(locale) for locale in self.locales))

        for language in languages:
            locales = [locale for locale in self.locales if remove_country(locale) == language]

            item_id = self.locales[locales[0]]
            if all(self.locales[locale] == item_id for locale in locales):
                # All regions agree: a single language-level entry
                # replaces the per-region ones.
                self.locales[language] = item_id
                for locale in locales:
                    del self.locales[locale]

        # Check if a variant is the same as the non-variant form
        # eg: 'de@euro' and 'de'
        for variant in list(locale for locale in self.locales if '@' in locale):
            base, _, _ = variant.partition('@')
            if base in self.locales and self.locales[base] == self.locales[variant]:
                del self.locales[variant]

        # Eliminate any entries that are just the same as the C locale
        for locale in list(self.locales):
            if self.locales[locale] == self.default:
                del self.locales[locale]

    def serialise_to_stdout(self):
        """Print the generated gtranslit-data.h tables to stdout.

        NOTE(review): the accumulator set-up lines of the original method
        were missing from this garbled chunk and are reconstructed from
        how the loop bodies use them.
        """
        src_table = ''       # concatenated multi-char source keys
        ascii_table = ''     # concatenated multi-char replacements
        mappings_table = []  # (src_range, ascii_range) pairs
        mapping_ranges = []  # (first entry index, entry count) per mapping
        chains_table = []    # 0xff-terminated item-id sequences
        chain_starts = []    # offset of each chain within chains_table
        locale_names = ''    # nul-terminated, concatenated locale names
        locale_index = []    # (name offset, item id) pairs
        max_lookup = 0       # longest source key (emitted as MAX_KEY_SIZE)
        max_localename = 0   # longest locale name (MAX_LOCALE_NAME)

        for mapping in self.mappings:
            mapping_ranges.append((len(mappings_table), len(mapping)))

            for key in sorted(mapping):
                if len(key) == 1 and ord(key[0]) < 0x8000:
                    # Single character below the range-marker bit: store
                    # the codepoint directly.
                    src_range = ord(key[0])
                else:
                    existing = src_table.find(key)
                    if existing == -1:
                        start = len(src_table)
                        assert all(ord(c) <= 0x10ffff for c in key)
                        src_table += key
                        src_range = encode_range(start, len(src_table))
                        max_lookup = max(max_lookup, len(key))
                    else:
                        # Key already present in the table: reuse its span.
                        src_range = encode_range(existing, existing + len(key))

                value = mapping[key]
                if len(value) == 1 and ord(value[0]) < 0x80:
                    ascii_range = ord(value[0])
                else:
                    existing = ascii_table.find(value)
                    if existing == -1:
                        start = len(ascii_table)
                        assert all(ord(c) < 0x80 for c in value)
                        ascii_table += value
                        ascii_range = encode_range(start, len(ascii_table))
                    else:
                        ascii_range = encode_range(existing, existing + len(value))

                mappings_table.append((src_range, ascii_range))

        for chain in self.chains:
            chain_starts.append(len(chains_table))

            # Stored reversed: lookup walks later entries first.
            for item_id in reversed(chain):
                assert item_id < 0xff
                chains_table.append(item_id)
            chains_table.append(0xff)  # terminator

        for locale in sorted(self.locales):
            max_localename = max(max_localename, len(locale))
            name_offset = len(locale_names)
            assert all(ord(c) <= 0x7f for c in locale)
            locale_names += (locale + '\0')

            item_id = self.locales[locale]

            assert name_offset < 256
            locale_index.append((name_offset, item_id))

        print('/* Generated by update-gtranslit.py */')
        print('#define MAX_KEY_SIZE', max_lookup)
        print('#define MAX_LOCALE_NAME', max_localename)
        print('static const gunichar src_table[] = {', ', '.join(str(ord(c)) for c in src_table), '};')
        # cannot do this in plain ascii because of trigraphs... :(
        print('static const gchar ascii_table[] = {', ', '.join(str(ord(c)) for c in ascii_table), '};')
        print('static const struct mapping_entry mappings_table[] =', c_pair_array(mappings_table))
        print('static const struct mapping_range mapping_ranges[] =', c_pair_array(mapping_ranges))
        print('static const guint8 chains_table[] = {', ', '.join(str(i) for i in chains_table), '};')
        print('static const guint8 chain_starts[] = {', ', '.join(str(i) for i in chain_starts), '};')
        print('static const gchar locale_names[] = "' + locale_names.replace('\0', '\\0') + '";')
        print('static const struct locale_entry locale_index[] = ', c_pair_array(locale_index))
        print('static const guint8 default_item_id = %u;' % (self.default,))
# Build a chain for every file in localedir that looks like a locale.
# NOTE(review): the `locales = []` initialiser was missing from this
# garbled chunk; it is required by the appends below.
locales = []
for name in os.listdir(localedir):
    if looks_like_locale(name):
        chain = get_chain(name)
        locales.append(chain)

serialiser = Serialiser()

for locale in locales:
    serialiser.add_locale(locale.name, locale.serialise(serialiser))

# The default item (used for the C locale and as a fallback) is the i18n
# transliterations plus the combining-character ones, chained together.
i18n = get_chain('i18n').serialise(serialiser)
combining = get_chain('translit_combining').serialise(serialiser)
serialiser.add_default(serialiser.add_chain([i18n, combining]))

serialiser.optimise_locales()