#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_combining file from a UnicodeData file.
# Copyright (C) 2015-2024 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a translit_combining file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_combining -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_combining file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        for line in translit_file:
            tail = tail + line
    return (head, tail)
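
# Illustrative sketch (not part of the original script): for an input file
# whose lines are “header”, “translit_start”, “rules”, “translit_end”,
# “footer”, read_input_file() returns
#
#     head == 'header\ntranslit_start\n'
#     tail == 'translit_end\nfooter\n'
#
# i.e. everything except the generated rules between the two markers.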

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations that remove all ')
        translit_file.write('combining characters (accents,\n')
        translit_file.write('% pronunciation marks, etc.).\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_combining.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file'''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def is_combining_remove(code_point):
    '''Check whether this is a combining character which should be listed
    in the section of the translit_combining file where combining
    characters are replaced by empty strings.

    We ignore combining characters from many scripts here because
    the original translit_combining file didn’t do this for the
    combining characters from these scripts either and I am not
    sure yet whether this would be useful to do for all combining
    characters or not. For the moment I think it is better to keep
    close to the spirit of the original file.
    '''
    if not unicode_utils.is_combining(code_point):
        return False
    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
    for substring in ('DEVANAGARI',
                      # … many script names elided in this excerpt …
                      'VARIATION SELECTOR',
                      # … more name substrings elided in this excerpt …
                      ):
        if substring in name:
            return False
    return True
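
# Illustrative examples (not in the original script): assuming
# unicode_utils.fill_attributes() has already loaded UnicodeData.txt,
#
#     is_combining_remove(0x0301)  # U+0301 COMBINING ACUTE ACCENT → True
#     is_combining_remove(0x093C)  # U+093C DEVANAGARI SIGN NUKTA → False
#                                  # (combining, but from an excluded script)
#     is_combining_remove(0x0041)  # U+0041 LATIN CAPITAL LETTER A → False
#                                  # (not a combining character at all)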

def canonical_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    In some instances a canonical mapping or a compatibility mapping
    may consist of a single character. For a canonical mapping, this
    indicates that the character is a canonical equivalent of another
    single character. For a compatibility mapping, this indicates that
    the character is a compatibility equivalent of another single
    character.

    A canonical mapping may also consist of a pair of characters, but
    is never longer than two characters. When a canonical mapping
    consists of a pair of characters, the first character may itself
    be a character with a decomposition mapping, but the second
    character never has a decomposition mapping.

    We ignore the canonical decomposition for code points
    matching certain substrings because the original translit_combining
    file didn’t include these types of characters either. I am unsure
    about the usefulness of including them and want to keep close
    to the spirit of the original file for the moment.
    '''
    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
    for substring in ('MUSICAL SYMBOL',
                      'CJK COMPATIBILITY IDEOGRAPH',
                      # … more name substrings elided in this excerpt …
                      ):
        if substring in name:
            return []
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    if decomposition and not decomposition.startswith('<'):
        decomposed_code_points = [int(x, 16)
                                  for x in decomposition.split(' ')]
        if decomposed_code_points:
            cd0 = canonical_decompose(decomposed_code_points[0])
            if cd0:
                decomposed_code_points = cd0 + decomposed_code_points[1:]
        return decomposed_code_points
    else:
        return []
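
# Illustrative example (not in the original script): the recursion on the
# first code point flattens nested canonical mappings. U+01FB LATIN SMALL
# LETTER A WITH RING ABOVE AND ACUTE maps canonically to U+00E5 U+0301,
# and U+00E5 in turn maps to U+0061 U+030A, so:
#
#     canonical_decompose(0x01FB)  # → [0x0061, 0x030A, 0x0301]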

def special_decompose(code_point_list):
    '''
    Decompositions which are not canonical or which are not in
    UnicodeData.txt at all. Some of these were used in the original
    glibc translit_combining file and they seemed to make sense.
    I want to keep the update of translit_combining close to the
    spirit of the original file, therefore I added these special
    decomposition rules here.
    '''
    special_decompose_dict = {
        # Ø U+00D8 is already handled in translit_neutral. But
        # translit_combining is usually included after translit_neutral
        # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
        # has a canonical decomposition to Ø U+00D8 and we want to
        # further decompose this to U+004F.
        (0x00D8,): [0x004F], # Ø → O
        # ø U+00F8 is already handled in translit_neutral. But
        # translit_combining is usually included after translit_neutral
        # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
        # has a canonical decomposition to ø U+00F8 and we want to
        # further decompose this to U+006F.
        (0x00F8,): [0x006F], # ø → o
        # æ U+00E6 is already in translit_compat because ligatures
        # are handled in translit_compat. But ǣ U+01E3 has a
        # canonical decomposition to U+00E6, U+0304 and we want to
        # further decompose this to “ae”.
        (0x00E6,): [0x0061, 0x0065], # æ → ae
        # Æ U+00C6 is already in translit_compat because ligatures
        # are handled in translit_compat. But Ǣ U+01E2 has a
        # canonical decomposition to U+00C6, U+0304 and we want to
        # further decompose this to “AE”.
        (0x00C6,): [0x0041, 0x0045], # Æ → AE
        # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
        # translit_compat because ligatures are handled in translit_compat.
        # But U+FB1F has a canonical decomposition to U+05F2 and
        # we want to further decompose this to U+05D9, U+05D9.
        (0x05F2,): [0x05D9, 0x05D9], # ײ → יי
        # U+2002 has a <compat> decomposition to U+0020 in UnicodeData.txt.
        # But U+2000 EN QUAD has a canonical decomposition to U+2002
        # and we want to further decompose this to U+0020.
        (0x2002,): [0x0020], # EN SPACE → SPACE
        # U+2003 has a <compat> decomposition to U+0020 in UnicodeData.txt.
        # But U+2001 EM QUAD has a canonical decomposition to U+2003
        # and we want to further decompose this to U+0020.
        (0x2003,): [0x0020], # EM SPACE → SPACE
        # U+2260 ≠ has the canonical decomposition U+003D U+0338
        # (= followed by ̸). After stripping the combining characters,
        # the result is only = which reverses the meaning.
        # Therefore, we add special rules here for such negated
        # mathematical symbols:
        (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
        (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
        (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
        (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
        (0x2204,): [0x0021, 0x2203], # ∄ → !∃
        (0x2209,): [0x0021, 0x2208], # ∉ → !∈
        (0x220C,): [0x0021, 0x220B], # ∌ → !∋
        (0x2224,): [0x0021, 0x2223], # ∤ → !∣
        (0x2226,): [0x0021, 0x2225], # ∦ → !∥
        (0x2241,): [0x0021, 0x007E], # ≁ → !~
        (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
        (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
        (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
        (0x2260,): [0x0021, 0x003D], # ≠ → !=
        (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
        (0x226D,): [0x0021, 0x224D], # ≭ → !≍
        (0x226E,): [0x0021, 0x003C], # ≮ → !<
        (0x226F,): [0x0021, 0x003E], # ≯ → !>
        (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
        (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
        (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
        (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
        (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
        (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
        (0x2280,): [0x0021, 0x227A], # ⊀ → !≺
        (0x2281,): [0x0021, 0x227B], # ⊁ → !≻
        (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
        (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
        (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
        (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
        (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
        (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
        (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
        (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
        (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
        (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
        (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
        (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
        (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
        (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
        (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
        (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
        (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
        # Special rule for 〈 U+3008 is added
        # because 〈 U+2329 has the canonical decomposition U+3008
        # and we want to further decompose this to < U+003C.
        (0x3008,): [0x003C], # 〈 → <
        # Special rule for 〉 U+3009 is added
        # because 〉 U+232A has the canonical decomposition U+3009
        # and we want to further decompose this to > U+003E.
        (0x3009,): [0x003E], # 〉 → >
    }
    if tuple(code_point_list) in special_decompose_dict:
        return special_decompose_dict[tuple(code_point_list)]
    else:
        return code_point_list
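
# Illustrative example (not in the original script): the table is keyed on
# tuples, so a whole sequence can be replaced as well as a single code point:
#
#     special_decompose([0x2260])  # ≠ → [0x0021, 0x003D], i.e. “!=”
#     special_decompose([0x0041])  # no rule → returned unchanged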

def output_combining_remove(translit_file):
    '''Write the section of the translit_combining file where combining
    characters are replaced by empty strings.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        if is_combining_remove(code_point):
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} ""\n'.format(
                unicode_utils.ucs_symbol(code_point)))
    translit_file.write('\n')
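
# Sketch of the generated section (illustrative, assuming
# unicode_utils.ucs_symbol() renders code points in <Uxxxx> notation):
#
#     % COMBINING ACUTE ACCENT
#     <U0301> ""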

def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where characters
    are decomposed and combining characters are stripped from
    the decompositions.
    '''
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if special_decompose([code_point]) != [code_point]:
            decomposed_code_points = [special_decompose([code_point])]
        else:
            decomposed_code_points = [canonical_decompose(code_point)]
        if decomposed_code_points[0]:
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
            for index in range(0, len(decomposed_code_points)):
                decomposed_code_points[index] = [
                    x for x in decomposed_code_points[index]
                    if not is_combining_remove(x)]
        if decomposed_code_points[0]:
            translit_file.write('% {:s}\n'.format(
                unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
            translit_file.write('\n')
    translit_file.write('\n')
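
# Sketch of the resulting entries (illustrative): fallback alternatives are
# separated by “;” and multi-character replacements are quoted. For example,
# Ǿ U+01FE first loses only its combining acute, then the special rule
# Ø → O applies, and Ǣ U+01E2 gets the two-character replacement “AE”:
#
#     % LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
#     <U01FE> <U00D8>;<U004F>
#     % LATIN CAPITAL LETTER AE WITH MACRON
#     <U01E2> <U00C6>;"<U0041><U0045>"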

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    output_combining_remove(translit_file)
    output_decompositions(translit_file)

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_combining file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/translit_combining
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_combining.new',
        help='''The new translit_combining file, default: %(default)s. If the
        original glibc/localedata/locales/translit_combining file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
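
# Typical invocation (illustrative; the file names and the version number
# are examples, not mandated by the script):
#
#     $ ./gen_translit_combining.py -u UnicodeData.txt \
#           -i translit_combining -o translit_combining.new \
#           --unicode_version 15.0.0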