# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2023 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.
'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''
29 # Common locale header.
31 % This file is part of the GNU C Library and contains locale data.
32 % The Free Software Foundation does not claim any copyright interest
33 % in the locale data contained in this file. The foregoing does not
34 % affect the license of the GNU C Library as a whole. It does not
35 % exempt you from the conditions of the license if your use would
36 % otherwise be governed by that license.
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Keys are integer code points, values are dictionaries of attributes.
# Contents of this dictionary look like this (abridged):
#
# {0: {'category': 'Cc',
#      'name': '<control>',
#      'decomposition': '',
#      …},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Keys are integer code points, values are lists of property names.
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Keys are integer code points, values are the width property strings.
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    code_point: the integer used as key in UNICODE_ATTRIBUTES.
    fields:     the semicolon separated fields of the line; the indexes
                1 to 14 are read, fields[0] is not stored.

    An existing entry for code_point is replaced.
    '''
    def case_mapping(hex_digits):
        # The case mapping fields hold a hexadecimal code point or are
        # empty; an empty field is stored as None.
        return int(hex_digits, 16) if hex_digits else None

    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        'upper': case_mapping(fields[12]),  # Uppercase mapping
        'lower': case_mapping(fields[13]),  # Lowercase mapping
        'title': case_mapping(fields[14]),  # Titlecase mapping
    }
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range (both ends included) gets the
    attributes of the “Last” line, with the name reduced to the part
    before the comma.  Lines with the general category “Cs” are skipped.

    A line which does not have 15 fields, or a “Last” line which does
    not match the preceding “First” line, is reported on stderr.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of the pending “…, First>” line; empty outside a range.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # NOTE(review): the statement following this message was
                # not visible in the reviewed text; aborting with exit
                # status 1 is assumed -- confirm.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                # '<CJK Ideograph, Last>' -> 'CJK Ideograph'
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself both lines of a range
                # have to agree.  This also catches a “Last” line which
                # has no “First” line in front of it.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    # NOTE(review): abort assumed, see above -- confirm.
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    The property name is appended to the list stored for every code
    point of the line.  Lines which have neither of these two forms
    are skipped.
    '''
    # Local import, so that the function does not depend on a module
    # level import which is not visible in the reviewed text.
    import re
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(code_point, []).append(prop)
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    The width property is stored for every code point of the line,
    replacing an earlier value.  Lines which have neither of these two
    forms are skipped.
    '''
    # Local import, so that the function does not depend on a module
    # level import which is not visible in the reviewed text.
    import re
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            width = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = width
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point

    The given code point is returned unchanged when its entry in
    UNICODE_ATTRIBUTES has no name or no uppercase mapping.  A code
    point without an entry raises KeyError.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point

    The given code point is returned unchanged when its entry in
    UNICODE_ATTRIBUTES has no name or no lowercase mapping.  A code
    point without an entry raises KeyError.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point

    Only U+0069 is treated specially, every other code point gets the
    result of to_upper().
    '''
    if code_point == 0x0069:
        # NOTE(review): the value returned for U+0069 was not visible in
        # the reviewed text; U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE,
        # the Turkish uppercase of “i”, is assumed -- confirm.
        return 0x0130
    return to_upper(code_point)
def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point

    Only U+0049 is treated specially, every other code point gets the
    result of to_lower().
    '''
    if code_point == 0x0049:
        # NOTE(review): the value returned for U+0049 was not visible in
        # the reviewed text; U+0131 LATIN SMALL LETTER DOTLESS I, the
        # Turkish lowercase of “I”, is assumed -- confirm.
        return 0x0131
    return to_lower(code_point)
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point

    The given code point is returned unchanged when its entry in
    UNICODE_ATTRIBUTES has no name or no titlecase mapping.  A code
    point without an entry raises KeyError.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase

    A character counts as uppercase when to_lower() maps it to a
    different code point or when DERIVED_CORE_PROPERTIES lists the
    property “Uppercase” for it.
    '''
    if to_lower(code_point) != code_point:
        return True
    return (code_point in DERIVED_CORE_PROPERTIES
            and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase

    A character counts as lowercase when to_upper() maps it to a
    different code point, when it is U+00DF, or when
    DERIVED_CORE_PROPERTIES lists the property “Lowercase” for it.
    '''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A730 “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return (code_point in DERIVED_CORE_PROPERTIES
            and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic

    True for code points which DERIVED_CORE_PROPERTIES lists as
    “Alphabetic” and for all decimal digits (category “Nd”) outside of
    U+0030..U+0039.  Unless the first condition holds, the code point
    needs an entry in UNICODE_ATTRIBUTES (KeyError otherwise).
    '''
    if (code_point in DERIVED_CORE_PROPERTIES
            and 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # The iswdigit function tests for any wide character that
    # corresponds to a decimal-digit character (as defined in 5.2.1).
    # These are:
    #       the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative of accepting every named character of the general
    # category “Nd” is therefore not used.  It would also need manual
    # work: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero, <0> would have to be added in front of them by hand.
    #
    # NOTE(review): the reviewed text showed both variants as two
    # consecutive return statements with the lines selecting one of
    # them missing; the ISO C 99 variant is taken to be the live one
    # -- confirm.
    return 0x0030 <= code_point <= 0x0039
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit

    Only the ASCII digits U+0030..U+0039 are outdigits.
    '''
    return 0x0030 <= code_point <= 0x0039
def is_blank(code_point):
    '''Checks whether the character with this code point is blank

    U+0009 is always blank.  Any other code point needs an entry in
    UNICODE_ATTRIBUTES (KeyError otherwise) and is blank when it has a
    name, the category “Zs” and no “<noBreak>” in its decomposition.
    The result is only meant to be used as a truth value.
    '''
    if code_point == 0x0009: # '\t'
        return True
    # Category Zs without mention of '<noBreak>'
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
def is_space(code_point):
    '''Checks whether the character with this code point is a space

    The six ASCII white space characters are always spaces.  Any other
    code point needs an entry in UNICODE_ATTRIBUTES (KeyError otherwise)
    and is a space when it has a name and is either of category “Zl” or
    “Zp”, or of category “Zs” without “<noBreak>” in its decomposition.
    The result is only meant to be used as a truth value.
    '''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    if code_point in (0x0020,  # ' '
                      0x000C,  # '\f'
                      0x000A,  # '\n'
                      0x000D,  # '\r'
                      0x0009,  # '\t'
                      0x000B): # '\v'
        return True
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and (attributes['category'] in ['Zl', 'Zp']
                 or (attributes['category'] in ['Zs']
                     and '<noBreak>' not in attributes['decomposition'])))
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character

    That is a character named “<control>” or a named character of
    category “Zl” or “Zp”.  The code point needs an entry in
    UNICODE_ATTRIBUTES (KeyError otherwise).  The result is only meant
    to be used as a truth value.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and (attributes['name'] == '<control>'
                 or attributes['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # The iswxdigit function tests for any wide character that
    # corresponds to a hexadecimal-digit character (as defined
    # in 6.4.4.1).
    # These are:
    #         hexadecimal-digit: one of
    #         0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    #
    # NOTE(review): the reviewed text additionally showed a variant
    # built on is_digit() as a preceding return statement, with the
    # lines selecting one of the two missing; the variant below is taken
    # to be the live one -- confirm.
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character

    That is a named character which is neither named “<control>” nor a
    space according to is_space().  The code point needs an entry in
    UNICODE_ATTRIBUTES (KeyError otherwise).  The result is only meant
    to be used as a truth value.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
def is_print(code_point):
    '''Checks whether the character with this code point is printable

    That is a named character which is not named “<control>” and not of
    category “Zl” or “Zp”.  The code point needs an entry in
    UNICODE_ATTRIBUTES (KeyError otherwise).  The result is only meant
    to be used as a truth value.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['name'] != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    Punctuation is everything for which is_graph() holds while
    is_alpha() and is_digit() do not.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.
    #
    # NOTE(review): the reviewed text additionally showed a variant
    # accepting the named characters whose general category starts with
    # “P” as a preceding return statement, with the lines selecting one
    # of the two missing; the POSIX variant is taken to be the live one
    # -- confirm.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character

    That is a named character of category “Mn”, “Mc” or “Me”.  The
    code point needs an entry in UNICODE_ATTRIBUTES (KeyError
    otherwise).  The result is only meant to be used as a truth value.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character

    That is a character for which is_combining() holds and whose
    canonical combining class is in the range 0 to 199.  The result
    is only meant to be used as a truth value.
    '''
    return (is_combining(code_point)
            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four uppercase
    hexadecimal digits, all others with eight, for example <U0041>
    and <U0001F600>.
    '''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    return '<U{:08X}>'.format(code_point)
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    The two UCS symbols are joined by “..”.

    Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
# NOTE(review): the “def” line of this function was missing from the
# reviewed text; the name “verifications” and the empty parameter list
# are assumed -- confirm against the callers.
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Goes through all code points in UNICODE_ATTRIBUTES in ascending
    order and writes one line to stderr for every restriction which is
    violated.  Nothing is returned.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # The symbol is needed by every message; the helper of this
        # module is called directly because the name “unicode_utils” is
        # not defined in here.
        sym = ucs_symbol(code_point)
        upper_or_lower = is_lower(code_point) or is_upper(code_point)
        alpha = is_alpha(code_point)
        digit = is_digit(code_point)
        xdigit = is_xdigit(code_point)
        space = is_space(code_point)
        cntrl = is_cntrl(code_point)
        punct = is_punct(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)

        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if to_upper(code_point) != code_point and not upper_or_lower:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if to_lower(code_point) != code_point and not upper_or_lower:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': to_lower(code_point)})

        # The remaining checks only differ in condition and message.
        # They are listed in the order in which they are reported.
        checks = [
            # alpha restriction: "Characters classified as either upper
            # or lower shall automatically belong to this class.
            (upper_or_lower and not alpha,
             '%(sym)s is upper|lower but not alpha\n'),
            # alpha restriction: “No character specified for the keywords
            # cntrl, digit, punct or space shall be specified.”
            (alpha and cntrl, '%(sym)s is alpha and cntrl\n'),
            (alpha and digit, '%(sym)s is alpha and digit\n'),
            (alpha and punct, '%(sym)s is alpha and punct\n'),
            (alpha and space, '%(sym)s is alpha and space\n'),
            # space restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, graph or xdigit shall be
            # specified.”  upper, lower, alpha already checked above.
            (space and digit, '%(sym)s is space and digit\n'),
            (space and graph, '%(sym)s is space and graph\n'),
            (space and xdigit, '%(sym)s is space and xdigit\n'),
            # cntrl restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, punct, graph, print or xdigit
            # shall be specified.”  upper, lower, alpha already checked
            # above.
            (cntrl and digit, '%(sym)s is cntrl and digit\n'),
            (cntrl and punct, '%(sym)s is cntrl and punct\n'),
            (cntrl and graph, '%(sym)s is cntrl and graph\n'),
            (cntrl and printable, '%(sym)s is cntrl and print\n'),
            (cntrl and xdigit, '%(sym)s is cntrl and xdigit\n'),
            # punct restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, cntrl, xdigit or as the <space>
            # character shall be specified.”  upper, lower, alpha, cntrl
            # already checked above.
            (punct and digit, '%(sym)s is punct and digit\n'),
            (punct and xdigit, '%(sym)s is punct and xdigit\n'),
            (punct and code_point == 0x0020, '%(sym)s is punct\n'),
            # graph restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # print restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # graph - print relation: differ only in the <space> character.
            # How is this possible if there are more than one space
            # character?!  I think susv2/xbd/locale.html should speak of
            # “space characters”, not “space character”.
            (printable and not (graph or space),
             '%(sym)s is print but not graph|<space>\n'),
            (not printable and (graph or code_point == 0x0020),
             '%(sym)s is graph|<space> but not print\n'),
        ]
        for violated, message in checks:
            if violated:
                sys.stderr.write(message %{'sym': sym})