malloc/Makefile: Split and sort tests
[glibc.git] / localedata / unicode-gen / unicode_utils.py
blobb993d93c78b70726600ccbcb08ffa987990caa01
1 # Utilities to generate Unicode data for glibc from upstream Unicode data.
3 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <https://www.gnu.org/licenses/>.
20 '''
21 This module contains utilities used by the scripts to generate
22 Unicode data for glibc from upstream Unicode data files.
23 '''
25 import sys
26 import re
# Common locale header.
#
# Multi-line text constant; it starts with a newline and every
# following line starts with '%'.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file. The foregoing does not
% affect the license of the GNU C Library as a whole. It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Keys are integer code points, values are dictionaries of the
# attributes of that code point as stored by fill_attribute().
# The dictionary starts out empty and is filled by fill_attributes().
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Keys are integer code points, values are lists of property names.
# The dictionary starts out empty and is filled by
# fill_derived_core_properties().
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Keys are integer code points, values are the width property string
# read for that code point.  The dictionary starts out empty and is
# filled by fill_east_asian_widths().
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    code_point is the integer key to store under.  fields is the list
    of semicolon separated columns of the line; indexes 1 to 14 are
    read.  The three case mapping columns (12 to 14) are converted
    from hexadecimal strings to integers, or to None when they are
    empty; all other columns are stored as the strings they are.
    '''
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range is stored with the fields of the
    range, the name being reduced to the text between '<' and the
    comma.  Lines of category Cs (surrogates) are skipped.

    A line which does not have exactly 15 fields, or a "Last" line
    whose fields do not match the preceding "First" line, is reported
    on stderr and terminates the program with exit status 1.
    '''
    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        # Fields of a pending "<..., First>" line, empty if none is pending.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit() rather than the exit() helper: the latter
                # is only injected by the site module.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range have to agree.  This also catches a "Last"
                # line without a pending "First" line.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    Lines which match neither form are skipped.  The property name is
    appended to the list of every code point of the line; a code point
    seen for the first time gets a new list.
    '''
    pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    # Decode as UTF-8 explicitly so that the result does not depend on
    # the locale the script is run in.
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W # Lm YI SYLLABLE WU

    Lines which match neither form are skipped.  If a code point
    occurs on several lines, the last line wins.
    '''
    pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    # Decode as UTF-8 explicitly so that the result does not depend on
    # the locale the script is run in.
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            width = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = width
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point

    The code point itself is returned if it has no name or no
    uppercase mapping in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point

    The code point itself is returned if it has no name or no
    lowercase mapping in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point

    U+0069 is mapped to U+0130, everything else is mapped
    by to_upper().
    '''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point

    U+0049 is mapped to U+0131, everything else is mapped
    by to_lower().
    '''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point

    The code point itself is returned if it has no name or no
    titlecase mapping in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    # Anything which has a lowercase mapping is uppercase ...
    if to_lower(code_point) != code_point:
        return True
    # ... and so is anything listed as “Uppercase” in
    # DerivedCoreProperties.txt.
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic

    True for code points listed as “Alphabetic” in
    DERIVED_CORE_PROPERTIES and for all decimal digits (category Nd)
    outside of U+0030..U+0039.
    '''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category “digit”,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ten ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative of accepting every named character of category
    # “Nd” is therefore not used.  Note for that alternative:
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero. Must add <0> in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit

    Only the ten ASCII digits U+0030..U+0039 are outdigits.
    '''
    return 0x0030 <= code_point <= 0x0039
def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
def is_space(code_point):
    '''Checks whether the character with this code point is a space

    True for the six ASCII white space characters and for named
    characters of the categories Zl and Zp, and of category Zs unless
    the decomposition mentions "<noBreak>".
    '''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character

    These are the characters named '<control>' and the named
    characters of the categories Zl and Zp.
    '''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0-9, A-F and a-f are hexadecimal digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character

    These are the named characters which are neither named
    '<control>' nor spaces.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
def is_print(code_point):
    '''Checks whether the character with this code point is printable

    These are the named characters which are neither named
    '<control>' nor of the categories Zl and Zp.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['name'] != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The alternative of taking the named
    # characters whose category starts with “P” is not used.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character

    These are the combining characters whose canonical combining
    class is in the range 0..199.
    '''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with 4 uppercase hexadecimal
    digits, all others with 8, for example <U0041> and <U0001F600>.
    '''
    width = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, width)
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return '..'.join(
        [ucs_symbol(code_point_low), ucs_symbol(code_point_high)])
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Goes through all code points in UNICODE_ATTRIBUTES in ascending
    order and writes one line to stderr for every restriction which
    is violated.  Nothing is returned and a violation does not stop
    the program.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        upper = to_upper(code_point)
        lower = to_lower(code_point)
        # Evaluate every class once; only the truth values are used below.
        cased = is_lower(code_point) or is_upper(code_point)
        alpha = is_alpha(code_point)
        cntrl = is_cntrl(code_point)
        digit = is_digit(code_point)
        xdigit = is_xdigit(code_point)
        punct = is_punct(code_point)
        space = is_space(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if upper != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': upper})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if lower != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': lower})
        # Pairs of (restriction is violated, message), reported in
        # this order.
        checks = [
            # alpha restriction: "Characters classified as either upper
            # or lower shall automatically belong to this class.
            (cased and not alpha,
             '%(sym)s is upper|lower but not alpha\n'),
            # alpha restriction: “No character specified for the keywords
            # cntrl, digit, punct or space shall be specified.”
            (alpha and cntrl, '%(sym)s is alpha and cntrl\n'),
            (alpha and digit, '%(sym)s is alpha and digit\n'),
            (alpha and punct, '%(sym)s is alpha and punct\n'),
            (alpha and space, '%(sym)s is alpha and space\n'),
            # space restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, graph or xdigit shall be
            # specified.”  upper, lower, alpha already checked above.
            (space and digit, '%(sym)s is space and digit\n'),
            (space and graph, '%(sym)s is space and graph\n'),
            (space and xdigit, '%(sym)s is space and xdigit\n'),
            # cntrl restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, punct, graph, print or xdigit
            # shall be specified.”  upper, lower, alpha already checked
            # above.
            (cntrl and digit, '%(sym)s is cntrl and digit\n'),
            (cntrl and punct, '%(sym)s is cntrl and punct\n'),
            (cntrl and graph, '%(sym)s is cntrl and graph\n'),
            (cntrl and printable, '%(sym)s is cntrl and print\n'),
            (cntrl and xdigit, '%(sym)s is cntrl and xdigit\n'),
            # punct restriction: “No character specified for the keywords
            # upper, lower, alpha, digit, cntrl, xdigit or as the <space>
            # character shall be specified.”  upper, lower, alpha, cntrl
            # already checked above.
            (punct and digit, '%(sym)s is punct and digit\n'),
            (punct and xdigit, '%(sym)s is punct and xdigit\n'),
            (punct and code_point == 0x0020, '%(sym)s is punct\n'),
            # graph restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # print restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # graph - print relation: differ only in the <space> character.
            # How is this possible if there are more than one space
            # character?!  I think susv2/xbd/locale.html should speak of
            # “space characters”, not “space character”.
            (printable and not (graph or space),
             '%(sym)s is print but not graph|<space>\n'),
            (not printable and (graph or code_point == 0x0020),
             '%(sym)s is graph|<space> but not print\n'),
        ]
        for violated, message in checks:
            if violated:
                sys.stderr.write(message %{'sym': sym})