localedata/unicode-gen/unicode_utils.py

   1 # Utilities to generate Unicode data for glibc from upstream Unicode data.
   2 #
   3 # Copyright (C) 2014-2024 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <https://www.gnu.org/licenses/>.
  19
  20 '''
  21 This module contains utilities used by the scripts to generate
  22 Unicode data for glibc from upstream Unicode data files.
  23 '''
  24
  25 import sys
  26 import re
  27
  28
# Common locale header.
#
# A fixed notice stating that the FSF claims no copyright interest in
# the locale data.  Every line of the notice starts with '%', and the
# string itself starts with a newline (the text begins on the line
# after the opening quotes).  Nothing in this module uses it.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
  38
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# It starts out empty and is filled by fill_attributes(), which stores
# one entry per code point through fill_attribute().  Keys are integer
# code points.  The 'upper', 'lower' and 'title' values are integers
# (or None when there is no mapping); all other values are the strings
# read from the file.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}
  60
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# It starts out empty and is filled by fill_derived_core_properties().
# Keys are integer code points, values are lists of property names in
# the order in which they were found in the file.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
  70
# Dictionary holding the entire contents of the Unicode East Asian
# width data file (published by Unicode as EastAsianWidth.txt)
#
# It starts out empty and is filled by fill_east_asian_widths().
# Keys are integer code points, values are the width property strings
# read from the file.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
  77
  78 def fill_attribute(code_point, fields):
  79     '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
  80
  81     One entry in the UNICODE_ATTRIBUTES dictionary represents one line
  82     in the UnicodeData.txt file.
  83
  84     '''
  85     UNICODE_ATTRIBUTES[code_point] =  {
  86         'name': fields[1],          # Character name
  87         'category': fields[2],      # General category
  88         'combining': fields[3],     # Canonical combining classes
  89         'bidi': fields[4],          # Bidirectional category
  90         'decomposition': fields[5], # Character decomposition mapping
  91         'decdigit': fields[6],      # Decimal digit value
  92         'digit': fields[7],         # Digit value
  93         'numeric': fields[8],       # Numeric value
  94         'mirrored': fields[9],      # mirrored
  95         'oldname': fields[10],      # Old Unicode 1.0 name
  96         'comment': fields[11],      # comment
  97         # Uppercase mapping
  98         'upper': int(fields[12], 16) if fields[12] else None,
  99         # Lowercase mapping
 100         'lower': int(fields[13], 16) if fields[13] else None,
 101         # Titlecase mapping
 102         'title': int(fields[14], 16) if fields[14] else None,
 103     }
 104
 105 def fill_attributes(filename):
 106     '''Stores the entire contents of the UnicodeData.txt file
 107     in the UNICODE_ATTRIBUTES dictionary.
 108
 109     A typical line for a single code point in UnicodeData.txt looks
 110     like this:
 111
 112     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
 113
 114     Code point ranges are indicated by pairs of lines like this:
 115
 116     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
 117     9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
 118     '''
 119     with open(filename, mode='r') as unicode_data_file:
 120         fields_start = []
 121         for line in unicode_data_file:
 122             fields = line.strip().split(';')
 123             if len(fields) != 15:
 124                 sys.stderr.write(
 125                     'short line in file "%(f)s": %(l)s\n' %{
 126                     'f': filename, 'l': line})
 127                 exit(1)
 128             if fields[2] == 'Cs':
 129                 # Surrogates are UTF-16 artefacts,
 130                 # not real characters. Ignore them.
 131                 fields_start = []
 132                 continue
 133             if fields[1].endswith(', First>'):
 134                 fields_start = fields
 135                 fields_start[1] = fields_start[1].split(',')[0][1:]
 136                 continue
 137             if fields[1].endswith(', Last>'):
 138                 fields[1] = fields[1].split(',')[0][1:]
 139                 if fields[1:] != fields_start[1:]:
 140                     sys.stderr.write(
 141                         'broken code point range in file "%(f)s": %(l)s\n' %{
 142                             'f': filename, 'l': line})
 143                     exit(1)
 144                 for code_point in range(
 145                         int(fields_start[0], 16),
 146                         int(fields[0], 16)+1):
 147                     fill_attribute(code_point, fields)
 148                 fields_start = []
 149                 continue
 150             fill_attribute(int(fields[0], 16), fields)
 151             fields_start = []
 152
 153 def fill_derived_core_properties(filename):
 154     '''Stores the entire contents of the DerivedCoreProperties.txt file
 155     in the DERIVED_CORE_PROPERTIES dictionary.
 156
 157     Lines in DerivedCoreProperties.txt are either a code point range like
 158     this:
 159
 160     0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
 161
 162     or a single code point like this:
 163
 164     00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
 165
 166     '''
 167     with open(filename, mode='r') as derived_core_properties_file:
 168         for line in derived_core_properties_file:
 169             match = re.match(
 170                 r'^(?P<codepoint1>[0-9A-F]{4,6})'
 171                 + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
 172                 + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
 173                 line)
 174             if not match:
 175                 continue
 176             start = match.group('codepoint1')
 177             end = match.group('codepoint2')
 178             if not end:
 179                 end = start
 180             for code_point in range(int(start, 16), int(end, 16)+1):
 181                 prop = match.group('property')
 182                 if code_point in DERIVED_CORE_PROPERTIES:
 183                     DERIVED_CORE_PROPERTIES[code_point].append(prop)
 184                 else:
 185                     DERIVED_CORE_PROPERTIES[code_point] = [prop]
 186
 187 def fill_east_asian_widths(filename):
 188     '''Stores the entire contents of the EastAsianWidths.txt file
 189     in the EAST_ASIAN_WIDTHS dictionary.
 190
 191     Lines in EastAsianWidths.txt are either a code point range like
 192     this:
 193
 194     9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
 195
 196     or a single code point like this:
 197
 198     A015;W           # Lm         YI SYLLABLE WU
 199     '''
 200     with open(filename, mode='r') as east_asian_widths_file:
 201         for line in east_asian_widths_file:
 202             match = re.match(
 203                 r'^(?P<codepoint1>[0-9A-F]{4,6})'
 204                 +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
 205                 +r'\s*;\s*(?P<property>[a-zA-Z]+)',
 206                 line)
 207             if not match:
 208                 continue
 209             start = match.group('codepoint1')
 210             end = match.group('codepoint2')
 211             if not end:
 212                 end = start
 213             for code_point in range(int(start, 16), int(end, 16)+1):
 214                 EAST_ASIAN_WIDTHS[code_point] = match.group('property')
 215
 216 def to_upper(code_point):
 217     '''Returns the code point of the uppercase version
 218     of the given code point'''
 219     if (UNICODE_ATTRIBUTES[code_point]['name']
 220         and UNICODE_ATTRIBUTES[code_point]['upper']):
 221         return UNICODE_ATTRIBUTES[code_point]['upper']
 222     else:
 223         return code_point
 224
 225 def to_lower(code_point):
 226     '''Returns the code point of the lowercase version
 227     of the given code point'''
 228     if (UNICODE_ATTRIBUTES[code_point]['name']
 229         and UNICODE_ATTRIBUTES[code_point]['lower']):
 230         return UNICODE_ATTRIBUTES[code_point]['lower']
 231     else:
 232         return code_point
 233
 234 def to_upper_turkish(code_point):
 235     '''Returns the code point of the Turkish uppercase version
 236     of the given code point'''
 237     if code_point == 0x0069:
 238         return 0x0130
 239     return to_upper(code_point)
 240
 241 def to_lower_turkish(code_point):
 242     '''Returns the code point of the Turkish lowercase version
 243     of the given code point'''
 244     if code_point == 0x0049:
 245         return 0x0131
 246     return to_lower(code_point)
 247
 248 def to_title(code_point):
 249     '''Returns the code point of the titlecase version
 250     of the given code point'''
 251     if (UNICODE_ATTRIBUTES[code_point]['name']
 252         and UNICODE_ATTRIBUTES[code_point]['title']):
 253         return UNICODE_ATTRIBUTES[code_point]['title']
 254     else:
 255         return code_point
 256
 257 def is_upper(code_point):
 258     '''Checks whether the character with this code point is uppercase'''
 259     return (to_lower(code_point) != code_point
 260             or (code_point in DERIVED_CORE_PROPERTIES
 261                 and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
 262
 263 def is_lower(code_point):
 264     '''Checks whether the character with this code point is lowercase'''
 265     # Some characters are defined as “Lowercase” in
 266     # DerivedCoreProperties.txt but do not have a mapping to upper
 267     # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
 268     # one of these.
 269     return (to_upper(code_point) != code_point
 270             # <U00DF> is lowercase, but without simple to_upper mapping.
 271             or code_point == 0x00DF
 272             or (code_point in DERIVED_CORE_PROPERTIES
 273                 and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
 274
 275 def is_alpha(code_point):
 276     '''Checks whether the character with this code point is alphabetic'''
 277     return ((code_point in DERIVED_CORE_PROPERTIES
 278              and
 279              'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
 280             or
 281             # Consider all the non-ASCII digits as alphabetic.
 282             # ISO C 99 forbids us to have them in category “digit”,
 283             # but we want iswalnum to return true on them.
 284             (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
 285              and not (code_point >= 0x0030 and code_point <= 0x0039)))
 286
 287 def is_digit(code_point):
 288     '''Checks whether the character with this code point is a digit'''
 289     if False:
 290         return (UNICODE_ATTRIBUTES[code_point]['name']
 291                 and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
 292         # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
 293         # a zero.  Must add <0> in front of them by hand.
 294     else:
 295         # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
 296         # takes it away:
 297         # 7.25.2.1.5:
 298         #    The iswdigit function tests for any wide character that
 299         #    corresponds to a decimal-digit character (as defined in 5.2.1).
 300         # 5.2.1:
 301         #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
 302         return (code_point >= 0x0030 and code_point <= 0x0039)
 303
 304 def is_outdigit(code_point):
 305     '''Checks whether the character with this code point is outdigit'''
 306     return (code_point >= 0x0030 and code_point <= 0x0039)
 307
 308 def is_blank(code_point):
 309     '''Checks whether the character with this code point is blank'''
 310     return (code_point == 0x0009 # '\t'
 311             # Category Zs without mention of '<noBreak>'
 312             or (UNICODE_ATTRIBUTES[code_point]['name']
 313                 and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
 314                 and '<noBreak>' not in
 315                 UNICODE_ATTRIBUTES[code_point]['decomposition']))
 316
 317 def is_space(code_point):
 318     '''Checks whether the character with this code point is a space'''
 319     # Don’t make U+00A0 a space. Non-breaking space means that all programs
 320     # should treat it like a punctuation character, not like a space.
 321     return (code_point == 0x0020 # ' '
 322             or code_point == 0x000C # '\f'
 323             or code_point == 0x000A # '\n'
 324             or code_point == 0x000D # '\r'
 325             or code_point == 0x0009 # '\t'
 326             or code_point == 0x000B # '\v'
 327             # Categories Zl, Zp, and Zs without mention of "<noBreak>"
 328             or (UNICODE_ATTRIBUTES[code_point]['name']
 329                 and
 330                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
 331                  or
 332                  (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
 333                   and
 334                   '<noBreak>' not in
 335                   UNICODE_ATTRIBUTES[code_point]['decomposition']))))
 336
 337 def is_cntrl(code_point):
 338     '''Checks whether the character with this code point is
 339     a control character'''
 340     return (UNICODE_ATTRIBUTES[code_point]['name']
 341             and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
 342                  or
 343                  UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
 344
 345 def is_xdigit(code_point):
 346     '''Checks whether the character with this code point is
 347     a hexadecimal digit'''
 348     if False:
 349         return (is_digit(code_point)
 350                 or (code_point >= 0x0041 and code_point <= 0x0046)
 351                 or (code_point >= 0x0061 and code_point <= 0x0066))
 352     else:
 353         # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
 354         # takes it away:
 355         # 7.25.2.1.12:
 356         #    The iswxdigit function tests for any wide character that
 357         #    corresponds to a hexadecimal-digit character (as defined
 358         #    in 6.4.4.1).
 359         # 6.4.4.1:
 360         #    hexadecimal-digit: one of
 361         #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
 362         return ((code_point >= 0x0030 and code_point  <= 0x0039)
 363                 or (code_point >= 0x0041 and code_point <= 0x0046)
 364                 or (code_point >= 0x0061 and code_point <= 0x0066))
 365
 366 def is_graph(code_point):
 367     '''Checks whether the character with this code point is
 368     a graphical character'''
 369     return (UNICODE_ATTRIBUTES[code_point]['name']
 370             and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
 371             and not is_space(code_point))
 372
 373 def is_print(code_point):
 374     '''Checks whether the character with this code point is printable'''
 375     return (UNICODE_ATTRIBUTES[code_point]['name']
 376             and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
 377             and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
 378
 379 def is_punct(code_point):
 380     '''Checks whether the character with this code point is punctuation'''
 381     if False:
 382         return (UNICODE_ATTRIBUTES[code_point]['name']
 383                 and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
 384     else:
 385         # The traditional POSIX definition of punctuation is every graphic,
 386         # non-alphanumeric character.
 387         return (is_graph(code_point)
 388                 and not is_alpha(code_point)
 389                 and not is_digit(code_point))
 390
 391 def is_combining(code_point):
 392     '''Checks whether the character with this code point is
 393     a combining character'''
 394     # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
 395     # file. In 3.0.1 it was identical to the union of the general categories
 396     # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
 397     # PropList.txt file, so we take the latter definition.
 398     return (UNICODE_ATTRIBUTES[code_point]['name']
 399             and
 400             UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
 401
 402 def is_combining_level3(code_point):
 403     '''Checks whether the character with this code point is
 404     a combining level3 character'''
 405     return (is_combining(code_point)
 406             and
 407             int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
 408
 409 def ucs_symbol(code_point):
 410     '''Return the UCS symbol string for a Unicode character.'''
 411     if code_point < 0x10000:
 412         return '<U{:04X}>'.format(code_point)
 413     else:
 414         return '<U{:08X}>'.format(code_point)
 415
 416 def ucs_symbol_range(code_point_low, code_point_high):
 417     '''Returns a string UCS symbol string for a code point range.
 418
 419     Example:
 420
 421     <U0041>..<U005A>
 422     '''
 423     return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
 424
 425 def verifications():
 426     '''Tests whether the is_* functions observe the known restrictions'''
 427     for code_point in sorted(UNICODE_ATTRIBUTES):
 428         # toupper restriction: "Only characters specified for the keywords
 429         # lower and upper shall be specified.
 430         if (to_upper(code_point) != code_point
 431             and not (is_lower(code_point) or is_upper(code_point))):
 432             sys.stderr.write(
 433                 ('%(sym)s is not upper|lower '
 434                  + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
 435                     'sym': ucs_symbol(code_point),
 436                     'c': code_point,
 437                     'uc': to_upper(code_point)})
 438         # tolower restriction: "Only characters specified for the keywords
 439         # lower and upper shall be specified.
 440         if (to_lower(code_point) != code_point
 441             and not (is_lower(code_point) or is_upper(code_point))):
 442             sys.stderr.write(
 443                 ('%(sym)s is not upper|lower '
 444                  + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
 445                     'sym': ucs_symbol(code_point),
 446                     'c': code_point,
 447                     'uc': to_lower(code_point)})
 448         # alpha restriction: "Characters classified as either upper or lower
 449         # shall automatically belong to this class.
 450         if ((is_lower(code_point) or is_upper(code_point))
 451              and not is_alpha(code_point)):
 452             sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
 453                 'sym': ucs_symbol(code_point)})
 454         # alpha restriction: “No character specified for the keywords cntrl,
 455         # digit, punct or space shall be specified.”
 456         if (is_alpha(code_point) and is_cntrl(code_point)):
 457             sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
 458                 'sym': ucs_symbol(code_point)})
 459         if (is_alpha(code_point) and is_digit(code_point)):
 460             sys.stderr.write('%(sym)s is alpha and digit\n' %{
 461                 'sym': ucs_symbol(code_point)})
 462         if (is_alpha(code_point) and is_punct(code_point)):
 463             sys.stderr.write('%(sym)s is alpha and punct\n' %{
 464                 'sym': ucs_symbol(code_point)})
 465         if (is_alpha(code_point) and is_space(code_point)):
 466             sys.stderr.write('%(sym)s is alpha and space\n' %{
 467                 'sym': ucs_symbol(code_point)})
 468         # space restriction: “No character specified for the keywords upper,
 469         # lower, alpha, digit, graph or xdigit shall be specified.”
 470         # upper, lower, alpha already checked above.
 471         if (is_space(code_point) and is_digit(code_point)):
 472             sys.stderr.write('%(sym)s is space and digit\n' %{
 473                 'sym': ucs_symbol(code_point)})
 474         if (is_space(code_point) and is_graph(code_point)):
 475             sys.stderr.write('%(sym)s is space and graph\n' %{
 476                 'sym': ucs_symbol(code_point)})
 477         if (is_space(code_point) and is_xdigit(code_point)):
 478             sys.stderr.write('%(sym)s is space and xdigit\n' %{
 479                 'sym': ucs_symbol(code_point)})
 480         # cntrl restriction: “No character specified for the keywords upper,
 481         # lower, alpha, digit, punct, graph, print or xdigit shall be
 482         # specified.”  upper, lower, alpha already checked above.
 483         if (is_cntrl(code_point) and is_digit(code_point)):
 484             sys.stderr.write('%(sym)s is cntrl and digit\n' %{
 485                 'sym': ucs_symbol(code_point)})
 486         if (is_cntrl(code_point) and is_punct(code_point)):
 487             sys.stderr.write('%(sym)s is cntrl and punct\n' %{
 488                 'sym': ucs_symbol(code_point)})
 489         if (is_cntrl(code_point) and is_graph(code_point)):
 490             sys.stderr.write('%(sym)s is cntrl and graph\n' %{
 491                 'sym': ucs_symbol(code_point)})
 492         if (is_cntrl(code_point) and is_print(code_point)):
 493             sys.stderr.write('%(sym)s is cntrl and print\n' %{
 494                 'sym': ucs_symbol(code_point)})
 495         if (is_cntrl(code_point) and is_xdigit(code_point)):
 496             sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
 497                 'sym': ucs_symbol(code_point)})
 498         # punct restriction: “No character specified for the keywords upper,
 499         # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 500         # be specified.”  upper, lower, alpha, cntrl already checked above.
 501         if (is_punct(code_point) and is_digit(code_point)):
 502             sys.stderr.write('%(sym)s is punct and digit\n' %{
 503                 'sym': ucs_symbol(code_point)})
 504         if (is_punct(code_point) and is_xdigit(code_point)):
 505             sys.stderr.write('%(sym)s is punct and xdigit\n' %{
 506                 'sym': ucs_symbol(code_point)})
 507         if (is_punct(code_point) and code_point == 0x0020):
 508             sys.stderr.write('%(sym)s is punct\n' %{
 509                 'sym': ucs_symbol(code_point)})
 510         # graph restriction: “No character specified for the keyword cntrl
 511         # shall be specified.”  Already checked above.
 512
 513         # print restriction: “No character specified for the keyword cntrl
 514         # shall be specified.”  Already checked above.
 515
 516         # graph - print relation: differ only in the <space> character.
 517         # How is this possible if there are more than one space character?!
 518         # I think susv2/xbd/locale.html should speak of “space characters”,
 519         # not “space character”.
 520         if (is_print(code_point)
 521             and not (is_graph(code_point) or is_space(code_point))):
 522             sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
 523                 'sym': unicode_utils.ucs_symbol(code_point)})
 524         if (not is_print(code_point)
 525             and (is_graph(code_point) or code_point == 0x0020)):
 526             sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
 527                 'sym': unicode_utils.ucs_symbol(code_point)})