localedata/unicode-gen/unicode_utils.py

   1 # Utilities to generate Unicode data for glibc from upstream Unicode data.
   2 #
   3 # Copyright (C) 2014-2016 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <http://www.gnu.org/licenses/>.
  19
  20 '''
  21 This module contains utilities used by the scripts to generate
  22 Unicode data for glibc from upstream Unicode data files.
  23 '''
  24
  25 import sys
  26 import re
  27
# Dictionary holding the entire contents of the UnicodeData.txt file.
#
# The keys are integer code points.  Each value is the dictionary built
# by fill_attribute() from the fields of one UnicodeData.txt line; the
# 'upper', 'lower' and 'title' entries are integer code points or None,
# all other entries are the strings found in the file.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}
  49
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file.
#
# The keys are integer code points.  Each value is the list of property
# names which fill_derived_core_properties() found for that code point,
# in the order in which they appear in the file.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
  59
# Dictionary holding the entire contents of the EastAsianWidth.txt file.
#
# The keys are integer code points.  Each value is the width property
# string which fill_east_asian_widths() read for that code point.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
  66
  67 def fill_attribute(code_point, fields):
  68     '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
  69
  70     One entry in the UNICODE_ATTRIBUTES dictionary represents one line
  71     in the UnicodeData.txt file.
  72
  73     '''
  74     UNICODE_ATTRIBUTES[code_point] =  {
  75         'name': fields[1],          # Character name
  76         'category': fields[2],      # General category
  77         'combining': fields[3],     # Canonical combining classes
  78         'bidi': fields[4],          # Bidirectional category
  79         'decomposition': fields[5], # Character decomposition mapping
  80         'decdigit': fields[6],      # Decimal digit value
  81         'digit': fields[7],         # Digit value
  82         'numeric': fields[8],       # Numeric value
  83         'mirrored': fields[9],      # mirrored
  84         'oldname': fields[10],      # Old Unicode 1.0 name
  85         'comment': fields[11],      # comment
  86         # Uppercase mapping
  87         'upper': int(fields[12], 16) if fields[12] else None,
  88         # Lowercase mapping
  89         'lower': int(fields[13], 16) if fields[13] else None,
  90         # Titlecase mapping
  91         'title': int(fields[14], 16) if fields[14] else None,
  92     }
  93
  94 def fill_attributes(filename):
  95     '''Stores the entire contents of the UnicodeData.txt file
  96     in the UNICODE_ATTRIBUTES dictionary.
  97
  98     A typical line for a single code point in UnicodeData.txt looks
  99     like this:
 100
 101     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
 102
 103     Code point ranges are indicated by pairs of lines like this:
 104
 105     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
 106     9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
 107     '''
 108     with open(filename, mode='r') as unicode_data_file:
 109         fields_start = []
 110         for line in unicode_data_file:
 111             fields = line.strip().split(';')
 112             if len(fields) != 15:
 113                 sys.stderr.write(
 114                     'short line in file "%(f)s": %(l)s\n' %{
 115                     'f': filename, 'l': line})
 116                 exit(1)
 117             if fields[2] == 'Cs':
 118                 # Surrogates are UTF-16 artefacts,
 119                 # not real characters. Ignore them.
 120                 fields_start = []
 121                 continue
 122             if fields[1].endswith(', First>'):
 123                 fields_start = fields
 124                 fields_start[1] = fields_start[1].split(',')[0][1:]
 125                 continue
 126             if fields[1].endswith(', Last>'):
 127                 fields[1] = fields[1].split(',')[0][1:]
 128                 if fields[1:] != fields_start[1:]:
 129                     sys.stderr.write(
 130                         'broken code point range in file "%(f)s": %(l)s\n' %{
 131                             'f': filename, 'l': line})
 132                     exit(1)
 133                 for code_point in range(
 134                         int(fields_start[0], 16),
 135                         int(fields[0], 16)+1):
 136                     fill_attribute(code_point, fields)
 137                 fields_start = []
 138                 continue
 139             fill_attribute(int(fields[0], 16), fields)
 140             fields_start = []
 141
 142 def fill_derived_core_properties(filename):
 143     '''Stores the entire contents of the DerivedCoreProperties.txt file
 144     in the DERIVED_CORE_PROPERTIES dictionary.
 145
 146     Lines in DerivedCoreProperties.txt are either a code point range like
 147     this:
 148
 149     0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
 150
 151     or a single code point like this:
 152
 153     00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
 154
 155     '''
 156     with open(filename, mode='r') as derived_core_properties_file:
 157         for line in derived_core_properties_file:
 158             match = re.match(
 159                 r'^(?P<codepoint1>[0-9A-F]{4,6})'
 160                 + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
 161                 + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
 162                 line)
 163             if not match:
 164                 continue
 165             start = match.group('codepoint1')
 166             end = match.group('codepoint2')
 167             if not end:
 168                 end = start
 169             for code_point in range(int(start, 16), int(end, 16)+1):
 170                 prop = match.group('property')
 171                 if code_point in DERIVED_CORE_PROPERTIES:
 172                     DERIVED_CORE_PROPERTIES[code_point].append(prop)
 173                 else:
 174                     DERIVED_CORE_PROPERTIES[code_point] = [prop]
 175
 176 def fill_east_asian_widths(filename):
 177     '''Stores the entire contents of the EastAsianWidths.txt file
 178     in the EAST_ASIAN_WIDTHS dictionary.
 179
 180     Lines in EastAsianWidths.txt are either a code point range like
 181     this:
 182
 183     9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
 184
 185     or a single code point like this:
 186
 187     A015;W           # Lm         YI SYLLABLE WU
 188     '''
 189     with open(filename, mode='r') as east_asian_widths_file:
 190         for line in east_asian_widths_file:
 191             match = re.match(
 192                 r'^(?P<codepoint1>[0-9A-F]{4,6})'
 193                 +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
 194                 +r'\s*;\s*(?P<property>[a-zA-Z]+)',
 195                 line)
 196             if not match:
 197                 continue
 198             start = match.group('codepoint1')
 199             end = match.group('codepoint2')
 200             if not end:
 201                 end = start
 202             for code_point in range(int(start, 16), int(end, 16)+1):
 203                 EAST_ASIAN_WIDTHS[code_point] = match.group('property')
 204
 205 def to_upper(code_point):
 206     '''Returns the code point of the uppercase version
 207     of the given code point'''
 208     if (UNICODE_ATTRIBUTES[code_point]['name']
 209         and UNICODE_ATTRIBUTES[code_point]['upper']):
 210         return UNICODE_ATTRIBUTES[code_point]['upper']
 211     else:
 212         return code_point
 213
 214 def to_lower(code_point):
 215     '''Returns the code point of the lowercase version
 216     of the given code point'''
 217     if (UNICODE_ATTRIBUTES[code_point]['name']
 218         and UNICODE_ATTRIBUTES[code_point]['lower']):
 219         return UNICODE_ATTRIBUTES[code_point]['lower']
 220     else:
 221         return code_point
 222
 223 def to_upper_turkish(code_point):
 224     '''Returns the code point of the Turkish uppercase version
 225     of the given code point'''
 226     if code_point == 0x0069:
 227         return 0x0130
 228     return to_upper(code_point)
 229
 230 def to_lower_turkish(code_point):
 231     '''Returns the code point of the Turkish lowercase version
 232     of the given code point'''
 233     if code_point == 0x0049:
 234         return 0x0131
 235     return to_lower(code_point)
 236
 237 def to_title(code_point):
 238     '''Returns the code point of the titlecase version
 239     of the given code point'''
 240     if (UNICODE_ATTRIBUTES[code_point]['name']
 241         and UNICODE_ATTRIBUTES[code_point]['title']):
 242         return UNICODE_ATTRIBUTES[code_point]['title']
 243     else:
 244         return code_point
 245
 246 def is_upper(code_point):
 247     '''Checks whether the character with this code point is uppercase'''
 248     return (to_lower(code_point) != code_point
 249             or (code_point in DERIVED_CORE_PROPERTIES
 250                 and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
 251
 252 def is_lower(code_point):
 253     '''Checks whether the character with this code point is lowercase'''
 254     # Some characters are defined as “Lowercase” in
 255     # DerivedCoreProperties.txt but do not have a mapping to upper
 256     # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
 257     # one of these.
 258     return (to_upper(code_point) != code_point
 259             # <U00DF> is lowercase, but without simple to_upper mapping.
 260             or code_point == 0x00DF
 261             or (code_point in DERIVED_CORE_PROPERTIES
 262                 and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
 263
 264 def is_alpha(code_point):
 265     '''Checks whether the character with this code point is alphabetic'''
 266     return ((code_point in DERIVED_CORE_PROPERTIES
 267              and
 268              'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
 269             or
 270             # Consider all the non-ASCII digits as alphabetic.
 271             # ISO C 99 forbids us to have them in category “digit”,
 272             # but we want iswalnum to return true on them.
 273             (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
 274              and not (code_point >= 0x0030 and code_point <= 0x0039)))
 275
 276 def is_digit(code_point):
 277     '''Checks whether the character with this code point is a digit'''
 278     if False:
 279         return (UNICODE_ATTRIBUTES[code_point]['name']
 280                 and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
 281         # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
 282         # a zero.  Must add <0> in front of them by hand.
 283     else:
 284         # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
 285         # takes it away:
 286         # 7.25.2.1.5:
 287         #    The iswdigit function tests for any wide character that
 288         #    corresponds to a decimal-digit character (as defined in 5.2.1).
 289         # 5.2.1:
 290         #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
 291         return (code_point >= 0x0030 and code_point <= 0x0039)
 292
 293 def is_outdigit(code_point):
 294     '''Checks whether the character with this code point is outdigit'''
 295     return (code_point >= 0x0030 and code_point <= 0x0039)
 296
 297 def is_blank(code_point):
 298     '''Checks whether the character with this code point is blank'''
 299     return (code_point == 0x0009 # '\t'
 300             # Category Zs without mention of '<noBreak>'
 301             or (UNICODE_ATTRIBUTES[code_point]['name']
 302                 and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
 303                 and '<noBreak>' not in
 304                 UNICODE_ATTRIBUTES[code_point]['decomposition']))
 305
 306 def is_space(code_point):
 307     '''Checks whether the character with this code point is a space'''
 308     # Don’t make U+00A0 a space. Non-breaking space means that all programs
 309     # should treat it like a punctuation character, not like a space.
 310     return (code_point == 0x0020 # ' '
 311             or code_point == 0x000C # '\f'
 312             or code_point == 0x000A # '\n'
 313             or code_point == 0x000D # '\r'
 314             or code_point == 0x0009 # '\t'
 315             or code_point == 0x000B # '\v'
 316             # Categories Zl, Zp, and Zs without mention of "<noBreak>"
 317             or (UNICODE_ATTRIBUTES[code_point]['name']
 318                 and
 319                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
 320                  or
 321                  (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
 322                   and
 323                   '<noBreak>' not in
 324                   UNICODE_ATTRIBUTES[code_point]['decomposition']))))
 325
 326 def is_cntrl(code_point):
 327     '''Checks whether the character with this code point is
 328     a control character'''
 329     return (UNICODE_ATTRIBUTES[code_point]['name']
 330             and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
 331                  or
 332                  UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
 333
 334 def is_xdigit(code_point):
 335     '''Checks whether the character with this code point is
 336     a hexadecimal digit'''
 337     if False:
 338         return (is_digit(code_point)
 339                 or (code_point >= 0x0041 and code_point <= 0x0046)
 340                 or (code_point >= 0x0061 and code_point <= 0x0066))
 341     else:
 342         # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
 343         # takes it away:
 344         # 7.25.2.1.12:
 345         #    The iswxdigit function tests for any wide character that
 346         #    corresponds to a hexadecimal-digit character (as defined
 347         #    in 6.4.4.1).
 348         # 6.4.4.1:
 349         #    hexadecimal-digit: one of
 350         #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
 351         return ((code_point >= 0x0030 and code_point  <= 0x0039)
 352                 or (code_point >= 0x0041 and code_point <= 0x0046)
 353                 or (code_point >= 0x0061 and code_point <= 0x0066))
 354
 355 def is_graph(code_point):
 356     '''Checks whether the character with this code point is
 357     a graphical character'''
 358     return (UNICODE_ATTRIBUTES[code_point]['name']
 359             and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
 360             and not is_space(code_point))
 361
 362 def is_print(code_point):
 363     '''Checks whether the character with this code point is printable'''
 364     return (UNICODE_ATTRIBUTES[code_point]['name']
 365             and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
 366             and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
 367
 368 def is_punct(code_point):
 369     '''Checks whether the character with this code point is punctuation'''
 370     if False:
 371         return (UNICODE_ATTRIBUTES[code_point]['name']
 372                 and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
 373     else:
 374         # The traditional POSIX definition of punctuation is every graphic,
 375         # non-alphanumeric character.
 376         return (is_graph(code_point)
 377                 and not is_alpha(code_point)
 378                 and not is_digit(code_point))
 379
 380 def is_combining(code_point):
 381     '''Checks whether the character with this code point is
 382     a combining character'''
 383     # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
 384     # file. In 3.0.1 it was identical to the union of the general categories
 385     # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
 386     # PropList.txt file, so we take the latter definition.
 387     return (UNICODE_ATTRIBUTES[code_point]['name']
 388             and
 389             UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
 390
 391 def is_combining_level3(code_point):
 392     '''Checks whether the character with this code point is
 393     a combining level3 character'''
 394     return (is_combining(code_point)
 395             and
 396             int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
 397
 398 def ucs_symbol(code_point):
 399     '''Return the UCS symbol string for a Unicode character.'''
 400     if code_point < 0x10000:
 401         return '<U{:04X}>'.format(code_point)
 402     else:
 403         return '<U{:08X}>'.format(code_point)
 404
 405 def ucs_symbol_range(code_point_low, code_point_high):
 406     '''Returns a string UCS symbol string for a code point range.
 407
 408     Example:
 409
 410     <U0041>..<U005A>
 411     '''
 412     return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
 413
 414 def verifications():
 415     '''Tests whether the is_* functions observe the known restrictions'''
 416     for code_point in sorted(UNICODE_ATTRIBUTES):
 417         # toupper restriction: "Only characters specified for the keywords
 418         # lower and upper shall be specified.
 419         if (to_upper(code_point) != code_point
 420             and not (is_lower(code_point) or is_upper(code_point))):
 421             sys.stderr.write(
 422                 ('%(sym)s is not upper|lower '
 423                  + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
 424                     'sym': ucs_symbol(code_point),
 425                     'c': code_point,
 426                     'uc': to_upper(code_point)})
 427         # tolower restriction: "Only characters specified for the keywords
 428         # lower and upper shall be specified.
 429         if (to_lower(code_point) != code_point
 430             and not (is_lower(code_point) or is_upper(code_point))):
 431             sys.stderr.write(
 432                 ('%(sym)s is not upper|lower '
 433                  + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
 434                     'sym': ucs_symbol(code_point),
 435                     'c': code_point,
 436                     'uc': to_lower(code_point)})
 437         # alpha restriction: "Characters classified as either upper or lower
 438         # shall automatically belong to this class.
 439         if ((is_lower(code_point) or is_upper(code_point))
 440              and not is_alpha(code_point)):
 441             sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
 442                 'sym': ucs_symbol(code_point)})
 443         # alpha restriction: “No character specified for the keywords cntrl,
 444         # digit, punct or space shall be specified.”
 445         if (is_alpha(code_point) and is_cntrl(code_point)):
 446             sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
 447                 'sym': ucs_symbol(code_point)})
 448         if (is_alpha(code_point) and is_digit(code_point)):
 449             sys.stderr.write('%(sym)s is alpha and digit\n' %{
 450                 'sym': ucs_symbol(code_point)})
 451         if (is_alpha(code_point) and is_punct(code_point)):
 452             sys.stderr.write('%(sym)s is alpha and punct\n' %{
 453                 'sym': ucs_symbol(code_point)})
 454         if (is_alpha(code_point) and is_space(code_point)):
 455             sys.stderr.write('%(sym)s is alpha and space\n' %{
 456                 'sym': ucs_symbol(code_point)})
 457         # space restriction: “No character specified for the keywords upper,
 458         # lower, alpha, digit, graph or xdigit shall be specified.”
 459         # upper, lower, alpha already checked above.
 460         if (is_space(code_point) and is_digit(code_point)):
 461             sys.stderr.write('%(sym)s is space and digit\n' %{
 462                 'sym': ucs_symbol(code_point)})
 463         if (is_space(code_point) and is_graph(code_point)):
 464             sys.stderr.write('%(sym)s is space and graph\n' %{
 465                 'sym': ucs_symbol(code_point)})
 466         if (is_space(code_point) and is_xdigit(code_point)):
 467             sys.stderr.write('%(sym)s is space and xdigit\n' %{
 468                 'sym': ucs_symbol(code_point)})
 469         # cntrl restriction: “No character specified for the keywords upper,
 470         # lower, alpha, digit, punct, graph, print or xdigit shall be
 471         # specified.”  upper, lower, alpha already checked above.
 472         if (is_cntrl(code_point) and is_digit(code_point)):
 473             sys.stderr.write('%(sym)s is cntrl and digit\n' %{
 474                 'sym': ucs_symbol(code_point)})
 475         if (is_cntrl(code_point) and is_punct(code_point)):
 476             sys.stderr.write('%(sym)s is cntrl and punct\n' %{
 477                 'sym': ucs_symbol(code_point)})
 478         if (is_cntrl(code_point) and is_graph(code_point)):
 479             sys.stderr.write('%(sym)s is cntrl and graph\n' %{
 480                 'sym': ucs_symbol(code_point)})
 481         if (is_cntrl(code_point) and is_print(code_point)):
 482             sys.stderr.write('%(sym)s is cntrl and print\n' %{
 483                 'sym': ucs_symbol(code_point)})
 484         if (is_cntrl(code_point) and is_xdigit(code_point)):
 485             sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
 486                 'sym': ucs_symbol(code_point)})
 487         # punct restriction: “No character specified for the keywords upper,
 488         # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
 489         # be specified.”  upper, lower, alpha, cntrl already checked above.
 490         if (is_punct(code_point) and is_digit(code_point)):
 491             sys.stderr.write('%(sym)s is punct and digit\n' %{
 492                 'sym': ucs_symbol(code_point)})
 493         if (is_punct(code_point) and is_xdigit(code_point)):
 494             sys.stderr.write('%(sym)s is punct and xdigit\n' %{
 495                 'sym': ucs_symbol(code_point)})
 496         if (is_punct(code_point) and code_point == 0x0020):
 497             sys.stderr.write('%(sym)s is punct\n' %{
 498                 'sym': ucs_symbol(code_point)})
 499         # graph restriction: “No character specified for the keyword cntrl
 500         # shall be specified.”  Already checked above.
 501
 502         # print restriction: “No character specified for the keyword cntrl
 503         # shall be specified.”  Already checked above.
 504
 505         # graph - print relation: differ only in the <space> character.
 506         # How is this possible if there are more than one space character?!
 507         # I think susv2/xbd/locale.html should speak of “space characters”,
 508         # not “space character”.
 509         if (is_print(code_point)
 510             and not (is_graph(code_point) or is_space(code_point))):
 511             sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
 512                 'sym': unicode_utils.ucs_symbol(code_point)})
 513         if (not is_print(code_point)
 514             and (is_graph(code_point) or code_point == 0x0020)):
 515             sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
 516                 'sym': unicode_utils.ucs_symbol(code_point)})