# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''
# Dictionary holding the entire contents of the UnicodeData.txt file.
#
# Keys are integer code points; each value is a dictionary with the
# per-character fields stored by fill_attribute().  Contents of this
# dictionary look like this:
#
# {0: {'category': 'Cc',
#      'name': '<control>',
#      'decomposition': '',
#      …},
#  …}
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file.
#
# Keys are integer code points; each value is the list of property
# names given for that code point.  Contents of this dictionary look
# like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …}
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file.
#
# Keys are integer code points; each value is the width property
# string.  Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    code_point is the integer key to store under.  fields is the list
    of semicolon-separated columns of that line; fields[0] (the code
    point column) is not stored, fields[1] to fields[11] are stored as
    strings, and fields[12] to fields[14] (the simple case mappings)
    are stored as integers, or as None when the column is empty.

    Raises IndexError if fields has fewer than 15 entries and
    ValueError if a non-empty case mapping is not hexadecimal.
    '''
    def hex_or_none(text):
        # Case mapping columns are hexadecimal code points or empty.
        return int(text, 16) if text else None

    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],           # Character name
        'category': fields[2],       # General category
        'combining': fields[3],      # Canonical combining classes
        'bidi': fields[4],           # Bidirectional category
        'decomposition': fields[5],  # Character decomposition mapping
        'decdigit': fields[6],       # Decimal digit value
        'digit': fields[7],          # Digit value
        'numeric': fields[8],        # Numeric value
        'mirrored': fields[9],       # mirrored
        'oldname': fields[10],       # Old Unicode 1.0 name
        'comment': fields[11],       # comment
        'upper': hex_or_none(fields[12]),  # Uppercase mapping
        'lower': hex_or_none(fields[13]),  # Lowercase mapping
        'title': hex_or_none(fields[14]),  # Titlecase mapping
    }
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets its own entry, named after
    the range ("CJK Ideograph" in the example).  Surrogates (general
    category Cs) are skipped.

    A line that does not have exactly 15 fields, or a ", Last>" line
    whose fields do not agree with the preceding ", First>" line, is
    reported on stderr and terminates the program with exit status 1.
    '''
    # Function-scope import so that this block does not depend on the
    # module's import section.
    import sys

    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        # Fields of the pending "<..., First>" line; empty when no
        # range is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Remember the start of the range and reduce its name
                # to the part between '<' and ','.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both ends of a
                # range have to carry identical data.  This also
                # catches a "Last" line without a "First" line.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    For every code point covered by such a line the property name is
    appended to the list DERIVED_CORE_PROPERTIES[code_point].  Lines
    that match neither form (comments, blank lines) are ignored.
    '''
    # Function-scope import so that this block does not depend on the
    # module's import section.
    import re

    # Compiled once, outside of the per-line loop.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                # A single code point is a range of length one.
                end = start
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    For every code point covered by such a line the width property
    ('W' in the examples) is stored in EAST_ASIAN_WIDTHS[code_point];
    a later line overrides an earlier one for the same code point.
    Lines that match neither form (comments, blank lines) are ignored.
    '''
    # Function-scope import so that this block does not depend on the
    # module's import section.
    import re

    # Compiled once, outside of the per-line loop.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                # A single code point is a range of length one.
                end = start
            width = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = width
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point

    If the character has no name or no simple uppercase mapping in
    UNICODE_ATTRIBUTES, code_point itself is returned, so callers can
    test "to_upper(code_point) != code_point".

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point

    If the character has no name or no simple lowercase mapping in
    UNICODE_ATTRIBUTES, code_point itself is returned, so callers can
    test "to_lower(code_point) != code_point".

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point

    Identical to to_upper() except for U+0069 LATIN SMALL LETTER I,
    whose Turkish uppercase is U+0130 LATIN CAPITAL LETTER I WITH DOT
    ABOVE.
    '''
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)
def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point

    Identical to to_lower() except for U+0049 LATIN CAPITAL LETTER I,
    whose Turkish lowercase is U+0131 LATIN SMALL LETTER DOTLESS I.
    '''
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point

    If the character has no name or no simple titlecase mapping in
    UNICODE_ATTRIBUTES, code_point itself is returned.

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase

    A character is uppercase if it has a lowercase mapping different
    from itself, or if DerivedCoreProperties.txt lists it with the
    'Uppercase' property.
    '''
    if to_lower(code_point) != code_point:
        return True
    # Code points without derived core properties yield an empty tuple.
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase

    A character is lowercase if it has an uppercase mapping different
    from itself, if it is U+00DF, or if DerivedCoreProperties.txt
    lists it with the 'Lowercase' property.
    '''
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    # Some characters are defined as "Lowercase" in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A730 "LATIN LETTER SMALL CAPITAL F" is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic

    A character is alphabetic if DerivedCoreProperties.txt lists it
    with the 'Alphabetic' property, or if it is a decimal digit
    (general category Nd) outside of the ASCII range 0..9.

    Raises KeyError if the first test fails and code_point has no
    entry in UNICODE_ATTRIBUTES.
    '''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category "digit",
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit

    Only the ten ASCII digits U+0030..U+0039 are digits.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # does not:
    #
    #   The iswdigit function tests for any wide character that
    #   corresponds to a decimal-digit character (as defined in 5.2.1),
    #   that is, one of the 10 decimal digits 0 1 2 3 4 5 6 7 8 9.
    #
    # The alternative of using every character of general category Nd
    # is therefore not used.  (Note for that alternative: U+0BE7..U+0BEF
    # and U+1369..U+1371 are digit systems without a zero; <0> would
    # have to be added in front of them by hand.)
    return 0x0030 <= code_point <= 0x0039
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit

    Only the ten ASCII digits U+0030..U+0039 qualify.
    '''
    return 0x0030 <= code_point <= 0x0039
def is_blank(code_point):
    '''Checks whether the character with this code point is blank

    Blank characters are the tab U+0009 and every named character of
    general category Zs whose decomposition does not mention
    '<noBreak>'.

    Raises KeyError if code_point is not the tab and has no entry in
    UNICODE_ATTRIBUTES.
    '''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
def is_space(code_point):
    '''Checks whether the character with this code point is a space

    Space characters are the six ASCII white space characters and
    every named character of general category Zl or Zp, or of general
    category Zs without '<noBreak>' in its decomposition.

    Raises KeyError if code_point is not one of the ASCII white space
    characters and has no entry in UNICODE_ATTRIBUTES.
    '''
    # Don't make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    if code_point in (0x0020,   # ' '
                      0x000C,   # '\f'
                      0x000A,   # '\n'
                      0x000D,   # '\r'
                      0x0009,   # '\t'
                      0x000B):  # '\v'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    return (attributes['name']
            and (attributes['category'] in ['Zl', 'Zp']
                 or (attributes['category'] in ['Zs']
                     and '<noBreak>' not in attributes['decomposition'])))
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character

    Control characters are the characters named '<control>' and the
    named characters of general category Zl or Zp.

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and (attributes['name'] == '<control>'
                 or attributes['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit

    Only the ASCII characters 0..9, A..F and a..f are hexadecimal
    digits.
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # does not:
    #
    #   The iswxdigit function tests for any wide character that
    #   corresponds to a hexadecimal-digit character (as defined
    #   in 6.4.4.1).
    #
    #   hexadecimal-digit: one of
    #   0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character

    Graphical characters are the named characters which are neither
    named '<control>' nor space characters according to is_space().

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
def is_print(code_point):
    '''Checks whether the character with this code point is printable

    Printable characters are the named characters which are neither
    named '<control>' nor of general category Zl or Zp.

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['name'] != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation

    Punctuation is every graphical character (is_graph) which is
    neither alphabetic (is_alpha) nor a digit (is_digit).
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The alternative definition, every
    # named character whose general category starts with 'P', is
    # therefore not used.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character

    Combining characters are the named characters of general category
    Mn, Mc or Me.

    Raises KeyError if code_point has no entry in UNICODE_ATTRIBUTES.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character

    These are the combining characters (is_combining) whose canonical
    combining class, stored as a decimal string in the 'combining'
    attribute, is in the range 0..199.
    '''
    return (is_combining(code_point)
            and int(UNICODE_ATTRIBUTES[code_point]['combining'])
            in range(0, 200))
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four upper case
    hexadecimal digits, all others with eight:

    0x41 -> '<U0041>', 0x10000 -> '<U00010000>'
    '''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    return '<U{:08X}>'.format(code_point)
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    The result is the UCS symbol of code_point_low, two dots and the
    UCS symbol of code_point_high.  Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Every code point in UNICODE_ATTRIBUTES is checked, in ascending
    order, against the restrictions on character classes quoted in the
    comments below.  Each violation is reported as one line on
    sys.stderr; nothing is returned and no exception is raised for a
    violation.
    '''
    # Pairs of character classes which must not have a character in
    # common, in the order in which violations are reported.
    #
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    #
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    #
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified."  upper, lower, alpha are covered by the alpha checks.
    #
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified."  upper, lower, alpha, cntrl are covered by the
    # earlier checks; <space> is checked separately below.
    #
    # graph and print restrictions: "No character specified for the
    # keyword cntrl shall be specified."  Covered by the cntrl checks.
    disjoint_classes = (
        ('alpha', is_alpha, 'cntrl', is_cntrl),
        ('alpha', is_alpha, 'digit', is_digit),
        ('alpha', is_alpha, 'punct', is_punct),
        ('alpha', is_alpha, 'space', is_space),
        ('space', is_space, 'digit', is_digit),
        ('space', is_space, 'graph', is_graph),
        ('space', is_space, 'xdigit', is_xdigit),
        ('cntrl', is_cntrl, 'digit', is_digit),
        ('cntrl', is_cntrl, 'punct', is_punct),
        ('cntrl', is_cntrl, 'graph', is_graph),
        ('cntrl', is_cntrl, 'print', is_print),
        ('cntrl', is_cntrl, 'xdigit', is_xdigit),
        ('punct', is_punct, 'digit', is_digit),
        ('punct', is_punct, 'xdigit', is_xdigit),
    )
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        upper = to_upper(code_point)
        if upper != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': upper})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified."
        lower = to_lower(code_point)
        if lower != code_point and not cased:
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': lower})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if cased and not is_alpha(code_point):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': sym})
        for first_name, in_first, second_name, in_second in disjoint_classes:
            if in_first(code_point) and in_second(code_point):
                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' %{
                    'sym': sym, 'a': first_name, 'b': second_name})
        # punct restriction: the <space> character must not be punct.
        if is_punct(code_point) and code_point == 0x0020:
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': sym})
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        printable = is_print(code_point)
        graphical = is_graph(code_point)
        if printable and not (graphical or is_space(code_point)):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': sym})
        if not printable and (graphical or code_point == 0x0020):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': sym})