localedata/unicode-gen/unicode_utils.py

# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2016 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}

def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []

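# Usage sketch (illustrative only; the generator scripts that import this
# module pass in their own copy of the data file): after
#
#     fill_attributes('UnicodeData.txt')
#
# the entry for U+0041 can be inspected like this:
#
#     UNICODE_ATTRIBUTES[0x0041]['name']      # 'LATIN CAPITAL LETTER A'
#     UNICODE_ATTRIBUTES[0x0041]['category']  # 'Lu'
#     UNICODE_ATTRIBUTES[0x0041]['lower']     # 97 (0x0061, stored as an int)
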
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                prop = match.group('property')
                if code_point in DERIVED_CORE_PROPERTIES:
                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
                else:
                    DERIVED_CORE_PROPERTIES[code_point] = [prop]

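# Illustrative example, assuming fill_derived_core_properties() has been
# called on DerivedCoreProperties.txt:
#
#     'Alphabetic' in DERIVED_CORE_PROPERTIES[0x0041]  # True
#     'Lowercase' in DERIVED_CORE_PROPERTIES[0x00AA]   # True, see the
#                                                      # docstring above
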
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidth.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidth.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    '''
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = match.group('property')

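# Illustrative example, assuming fill_east_asian_widths() has been called
# on EastAsianWidth.txt (values taken from the docstring above):
#
#     EAST_ASIAN_WIDTHS[0xA015]  # 'W'  (YI SYLLABLE WU)
#     EAST_ASIAN_WIDTHS[0x9FCD]  # 'W'  (part of the 9FCD..9FFF range)
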
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['upper']):
        return UNICODE_ATTRIBUTES[code_point]['upper']
    else:
        return code_point

def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['lower']):
        return UNICODE_ATTRIBUTES[code_point]['lower']
    else:
        return code_point

def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point'''
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)

def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point'''
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)

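# The two special cases above implement the Turkish dotted/dotless i pairs;
# everything else falls through to to_upper()/to_lower() and therefore needs
# UNICODE_ATTRIBUTES to be filled first:
#
#     to_upper_turkish(0x0069)  # 0x0130, i → İ (LATIN CAPITAL LETTER I WITH DOT ABOVE)
#     to_lower_turkish(0x0049)  # 0x0131, I → ı (LATIN SMALL LETTER DOTLESS I)
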
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['title']):
        return UNICODE_ATTRIBUTES[code_point]['title']
    else:
        return code_point

def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    return (to_lower(code_point) != code_point
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return (to_upper(code_point) != code_point
            # <U00DF> is lowercase, but without simple to_upper mapping.
            or code_point == 0x00DF
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))

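# Examples of the distinction made above (they assume UNICODE_ATTRIBUTES and
# DERIVED_CORE_PROPERTIES have been filled):
#
#     is_lower(0x0061)  # True: 'a' has a to_upper() mapping
#     is_lower(0x00DF)  # True: ß is special-cased, no simple upper mapping
#     is_lower(0xA72F)  # True: only via the 'Lowercase' derived property
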
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category “digit”,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))

def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
        # a zero.  Must add <0> in front of them by hand.
    else:
        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.5:
        #    The iswdigit function tests for any wide character that
        #    corresponds to a decimal-digit character (as defined in 5.2.1).
        # 5.2.1:
        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
        return (code_point >= 0x0030 and code_point <= 0x0039)

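# Consequence of the ISO C 99 rule above (assuming the data files are loaded):
#
#     is_digit(0x0039)  # True: ASCII '9'
#     is_digit(0x0966)  # False: DEVANAGARI DIGIT ZERO is category 'Nd',
#     is_alpha(0x0966)  # True:  ...so is_alpha() claims it instead
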
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    return (code_point >= 0x0030 and code_point <= 0x0039)

def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    return (code_point == 0x0009 # '\t'
            # Category Zs without mention of '<noBreak>'
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
                and '<noBreak>' not in
                UNICODE_ATTRIBUTES[code_point]['decomposition']))

def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))

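# Examples of the policy above (assuming UNICODE_ATTRIBUTES is filled):
#
#     is_space(0x0020)  # True: ASCII space
#     is_space(0x2028)  # True: LINE SEPARATOR, category Zl
#     is_space(0x00A0)  # False: NO-BREAK SPACE is deliberately excluded
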
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))

def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    if False:
        return (is_digit(code_point)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))
    else:
        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.12:
        #    The iswxdigit function tests for any wide character that
        #    corresponds to a hexadecimal-digit character (as defined
        #    in 6.4.4.1).
        # 6.4.4.1:
        #    hexadecimal-digit: one of
        #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
        return ((code_point >= 0x0030 and code_point <= 0x0039)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))

def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and not is_space(code_point))

def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])

def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
    else:
        # The traditional POSIX definition of punctuation is every graphic,
        # non-alphanumeric character.
        return (is_graph(code_point)
                and not is_alpha(code_point)
                and not is_digit(code_point))

def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])

def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))

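# Example of the combining/level-3 distinction (assuming UNICODE_ATTRIBUTES is
# filled): U+0301 COMBINING ACUTE ACCENT is category 'Mn' with canonical
# combining class 230, so
#
#     is_combining(0x0301)         # True
#     is_combining_level3(0x0301)  # False: 230 is outside range(0, 200)
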
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    else:
        return '<U{:08X}>'.format(code_point)

def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>

    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)

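# These helpers need no data files, for example:
#
#     ucs_symbol(0x0041)                # '<U0041>'
#     ucs_symbol(0x1F600)               # '<U0001F600>'
#     ucs_symbol_range(0x0041, 0x005A)  # '<U0041>..<U005A>'
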
def verifications():
    '''Tests whether the is_* functions observe the known restrictions'''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: “Characters classified as either upper or lower
        # shall automatically belong to this class.”
        if ((is_lower(code_point) or is_upper(code_point))
            and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there is more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})

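# A minimal driver sketch, not used by the generator scripts themselves.  It
# assumes the three Unicode data files are present in the current directory
# (an assumption of this example, not a requirement of the module) and shows
# the intended call order: fill the dictionaries, then classify, then run the
# POSIX consistency checks.
if __name__ == '__main__':
    fill_attributes('UnicodeData.txt')
    fill_derived_core_properties('DerivedCoreProperties.txt')
    fill_east_asian_widths('EastAsianWidth.txt')
    # Print a few sample classifications.
    for sample in (0x0041, 0x00DF, 0x0966, 0x2028):
        print('%s alpha=%s digit=%s space=%s width=%s' % (
            ucs_symbol(sample),
            is_alpha(sample),
            is_digit(sample),
            is_space(sample),
            EAST_ASIAN_WIDTHS.get(sample, 'N')))
    # verifications() writes any violations of the POSIX restrictions
    # to stderr.
    verifications()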