3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2023 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
7 # The GNU C Library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # The GNU C Library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with the GNU C Library; if not, see
19 # <https://www.gnu.org/licenses/>.
'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''
def code_point_ranges(is_class_function, code_points=None):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]

    A one-element list means a single isolated code point; a two-element
    list is an inclusive [first, last] range.

    is_class_function -- predicate taking a code point (int) and
                         returning True if it belongs to the class.
    code_points       -- optional iterable of candidate code points in
                         ascending order; defaults to all code points
                         known to unicode_utils.  (Parameter added for
                         generality/testability; omitting it preserves
                         the original behaviour.)
    '''
    if code_points is None:
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []  # list of [first] or [first, last] lists
    for code_point in code_points:
        if is_class_function(code_point):
            if (cp_ranges
                    and cp_ranges[-1][-1] == code_point - 1):
                # Contiguous with the previous range: extend it.
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                # Start a new range.
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    i18n_file         -- the open output file
    class_name        -- the class name, for example “upper”
    is_class_function -- predicate taking a code point (int) and
                         returning True if it belongs to the class

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75  # wrap continuation lines at this width
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                # Line full: flush with a “/” continuation marker.
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    i18n_file    -- the open output file
    map_name     -- the map name, for example “toupper”
    map_function -- function mapping a code point (int) to the code
                    point it is converted to

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75  # wrap continuation lines at this width
    prefix = '   '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points actually changed by the map are listed.
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                # Line full: flush with a “/” continuation marker.
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns (head, tail): head is everything up to and including the
    “LC_CTYPE” line (with the LC_IDENTIFICATION date stamp replaced by
    today's date); tail is everything from the “translit_start” line to
    the end of the file.
    '''
    head = tail = ''
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                # Refresh the date stamp in LC_IDENTIFICATION.
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        # Skip the character classes we are about to regenerate.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file       -- the open output file
    unicode_version -- Unicode version string, e.g. “15.0.0”
    head            -- original header text read from the input file;
                       if an input file was given and head is non-empty,
                       it is copied verbatim, otherwise a fresh
                       LC_IDENTIFICATION section is generated.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "i18n:2012";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    i18n_file -- the open output file
    tail      -- original tail text read from the input file; if an
                 input file was given and tail is non-empty, it is
                 copied verbatim, otherwise only the closing
                 “END LC_CTYPE” line is written.
    '''
    if ARGS.input_file and tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file

    i18n_file       -- the open output file
    unicode_version -- Unicode version string, e.g. “15.0.0”
    turkish         -- if True, use Turkish case conversions
                       (dotted/dotless i) for toupper/tolower
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'Python script.\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Load and verify the Unicode data before generating anything.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)