3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2016 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
6 # Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
8 # The GNU C Library is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU Lesser General Public
10 # License as published by the Free Software Foundation; either
11 # version 2.1 of the License, or (at your option) any later version.
13 # The GNU C Library is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 # Lesser General Public License for more details.
18 # You should have received a copy of the GNU Lesser General Public
19 # License along with the GNU C Library; if not, see
20 # <http://www.gnu.org/licenses/>.
'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''
def code_point_ranges(is_class_function, code_points=None):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    is_class_function: a predicate taking an integer code point.
    code_points: optional iterable of code points in increasing order to
        classify; by default all code points found in
        unicode_utils.UNICODE_ATTRIBUTES are used.

    A contiguous run is represented as [first, last], an isolated code
    point as a one-element list, for example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []
    for code_point in code_points:
        if is_class_function(code_point):
            # Extend the previous range when this code point is
            # directly adjacent to it, otherwise start a new range.
            if (cp_ranges
                    and cp_ranges[-1][-1] == code_point - 1):
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    i18n_file: the open output file to write to.
    class_name: the class keyword, e.g. “upper”.
    is_class_function: predicate selecting the code points of the class.

    Nothing is written when the class is empty. Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        # Wrap continuation lines at this column with a trailing “/”.
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    i18n_file: the open output file to write to.
    map_name: the map keyword, e.g. “toupper”.
    map_function: maps one code point to another; only code points which
        are actually changed by map_function are written.

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    # Wrap continuation lines at this column with a trailing “/”.
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns the tuple (head, tail): head is everything up to and
    including the “LC_CTYPE” line (with the date stamp in
    LC_IDENTIFICATION replaced by today’s date), tail is everything
    from the “translit_start” line to the end of the file.
    '''
    head = tail = ''
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                # Replace the old date stamp by today’s date.
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old LC_CTYPE character classes.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        # Copy the rest of the file unchanged.
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    If an original i18n file was given on the command line and its head
    could be read, that head is copied verbatim; otherwise a complete
    LC_IDENTIFICATION section is generated from scratch.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    If an original i18n file was given on the command line and its tail
    could be read, that tail is copied verbatim; otherwise only the
    closing “END LC_CTYPE” line is written.
    '''
    if ARGS.input_file and tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file

    i18n_file: the open output file to write to.
    unicode_version: Unicode version string mentioned in the comments.
    turkish: if True, use Turkish case conversions (dotted/dotless I)
        for the “toupper” and “tolower” maps.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('%    <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data the classification and mapping
    # functions in unicode_utils operate on.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)