# -*- coding: utf-8 -*-
# Copyright (C) 2014-2018 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This script is useful for checking the backward compatibility of a newly
generated UTF-8 file produced by the utf8_gen.py script.

To see how this script is used, call it with the “-h” option:

    $ ./utf8_compatibility.py -h
    … prints usage message …
'''
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Lines between a line starting with “CHARMAP” and a line starting
    with “END CHARMAP” are parsed.  Lines starting with “%” and lines
    which do not look like “<Uxxxx> /xNN…” or “<Uxxxx>..<Uyyyy> /xNN…”
    are skipped.

    Returns a dictionary mapping each code point (as an integer) to the
    “/xNN…” byte sequence string given on its line; every code point of
    a range gets the same string.

    If no complete CHARMAP section is found, an error message is written
    to stderr and the script exits with status 1.
    '''
    # Compiled once instead of once per line.  The optional range part
    # is a real non-capturing group and only accepts hexadecimal digits,
    # so a malformed end point is skipped instead of crashing int().
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to the start of the CHARMAP section.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # Continue on the same iterator, i.e. right after “CHARMAP”.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     % file_name)
    sys.exit(1)
def _charmap_character_name(key):
    '''Return the name stored in unicode_utils.UNICODE_ATTRIBUTES for the
    code point “key”, or the string “None” if there is no entry
    '''
    if key in unicode_utils.UNICODE_ATTRIBUTES:
        return unicode_utils.UNICODE_ATTRIBUTES[key]['name']
    return 'None'


def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints the number of removed, changed and added code points to
    stdout.  The individual code points are listed as well when the
    corresponding option in the module level ARGS is set
    (show_missing_characters, show_changed_characters,
    show_added_characters).
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    # Each set difference is computed once and reused for both the
    # count and the detailed listing.
    removed_keys = sorted(set(ocharmap) - set(ncharmap))
    added_keys = sorted(set(ncharmap) - set(ocharmap))
    # Code points present in both files whose byte sequence differs,
    # mapped to the (old, new) pair.
    changed_charmap = {
        key: (ocharmap[key], ncharmap[key])
        for key in set(ocharmap).intersection(ncharmap)
        if ocharmap[key] != ncharmap[key]}
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed_keys))
    if ARGS.show_missing_characters:
        for key in removed_keys:
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                _charmap_character_name(key)))
    print('------------------------------------------------------------')
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                _charmap_character_name(key)))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added_keys))
    if ARGS.show_added_characters:
        for key in added_keys:
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                _charmap_character_name(key)))
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Lines between a line starting with “WIDTH” and a line starting with
    “END WIDTH” are parsed.  Lines which do not look like “<Uxxxx> N” or
    “<Uxxxx>...<Uyyyy> N” with N being 0 or 2 are skipped.

    Returns a dictionary mapping each code point (as an integer) to its
    width (as an integer); every code point of a range gets the same
    width.

    If no complete WIDTH section is found, an error message is written
    to stderr and the script exits with status 1.
    '''
    # Compiled once instead of once per line.  The optional range part
    # is a real non-capturing group and only accepts hexadecimal digits,
    # so a malformed end point is skipped instead of crashing int().
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        width_dictionary = {}
        # Skip everything up to the start of the WIDTH section.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # Continue on the same iterator, i.e. right after “WIDTH”.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    # Format with file_name: the name “file” is not defined here, and
    # using it turned this error report into a NameError.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     % file_name)
    # Exit like the CHARMAP parser does rather than returning None,
    # which the caller cannot handle.
    sys.exit(1)
def _width_report_attributes(key):
    '''Return the “eaw=… category=… bidi=… name=…” part of a WIDTH report
    line for the code point “key”

    Values are looked up in unicode_utils.EAST_ASIAN_WIDTHS and
    unicode_utils.UNICODE_ATTRIBUTES; the string “None” is used for
    anything without an entry.
    '''
    if key in unicode_utils.EAST_ASIAN_WIDTHS:
        east_asian_width = unicode_utils.EAST_ASIAN_WIDTHS[key]
    else:
        east_asian_width = 'None'
    if key in unicode_utils.UNICODE_ATTRIBUTES:
        attributes = unicode_utils.UNICODE_ATTRIBUTES[key]
        category = attributes['category']
        bidi = attributes['bidi']
        name = attributes['name']
    else:
        category = bidi = name = 'None'
    return ('eaw={:s} '.format(east_asian_width)
            + 'category={:2s} '.format(category)
            + 'bidi={:3s} '.format(bidi)
            + 'name={:s}'.format(name))


def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file

    Prints the number of removed, changed and added code points to
    stdout.  The individual code points are listed as well when the
    corresponding option in the module level ARGS is set
    (show_missing_characters, show_changed_characters,
    show_added_characters).
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    # Each set difference is computed once and reused for both the
    # count and the detailed listing.
    removed_keys = sorted(set(owidth) - set(nwidth))
    added_keys = sorted(set(nwidth) - set(owidth))
    # Code points present in both files whose width differs, mapped to
    # the (old, new) pair.
    changed_width = {
        key: (owidth[key], nwidth[key])
        for key in set(owidth).intersection(nwidth)
        if owidth[key] != nwidth[key]}
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed_keys))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in removed_keys:
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _width_report_attributes(key))
    print('------------------------------------------------------------')
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + _width_report_attributes(key))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          % len(added_keys))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in added_keys:
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _width_report_attributes(key))
if __name__ == "__main__":
    # ARGS is deliberately a module level name: check_charmap() and
    # check_width() read the show_* options from it.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 files and
        report the differences.
        ''')
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    ARGS = PARSER.parse_args()

    # The Unicode data files are optional; without them the reports
    # print “None” for the attributes they would have provided.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)