Added a test for the ability to specify a class attribute in Formatter configuration...
[python.git] / Tools / unicode / gencodec.py
blob9b4ae1626a85d2d0634c8b23e13349172ad38128
""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
Licensed to PSF under a Contributor Agreement.

"""#"
import re, os, time, marshal, codecs

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (U+FFFE).  Written as a literal so
# the constant does not depend on a runtime conversion call.
UNI_UNDEFINED = u'\uFFFE'

# Matches one line of a mapping file:
#   group 1 - encoded value: one or more 0x... hex codes joined by '+'
#   group 2 - Unicode value: zero or more 0x... hex codes and/or
#             <meta> codes joined by '+' (may be empty)
#   group 3 - optional trailing comment, including the leading '#'
# Raw strings keep the regex escapes (\+, \s) out of the hands of the
# string-literal parser; both code groups accept hex digits only.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes (e.g. '0x41') joined
        by '+'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) are
        ignored.

        Empty codes or illegal ones are returned as None. This also
        holds if nothing is left after dropping the ignored parts.

        The len, filter and range parameters are only kept so that
        existing call signatures continue to work.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty fragment (trailing '+') or garbage:
            # ignore it instead of aborting the whole conversion.
            continue
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
def readmap(filename):

    """ Parses the mapping file filename and returns a decoding
        dictionary.

        The dictionary maps each encoded value (an integer, or a tuple
        of integers for combined codes) to a (unicode, comment) tuple,
        where unicode is an integer, a tuple of integers or None for
        undefined entries.

        If at least as many single-byte codes are identity mapped as
        are left unmapped, the unmapped codes get explicit (None, "")
        entries and the special key 'IDENTITY' is set to 256.

    """
    # Read everything up front; the with block closes the file even
    # if reading fails.
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Lines that don't look like a mapping are skipped silently
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#'
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity
        # bookkeeping; combined codes (tuples) are just recorded.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
def hexrepr(t, precision=4):

    """ Returns the source code representation of the code value t.

        None is rendered as 'None', a single integer as an upper-case
        hex literal zero-padded to precision digits, and a sequence of
        integers as a parenthesized, comma separated list of such
        literals.

        A TypeError raised while formatting the items of a sequence is
        reported on stdout and then re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code value
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines defining the dictionary
        varname from the entries of map.

        Keys and values of map may be plain codes or (code, comment)
        tuples; a comment found this way is appended to the generated
        line if comments is true. precisions gives the hex digit
        counts used for keys and values.

        If map has an 'IDENTITY' entry, the definition starts from
        codecs.make_identity_dict() and identity mappings below 256
        are left out. Note that the 'IDENTITY' entry is removed from
        map as a side effect.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        # Drop the marker so that it isn't treated as a mapping below
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append(' %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    # Close the literal opened last: a plain dict if no update()
    # call was started, an update() call otherwise
    if splits == 0:
        append('}')
    else:
        append('})')

    return l
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines defining the decoding
        table varname from the entries of map, or None if no table
        can be generated.

        The table has one character entry per key from 0 up to the
        largest key in map; keys without a mapping and keys mapped to
        None are filled with UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if map contains a 1-n mapping (tuple value).

        If map has an 'IDENTITY' entry, the first 256 entries default
        to identity mappings. Note that the 'IDENTITY' entry is
        removed from map as a side effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # Take the snapshot only after the marker is gone, so that it is
    # never processed as if it were a code point
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                        hexrepr(key, key_precision),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l
def codegen(name, map, comments=1):

    """ Returns Python source for the given map.

        name is inserted into the docstring of the generated module.

        Comments are included in the source, if comments is true (default).

        The generated codec decodes via a decoding table if one can be
        built for map and via a decoding map otherwise; only the one
        actually used is written to the source.

    """
    # Generate code
    #
    # The order matters: python_mapdef_code() removes the 'IDENTITY'
    # marker from map, so the calls that follow see the map without it.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    # Module header and codec class; the decode() method body is added
    # below, once it is known which decoding structure is used
    l = [
        '''\
""" Python Character Mapping Codec generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):

        return codecs.charmap_encode(input,errors,encoding_map)

    def decode(self,input,errors='strict'):
''' % name
        ]
    if decoding_table_code:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_table)''')
    else:
        l.append('''\
        return codecs.charmap_decode(input,errors,decoding_map)''')

    l.append('''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():

    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
''')

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    l.append('''
### Encoding Map
''')
    l.extend(encoding_map_code)

    # Final new-line
    l.append('\n')

    return '\n'.join(l)
def pymap(name,map,pyfile,comments=1):

    """ Writes the codec source generated by codegen() for map to
        the file pyfile.

        name and comments are passed on to codegen().

    """
    # Generate first, so that a failing code generation doesn't leave
    # a truncated file behind
    code = codegen(name,map,comments)
    # The with block closes the file even if the write fails
    with open(pyfile,'w') as f:
        f.write(code)
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map must be a two item sequence; it is stored
        as a (unicode, comment) tuple. name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with block closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
def convertdir(dir,prefix='',comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file, a codec module <prefix><name>.py and a
        marshalled map <prefix><name>.mapping are written, with <name>
        being the part of the file name before the first dot,
        converted to lowercase with hyphens replaced by underscores.

        Files producing an empty map are skipped. A ValueError raised
        during conversion is reported on stdout and then re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              prefix + codefile,
                                              prefix + marshalfile))
        try:
            map = readmap(mappathname)
            if not map:
                print('* map is empty; skipping')
            else:
                # pymap() has to run first: generating the code strips
                # the 'IDENTITY' marker, which marshalmap() can't store
                pymap(mappathname, map, prefix + codefile,comments)
                marshalmap(mappathname, map, prefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
def rewritepythondir(dir,prefix='',comments=1):

    """ Regenerates codec modules from the marshalled maps found in
        the directory dir.

        For every file <name>.mapping, the codec module
        <prefix><name>.py is written. Empty maps are skipped. A
        ValueError raised during conversion is reported on stdout and
        the conversion continues with the next file.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        codefile = mapname[:-len('.mapping')] + '.py'
        print('converting %s to %s' % (mapname,
                                       prefix + codefile))
        try:
            # The with block closes the map file right after loading
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, prefix + codefile,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
if __name__ == '__main__':

    import sys
    # Manual switch: change the condition to regenerate the codec
    # modules from existing .mapping files instead of converting
    # mapping files. The command line arguments are passed on as
    # positional arguments (dir, prefix, comments).
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])