Tools/unicode/gencodec.py

   1 """ Unicode Mapping Parser and Codec Generator.
   2
   3 This script parses Unicode mapping files as available from the Unicode
   4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
   5 modules from them. The codecs use the standard character mapping codec
   6 to actually apply the mapping.
   7
   8 Synopsis: gencodec.py dir codec_prefix
   9
  10 All files in dir are scanned and those producing non-empty mappings
  11 will be written to <codec_prefix><mapname>.py with <mapname> being the
  12 first part of the map's filename ('a' in a.b.c.txt) converted to
  13 lowercase with hyphens replaced by underscores.
  14
  15 The tool also writes marshalled versions of the mapping tables to the
  16 same location (with .mapping extension).
  17
  18 Written by Marc-Andre Lemburg (mal@lemburg.com).
  19
  20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  21 (c) Copyright Guido van Rossum, 2000.
  22
  23 Table generation:
  24 (c) Copyright Marc-Andre Lemburg, 2005.
  25     Licensed to PSF under a Contributor Agreement.
  26
  27 """#"
  28
  29 import re, os, time, marshal, codecs
  30
  31 # Maximum allowed size of charmap tables
  32 MAX_TABLE_SIZE = 8192
  33
  34 # Standard undefined Unicode code point
  35 UNI_UNDEFINED = unichr(0xFFFE)
  36
  37 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
  38                    '\s+'
  39                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
  40                    '\s*'
  41                    '(#.+)?')
  42
  43 def parsecodes(codes,
  44                len=len, filter=filter,range=range):
  45
  46     """ Converts code combinations to either a single code integer
  47         or a tuple of integers.
  48
  49         meta-codes (in angular brackets, e.g. <LR> and <RL>) are
  50         ignored.
  51
  52         Empty codes or illegal ones are returned as None.
  53
  54     """
  55     if not codes:
  56         return None
  57     l = codes.split('+')
  58     if len(l) == 1:
  59         return int(l[0],16)
  60     for i in range(len(l)):
  61         try:
  62             l[i] = int(l[i],16)
  63         except ValueError:
  64             l[i] = None
  65     l = filter(lambda x: x is not None, l)
  66     if len(l) == 1:
  67         return l[0]
  68     else:
  69         return tuple(l)
  70
  71 def readmap(filename):
  72
  73     f = open(filename,'r')
  74     lines = f.readlines()
  75     f.close()
  76     enc2uni = {}
  77     identity = []
  78     unmapped = range(256)
  79
  80     # UTC mapping tables per convention don't include the identity
  81     # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
  82     # explicitly mapped to different characters or undefined
  83     for i in range(32) + [127]:
  84         identity.append(i)
  85         unmapped.remove(i)
  86         enc2uni[i] = (i, 'CONTROL CHARACTER')
  87
  88     for line in lines:
  89         line = line.strip()
  90         if not line or line[0] == '#':
  91             continue
  92         m = mapRE.match(line)
  93         if not m:
  94             #print '* not matched: %s' % repr(line)
  95             continue
  96         enc,uni,comment = m.groups()
  97         enc = parsecodes(enc)
  98         uni = parsecodes(uni)
  99         if comment is None:
 100             comment = ''
 101         else:
 102             comment = comment[1:].strip()
 103         if enc < 256:
 104             if enc in unmapped:
 105                 unmapped.remove(enc)
 106             if enc == uni:
 107                 identity.append(enc)
 108             enc2uni[enc] = (uni,comment)
 109         else:
 110             enc2uni[enc] = (uni,comment)
 111
 112     # If there are more identity-mapped entries than unmapped entries,
 113     # it pays to generate an identity dictionary first, and add explicit
 114     # mappings to None for the rest
 115     if len(identity) >= len(unmapped):
 116         for enc in unmapped:
 117             enc2uni[enc] = (None, "")
 118         enc2uni['IDENTITY'] = 256
 119
 120     return enc2uni
 121
 122 def hexrepr(t, precision=4):
 123
 124     if t is None:
 125         return 'None'
 126     try:
 127         len(t)
 128     except:
 129         return '0x%0*X' % (precision, t)
 130     try:
 131         return '(' + ', '.join(['0x%0*X' % (precision, item)
 132                                 for item in t]) + ')'
 133     except TypeError, why:
 134         print '* failed to convert %r: %s' % (t, why)
 135         raise
 136
 137 def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
 138
 139     l = []
 140     append = l.append
 141     if map.has_key("IDENTITY"):
 142         append("%s = codecs.make_identity_dict(range(%d))" %
 143                (varname, map["IDENTITY"]))
 144         append("%s.update({" % varname)
 145         splits = 1
 146         del map["IDENTITY"]
 147         identity = 1
 148     else:
 149         append("%s = {" % varname)
 150         splits = 0
 151         identity = 0
 152
 153     mappings = map.items()
 154     mappings.sort()
 155     i = 0
 156     key_precision, value_precision = precisions
 157     for mapkey, mapvalue in mappings:
 158         mapcomment = ''
 159         if isinstance(mapkey, tuple):
 160             (mapkey, mapcomment) = mapkey
 161         if isinstance(mapvalue, tuple):
 162             (mapvalue, mapcomment) = mapvalue
 163         if mapkey is None:
 164             continue
 165         if (identity and
 166             mapkey == mapvalue and
 167             mapkey < 256):
 168             # No need to include identity mappings, since these
 169             # are already set for the first 256 code points.
 170             continue
 171         key = hexrepr(mapkey, key_precision)
 172         value = hexrepr(mapvalue, value_precision)
 173         if mapcomment and comments:
 174             append('    %s: %s,\t#  %s' % (key, value, mapcomment))
 175         else:
 176             append('    %s: %s,' % (key, value))
 177         i += 1
 178         if i == 4096:
 179             # Split the definition into parts to that the Python
 180             # parser doesn't dump core
 181             if splits == 0:
 182                 append('}')
 183             else:
 184                 append('})')
 185             append('%s.update({' % varname)
 186             i = 0
 187             splits = splits + 1
 188     if splits == 0:
 189         append('}')
 190     else:
 191         append('})')
 192
 193     return l
 194
 195 def python_tabledef_code(varname, map, comments=1, key_precision=2):
 196
 197     l = []
 198     append = l.append
 199     append('%s = (' % varname)
 200
 201     # Analyze map and create table dict
 202     mappings = map.items()
 203     mappings.sort()
 204     table = {}
 205     maxkey = 0
 206     if map.has_key('IDENTITY'):
 207         for key in range(256):
 208             table[key] = (key, '')
 209         maxkey = 255
 210         del map['IDENTITY']
 211     for mapkey, mapvalue in mappings:
 212         mapcomment = ''
 213         if isinstance(mapkey, tuple):
 214             (mapkey, mapcomment) = mapkey
 215         if isinstance(mapvalue, tuple):
 216             (mapvalue, mapcomment) = mapvalue
 217         if mapkey is None:
 218             continue
 219         table[mapkey] = (mapvalue, mapcomment)
 220         if mapkey > maxkey:
 221             maxkey = mapkey
 222     if maxkey > MAX_TABLE_SIZE:
 223         # Table too large
 224         return None
 225
 226     # Create table code
 227     for key in range(maxkey + 1):
 228         if key not in table:
 229             mapvalue = None
 230             mapcomment = 'UNDEFINED'
 231         else:
 232             mapvalue, mapcomment = table[key]
 233         if mapvalue is None:
 234             mapchar = UNI_UNDEFINED
 235         else:
 236             if isinstance(mapvalue, tuple):
 237                 # 1-n mappings not supported
 238                 return None
 239             else:
 240                 mapchar = unichr(mapvalue)
 241         if mapcomment and comments:
 242             append('    %r\t#  %s -> %s' % (mapchar,
 243                                             hexrepr(key, key_precision),
 244                                             mapcomment))
 245         else:
 246             append('    %r' % mapchar)
 247
 248     append(')')
 249     return l
 250
 251 def codegen(name, map, comments=1):
 252
 253     """ Returns Python source for the given map.
 254
 255         Comments are included in the source, if comments is true (default).
 256
 257     """
 258     # Generate code
 259     decoding_map_code = python_mapdef_code(
 260         'decoding_map',
 261         map,
 262         comments=comments)
 263     decoding_table_code = python_tabledef_code(
 264         'decoding_table',
 265         map,
 266         comments=comments)
 267     encoding_map_code = python_mapdef_code(
 268         'encoding_map',
 269         codecs.make_encoding_map(map),
 270         comments=comments,
 271         precisions=(4, 2))
 272
 273     l = [
 274         '''\
 275 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
 276
 277 """#"
 278
 279 import codecs
 280
 281 ### Codec APIs
 282
 283 class Codec(codecs.Codec):
 284
 285     def encode(self,input,errors='strict'):
 286
 287         return codecs.charmap_encode(input,errors,encoding_map)
 288
 289     def decode(self,input,errors='strict'):
 290 ''' % name
 291         ]
 292     if decoding_table_code:
 293         l.append('''\
 294         return codecs.charmap_decode(input,errors,decoding_table)''')
 295     else:
 296         l.append('''\
 297         return codecs.charmap_decode(input,errors,decoding_map)''')
 298
 299     l.append('''
 300 class StreamWriter(Codec,codecs.StreamWriter):
 301     pass
 302
 303 class StreamReader(Codec,codecs.StreamReader):
 304     pass
 305
 306 ### encodings module API
 307
 308 def getregentry():
 309
 310     return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
 311 ''')
 312
 313     # Add decoding table or map (with preference to the table)
 314     if not decoding_table_code:
 315         l.append('''
 316 ### Decoding Map
 317 ''')
 318         l.extend(decoding_map_code)
 319     else:
 320         l.append('''
 321 ### Decoding Table
 322 ''')
 323         l.extend(decoding_table_code)
 324
 325     # Add encoding map
 326     l.append('''
 327 ### Encoding Map
 328 ''')
 329     l.extend(encoding_map_code)
 330
 331     # Final new-line
 332     l.append('\n')
 333
 334     return '\n'.join(l)
 335
 336 def pymap(name,map,pyfile,comments=1):
 337
 338     code = codegen(name,map,comments)
 339     f = open(pyfile,'w')
 340     f.write(code)
 341     f.close()
 342
 343 def marshalmap(name,map,marshalfile):
 344
 345     d = {}
 346     for e,(u,c) in map.items():
 347         d[e] = (u,c)
 348     f = open(marshalfile,'wb')
 349     marshal.dump(d,f)
 350     f.close()
 351
 352 def convertdir(dir,prefix='',comments=1):
 353
 354     mapnames = os.listdir(dir)
 355     for mapname in mapnames:
 356         mappathname = os.path.join(dir, mapname)
 357         if not os.path.isfile(mappathname):
 358             continue
 359         name = os.path.split(mapname)[1]
 360         name = name.replace('-','_')
 361         name = name.split('.')[0]
 362         name = name.lower()
 363         codefile = name + '.py'
 364         marshalfile = name + '.mapping'
 365         print 'converting %s to %s and %s' % (mapname,
 366                                               prefix + codefile,
 367                                               prefix + marshalfile)
 368         try:
 369             map = readmap(os.path.join(dir,mapname))
 370             if not map:
 371                 print '* map is empty; skipping'
 372             else:
 373                 pymap(mappathname, map, prefix + codefile,comments)
 374                 marshalmap(mappathname, map, prefix + marshalfile)
 375         except ValueError, why:
 376             print '* conversion failed: %s' % why
 377             raise
 378
 379 def rewritepythondir(dir,prefix='',comments=1):
 380
 381     mapnames = os.listdir(dir)
 382     for mapname in mapnames:
 383         if not mapname.endswith('.mapping'):
 384             continue
 385         codefile = mapname[:-len('.mapping')] + '.py'
 386         print 'converting %s to %s' % (mapname,
 387                                        prefix + codefile)
 388         try:
 389             map = marshal.load(open(os.path.join(dir,mapname),
 390                                'rb'))
 391             if not map:
 392                 print '* map is empty; skipping'
 393             else:
 394                 pymap(mapname, map, prefix + codefile,comments)
 395         except ValueError, why:
 396             print '* conversion failed: %s' % why
 397
 398 if __name__ == '__main__':
 399
 400     import sys
 401     if 1:
 402         apply(convertdir,tuple(sys.argv[1:]))
 403     else:
 404         apply(rewritepythondir,tuple(sys.argv[1:]))