Lib/encodings/__init__.py

   1 """ Standard "encodings" Package
   2
   3     Standard Python encoding modules are stored in this package
   4     directory.
   5
   6     Codec modules must have names corresponding to normalized encoding
   7     names as defined in the normalize_encoding() function below, e.g.
   8     'utf-8' must be implemented by the module 'utf_8.py'.
   9
  10     Each codec module must export the following interface:
  11
  12     * getregentry() -> codecs.CodecInfo object
  13     The getregentry() API must return a CodecInfo object with encoder, decoder,
  14     incrementalencoder, incrementaldecoder, streamwriter and streamreader
  15     attributes which adhere to the Python Codec Interface Standard.
  16
  17     In addition, a module may optionally also define the following
  18     APIs which are then used by the package's codec search function:
  19
  20     * getaliases() -> sequence of encoding name strings to use as aliases
  21
  22     Alias names returned by getaliases() must be normalized encoding
  23     names as defined by normalize_encoding().
  24
  25 Written by Marc-Andre Lemburg (mal@lemburg.com).
  26
  27 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  28
  29 """#"
  30
  31 import codecs
  32 from encodings import aliases
  33 import __builtin__
  34
# Maps encoding names, exactly as passed to search_function(), to the
# registry entry found for them, or to None for names that are already
# known not to resolve to a codec module.
_cache = {}
# Sentinel used as the default for _cache lookups; a separate marker is
# needed because None is itself a valid cached value (a remembered miss).
_unknown = '--unknown--'
# Non-empty fromlist handed to __import__() in search_function().
_import_tail = ['*']
# 256-character table for str.translate(), used by normalize_encoding():
# ASCII digits, ASCII letters and '.' map to themselves, every other
# byte value maps to a space.  The exact number of spaces in each piece
# is significant -- the pieces must add up to 256 characters.
_norm_encoding_map = ('                                              . '
                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
                      ' abcdefghijklmnopqrstuvwxyz                     '
                      '                                                '
                      '                                                '
                      '                ')
# The alias table from encodings.aliases.  search_function() adds the
# aliases reported by codec modules to this same dictionary.
_aliases = aliases.aliases
  45
  46 class CodecRegistryError(LookupError, SystemError):
  47     pass
  48
  49 def normalize_encoding(encoding):
  50
  51     """ Normalize an encoding name.
  52
  53         Normalization works as follows: all non-alphanumeric
  54         characters except the dot used for Python package names are
  55         collapsed and replaced with a single underscore, e.g. '  -;#'
  56         becomes '_'. Leading and trailing underscores are removed.
  57
  58         Note that encoding names should be ASCII only; if they do use
  59         non-ASCII characters, these must be Latin-1 compatible.
  60
  61     """
  62     # Make sure we have an 8-bit string, because .translate() works
  63     # differently for Unicode strings.
  64     if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):
  65         # Note that .encode('latin-1') does *not* use the codec
  66         # registry, so this call doesn't recurse. (See unicodeobject.c
  67         # PyUnicode_AsEncodedString() for details)
  68         encoding = encoding.encode('latin-1')
  69     return '_'.join(encoding.translate(_norm_encoding_map).split())
  70
  71 def search_function(encoding):
  72
  73     # Cache lookup
  74     entry = _cache.get(encoding, _unknown)
  75     if entry is not _unknown:
  76         return entry
  77
  78     # Import the module:
  79     #
  80     # First try to find an alias for the normalized encoding
  81     # name and lookup the module using the aliased name, then try to
  82     # lookup the module using the standard import scheme, i.e. first
  83     # try in the encodings package, then at top-level.
  84     #
  85     norm_encoding = normalize_encoding(encoding)
  86     aliased_encoding = _aliases.get(norm_encoding) or \
  87                        _aliases.get(norm_encoding.replace('.', '_'))
  88     if aliased_encoding is not None:
  89         modnames = [aliased_encoding,
  90                     norm_encoding]
  91     else:
  92         modnames = [norm_encoding]
  93     for modname in modnames:
  94         if not modname or '.' in modname:
  95             continue
  96         try:
  97             # Import is absolute to prevent the possibly malicious import of a
  98             # module with side-effects that is not in the 'encodings' package.
  99             mod = __import__('encodings.' + modname, fromlist=_import_tail,
 100                              level=0)
 101         except ImportError:
 102             pass
 103         else:
 104             break
 105     else:
 106         mod = None
 107
 108     try:
 109         getregentry = mod.getregentry
 110     except AttributeError:
 111         # Not a codec module
 112         mod = None
 113
 114     if mod is None:
 115         # Cache misses
 116         _cache[encoding] = None
 117         return None
 118
 119     # Now ask the module for the registry entry
 120     entry = getregentry()
 121     if not isinstance(entry, codecs.CodecInfo):
 122         if not 4 <= len(entry) <= 7:
 123             raise CodecRegistryError,\
 124                  'module "%s" (%s) failed to register' % \
 125                   (mod.__name__, mod.__file__)
 126         if not hasattr(entry[0], '__call__') or \
 127            not hasattr(entry[1], '__call__') or \
 128            (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
 129            (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
 130            (len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \
 131            (len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):
 132             raise CodecRegistryError,\
 133                 'incompatible codecs in module "%s" (%s)' % \
 134                 (mod.__name__, mod.__file__)
 135         if len(entry)<7 or entry[6] is None:
 136             entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
 137         entry = codecs.CodecInfo(*entry)
 138
 139     # Cache the codec registry entry
 140     _cache[encoding] = entry
 141
 142     # Register its aliases (without overwriting previously registered
 143     # aliases)
 144     try:
 145         codecaliases = mod.getaliases()
 146     except AttributeError:
 147         pass
 148     else:
 149         for alias in codecaliases:
 150             if alias not in _aliases:
 151                 _aliases[alias] = modname
 152
 153     # Return the registry entry
 154     return entry
 155
# Register search_function with the Python codec registry.  This is a
# module-level statement, so the registration happens as a side effect
# of importing this package.
codecs.register(search_function)