Lib/encodings/__init__.py

   1 """ Standard "encodings" Package
   2
   3     Standard Python encoding modules are stored in this package
   4     directory.
   5
   6     Codec modules must have names corresponding to normalized encoding
   7     names as defined in the normalize_encoding() function below, e.g.
   8     'utf-8' must be implemented by the module 'utf_8.py'.
   9
  10     Each codec module must export the following interface:
  11
  12     * getregentry() -> (encoder, decoder, stream_reader, stream_writer)
  13     The getregentry() API must return callable objects which adhere to
  14     the Python Codec Interface Standard.
  15
  16     In addition, a module may optionally also define the following
  17     APIs which are then used by the package's codec search function:
  18
  19     * getaliases() -> sequence of encoding name strings to use as aliases
  20
  21     Alias names returned by getaliases() must be normalized encoding
  22     names as defined by normalize_encoding().
  23
  24 Written by Marc-Andre Lemburg (mal@lemburg.com).
  25
  26 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  27
  28 """#"
  29
  30 import codecs, types, aliases
  31
# Cache of lookup results, keyed by the encoding name exactly as it was
# passed to search_function().  Values are codec registry entries or
# None for encodings that could not be resolved.
_cache = {}

# Sentinel used as the default in _cache.get(), so that a cached None
# (a failed lookup) can be told apart from "not cached yet".
_unknown = '--unknown--'

# Passed as the fromlist argument to __import__() in search_function().
_import_tail = ['*']

# 256-character translation table for 8-bit strings, used by
# normalize_encoding(): the ASCII digits, the ASCII letters and the dot
# map to themselves, every other character maps to a blank.
_norm_encoding_map = ('                                              . '
                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
                      ' abcdefghijklmnopqrstuvwxyz                     '
                      '                                                '
                      '                                                '
                      '                ')

# Alias table taken from the aliases module.  search_function() reads
# it to map a normalized encoding name to a module name and adds the
# aliases reported by codec modules to it.
_aliases = aliases.aliases
  42
  43 class CodecRegistryError(LookupError, SystemError):
  44     pass
  45
  46 def normalize_encoding(encoding):
  47
  48     """ Normalize an encoding name.
  49
  50         Normalization works as follows: all non-alphanumeric
  51         characters except the dot used for Python package names are
  52         collapsed and replaced with a single underscore, e.g. '  -;#'
  53         becomes '_'. Leading and trailing underscores are removed.
  54
  55         Note that encoding names should be ASCII only; if they do use
  56         non-ASCII characters, these must be Latin-1 compatible.
  57
  58     """
  59     # Make sure we have an 8-bit string, because .translate() works
  60     # differently for Unicode strings.
  61     if type(encoding) is types.UnicodeType:
  62         # Note that .encode('latin-1') does *not* use the codec
  63         # registry, so this call doesn't recurse. (See unicodeobject.c
  64         # PyUnicode_AsEncodedString() for details)
  65         encoding = encoding.encode('latin-1')
  66     return '_'.join(encoding.translate(_norm_encoding_map).split())
  67
  68 def search_function(encoding):
  69
  70     # Cache lookup
  71     entry = _cache.get(encoding, _unknown)
  72     if entry is not _unknown:
  73         return entry
  74
  75     # Import the module:
  76     #
  77     # First try to find an alias for the normalized encoding
  78     # name and lookup the module using the aliased name, then try to
  79     # lookup the module using the standard import scheme, i.e. first
  80     # try in the encodings package, then at top-level.
  81     #
  82     norm_encoding = normalize_encoding(encoding)
  83     aliased_encoding = _aliases.get(norm_encoding) or \
  84                        _aliases.get(norm_encoding.replace('.', '_'))
  85     if aliased_encoding is not None:
  86         modnames = [aliased_encoding,
  87                     norm_encoding]
  88     else:
  89         modnames = [norm_encoding]
  90     for modname in modnames:
  91         if not modname:
  92             continue
  93         try:
  94             mod = __import__(modname,
  95                              globals(), locals(), _import_tail)
  96         except ImportError:
  97             pass
  98         else:
  99             break
 100     else:
 101         mod = None
 102
 103     try:
 104         getregentry = mod.getregentry
 105     except AttributeError:
 106         # Not a codec module
 107         mod = None
 108
 109     if mod is None:
 110         # Cache misses
 111         _cache[encoding] = None
 112         return None
 113
 114     # Now ask the module for the registry entry
 115     entry = tuple(getregentry())
 116     if len(entry) != 4:
 117         raise CodecRegistryError,\
 118               'module "%s" (%s) failed to register' % \
 119               (mod.__name__, mod.__file__)
 120     for obj in entry:
 121         if not callable(obj):
 122             raise CodecRegistryError,\
 123                   'incompatible codecs in module "%s" (%s)' % \
 124                   (mod.__name__, mod.__file__)
 125
 126     # Cache the codec registry entry
 127     _cache[encoding] = entry
 128
 129     # Register its aliases (without overwriting previously registered
 130     # aliases)
 131     try:
 132         codecaliases = mod.getaliases()
 133     except AttributeError:
 134         pass
 135     else:
 136         for alias in codecaliases:
 137             if not _aliases.has_key(alias):
 138                 _aliases[alias] = modname
 139
 140     # Return the registry entry
 141     return entry
 142
# Register the search_function in the Python codec registry. This is
# a module-level statement, so the registration takes place as a side
# effect of importing the package.
codecs.register(search_function)