Lib/encodings/__init__.py

   1 """ Standard "encodings" Package
   2
   3     Standard Python encoding modules are stored in this package
   4     directory.
   5
   6     Codec modules must have names corresponding to normalized encoding
   7     names as defined in the normalize_encoding() function below, e.g.
   8     'utf-8' must be implemented by the module 'utf_8.py'.
   9
  10     Each codec module must export the following interface:
  11
  12     * getregentry() -> codecs.CodecInfo object
  13     The getregentry() API must return a CodecInfo object with encoder, decoder,
  14     incrementalencoder, incrementaldecoder, streamwriter and streamreader
  15     attributes which adhere to the Python Codec Interface Standard.
  16
  17     In addition, a module may optionally also define the following
  18     APIs which are then used by the package's codec search function:
  19
  20     * getaliases() -> sequence of encoding name strings to use as aliases
  21
  22     Alias names returned by getaliases() must be normalized encoding
  23     names as defined by normalize_encoding().
  24
  25 Written by Marc-Andre Lemburg (mal@lemburg.com).
  26
  27 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  28
  29 """#"
  30
  31 import codecs
  32 from . import aliases
  33
# Maps an encoding name, exactly as passed to search_function(), to the
# registry entry found for it, or to None when the lookup failed.
_cache = {}
# Sentinel used as the default for _cache lookups so that a cached None
# (a remembered miss) can be told apart from "never looked up".
_unknown = '--unknown--'
# Passed as the fromlist argument of __import__() in search_function().
_import_tail = ['*']
# Alias table taken from the aliases module; search_function() consults it
# and adds aliases reported by codec modules via getaliases().
_aliases = aliases.aliases
  38
  39 class CodecRegistryError(LookupError, SystemError):
  40     pass
  41
  42 def normalize_encoding(encoding):
  43
  44     """ Normalize an encoding name.
  45
  46         Normalization works as follows: all non-alphanumeric
  47         characters except the dot used for Python package names are
  48         collapsed and replaced with a single underscore, e.g. '  -;#'
  49         becomes '_'. Leading and trailing underscores are removed.
  50
  51         Note that encoding names should be ASCII only; if they do use
  52         non-ASCII characters, these must be Latin-1 compatible.
  53
  54     """
  55     if isinstance(encoding, bytes):
  56         encoding = str(encoding, "ascii")
  57     chars = []
  58     punct = False
  59     for c in encoding:
  60         if c.isalnum() or c == '.':
  61             if punct and chars:
  62                 chars.append('_')
  63             chars.append(c)
  64             punct = False
  65         else:
  66             punct = True
  67     return ''.join(chars)
  68
  69 def search_function(encoding):
  70
  71     # Cache lookup
  72     entry = _cache.get(encoding, _unknown)
  73     if entry is not _unknown:
  74         return entry
  75
  76     # Import the module:
  77     #
  78     # First try to find an alias for the normalized encoding
  79     # name and lookup the module using the aliased name, then try to
  80     # lookup the module using the standard import scheme, i.e. first
  81     # try in the encodings package, then at top-level.
  82     #
  83     norm_encoding = normalize_encoding(encoding)
  84     aliased_encoding = _aliases.get(norm_encoding) or \
  85                        _aliases.get(norm_encoding.replace('.', '_'))
  86     if aliased_encoding is not None:
  87         modnames = [aliased_encoding,
  88                     norm_encoding]
  89     else:
  90         modnames = [norm_encoding]
  91     for modname in modnames:
  92         if not modname or '.' in modname:
  93             continue
  94         try:
  95             # Import is absolute to prevent the possibly malicious import of a
  96             # module with side-effects that is not in the 'encodings' package.
  97             mod = __import__('encodings.' + modname, fromlist=_import_tail,
  98                              level=0)
  99         except ImportError:
 100             pass
 101         else:
 102             break
 103     else:
 104         mod = None
 105
 106     try:
 107         getregentry = mod.getregentry
 108     except AttributeError:
 109         # Not a codec module
 110         mod = None
 111
 112     if mod is None:
 113         # Cache misses
 114         _cache[encoding] = None
 115         return None
 116
 117     # Now ask the module for the registry entry
 118     entry = getregentry()
 119     if not isinstance(entry, codecs.CodecInfo):
 120         if not 4 <= len(entry) <= 7:
 121             raise CodecRegistryError('module "%s" (%s) failed to register'
 122                                      % (mod.__name__, mod.__file__))
 123         if not hasattr(entry[0], '__call__') or \
 124            not hasattr(entry[1], '__call__') or \
 125            (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
 126            (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
 127            (len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \
 128            (len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):
 129             raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
 130                                      % (mod.__name__, mod.__file__))
 131         if len(entry)<7 or entry[6] is None:
 132             entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
 133         entry = codecs.CodecInfo(*entry)
 134
 135     # Cache the codec registry entry
 136     _cache[encoding] = entry
 137
 138     # Register its aliases (without overwriting previously registered
 139     # aliases)
 140     try:
 141         codecaliases = mod.getaliases()
 142     except AttributeError:
 143         pass
 144     else:
 145         for alias in codecaliases:
 146             if alias not in _aliases:
 147                 _aliases[alias] = modname
 148
 149     # Return the registry entry
 150     return entry
 151
# Register the search_function in the Python codec registry.  This runs
# when the package is imported.
codecs.register(search_function)