Merged revisions 81656 via svnmerge from
[python/dscho.git] / Lib / encodings / __init__.py
blobd72eae93281f2ab4cf7e8a624760e576477112d5
1 """ Standard "encodings" Package
3 Standard Python encoding modules are stored in this package
4 directory.
6 Codec modules must have names corresponding to normalized encoding
7 names as defined in the normalize_encoding() function below, e.g.
8 'utf-8' must be implemented by the module 'utf_8.py'.
10 Each codec module must export the following interface:
12 * getregentry() -> codecs.CodecInfo object
13 The getregentry() API must return a CodecInfo object with encoder, decoder,
14 incrementalencoder, incrementaldecoder, streamwriter and streamreader
15 attributes which adhere to the Python Codec Interface Standard.
17 In addition, a module may optionally also define the following
18 APIs which are then used by the package's codec search function:
20 * getaliases() -> sequence of encoding name strings to use as aliases
22 Alias names returned by getaliases() must be normalized encoding
23 names as defined by normalize_encoding().
25 Written by Marc-Andre Lemburg (mal@lemburg.com).
27 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
29 """#"
31 import codecs
32 from . import aliases
# Alias table shared with the aliases module; search_function() adds the
# aliases announced by codec modules to this very dict.
_aliases = aliases.aliases

# Non-empty fromlist handed to __import__() so that it returns the
# 'encodings.<name>' submodule itself instead of the top-level package.
_import_tail = ['*']

# Sentinel meaning "this name has never been looked up"; None cannot be
# used for that because None is stored in _cache to remember a miss.
_unknown = '--unknown--'

# Requested encoding name -> CodecInfo entry, or None for a known miss.
_cache = {}
class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry."""
def normalize_encoding(encoding):
    """Return the normalized form of an encoding name.

    Alphanumeric characters and the dot (used in Python package names)
    are kept as they are.  Every run of other characters that sits
    between two kept characters is collapsed into a single underscore,
    e.g. ' -;#' becomes '_'.  Runs at the very start or end of the name
    are dropped, so the result never starts or ends with an underscore
    produced by this function.

    A bytes argument is decoded as ASCII first.  Encoding names should
    be ASCII only.
    """
    if isinstance(encoding, bytes):
        encoding = str(encoding, "ascii")

    pieces = []
    separator_pending = False
    for ch in encoding:
        if not (ch.isalnum() or ch == '.'):
            # Only remember that a separator was seen; it is emitted
            # lazily so that trailing runs never produce an underscore.
            separator_pending = True
            continue
        # Skip the underscore when nothing has been kept yet, which
        # drops leading separator runs.
        if separator_pending and pieces:
            pieces.append('_')
        separator_pending = False
        pieces.append(ch)
    return ''.join(pieces)
def search_function(encoding):
    """Codec search function for the 'encodings' package.

    The lookup proceeds as follows:

    1. Return the cached result for *encoding* if there is one (misses
       are cached as None).
    2. Normalize the name with normalize_encoding() and resolve it
       through the alias table.
    3. Import the matching 'encodings.<name>' submodule and call its
       getregentry() function.
    4. Convert a legacy tuple entry into a codecs.CodecInfo object,
       cache the result and register the aliases the module announces
       through its optional getaliases() function.

    Returns a codecs.CodecInfo object, or None if no codec module could
    be found for *encoding*.

    Raises CodecRegistryError if getregentry() returns a tuple of the
    wrong length or one whose members are not callable.
    """

    # Cache lookup.  The _unknown sentinel is needed because None is a
    # legitimate cached value (a remembered miss).
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding name and
    # look up the module using the aliased name, then fall back to the
    # normalized name itself.  Both candidates are only ever imported
    # from inside the 'encodings' package.
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding,
                    norm_encoding]
    else:
        modnames = [norm_encoding]
    for modname in modnames:
        # Dotted names would reach outside the package; skip them.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import of a
            # module with side-effects that is not in the 'encodings' package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        mod = None

    # This also covers mod being None: the attribute access fails the
    # same way as for a module that lacks getregentry().
    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        # Legacy interface: a 4- to 7-item sequence laid out in the
        # positional order of the CodecInfo constructor.
        if not 4 <= len(entry) <= 7:
            raise CodecRegistryError('module "%s" (%s) failed to register'
                                     % (mod.__name__, mod.__file__))
        if not hasattr(entry[0], '__call__') or \
           not hasattr(entry[1], '__call__') or \
           (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
           (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
           (len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \
           (len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):
            raise CodecRegistryError('incompatible codecs in module "%s" (%s)'
                                     % (mod.__name__, mod.__file__))
        if len(entry) < 7 or entry[6] is None:
            # Pad the optional slots with None and default the codec name
            # to the module's name relative to the package.  Only the
            # first six slots are kept so that a 7-item entry whose name
            # is None has that slot replaced; appending to it would yield
            # eight items and make the CodecInfo() call below fail.
            entry = (tuple(entry[:6])
                     + (None,) * (6 - len(entry))
                     + (mod.__name__.split(".", 1)[1],))
        entry = codecs.CodecInfo(*entry)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
# Register the search_function in the Python codec registry.  This runs
# as a side effect of importing the package.
codecs.register(search_function)