Add Gawain Bolton to Misc/ACKS for his work on base 10 integer -> string optimizations.
[python.git] / Lib / codecs.py
blob557ccf77acabd0f1897ef04dd4662c36097eb047
1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import __builtin__, sys
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError, why:
17 raise SystemError('Failed to load the builtin codecs: %s' % why)
# Names exported by ``from codecs import *``.  register, lookup,
# register_error and lookup_error are not defined in this file; they are
# presumably supplied by the ``from _codecs import *`` above.  Every public
# error handler bound at the bottom of this module is listed, including
# backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native-endianness variants, chosen from the running interpreter's
# byte order
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
73 ### Codec base classes (defining the API)
class CodecInfo(tuple):

    """ Codec details record.

        Behaves as the 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes those four entries, the
        incremental encoder/decoder factories and the codec name as
        attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries are part of the tuple value
        info = super(CodecInfo, cls).__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string values
        are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict' handling.

            Implementations may not keep state in the Codec instance.
            Use StreamCodec for codecs which have to keep state in
            order to make encoding/decoding efficient.

            Zero length input must be supported; the encoder then has
            to return an empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict' handling.

            Implementations may not keep state in the Codec instance.
            Use StreamCodec for codecs which have to keep state in
            order to make encoding/decoding efficient.

            Zero length input must be supported; the decoder then has
            to return an empty object of the output object type.

        """
        raise NotImplementedError
class IncrementalEncoder(object):
    """
    Base class for encoders that work in multiple steps: the input is
    handed piece by piece to encode(), and the encoder remembers the state
    of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors is a keyword argument selecting the error handling scheme;
        see the module docstring for a list of possible values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation does nothing.
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that must hold back part of
    their input between calls to encode().  Subclasses implement
    _buffer_encode(); the buffering itself is handled here.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever the previous call could not encode
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # a false state (e.g. the 0 from getstate()) means "nothing buffered"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
class IncrementalDecoder(object):
    """
    Base class for decoders that work in multiple steps: the input is
    handed piece by piece to decode(), and the decoder remembers the state
    of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors is a keyword argument selecting the error handling scheme;
        see the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple.  buffered_input must be a bytes object containing bytes
        that were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer representing
        the state of the decoder WITHOUT yet having processed the contents
        of buffered_input.  In the initial state and after reset(),
        getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation does nothing.
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must be able to handle
    incomplete byte sequences.  Subclasses implement _buffer_decode(); the
    buffering of undecoded input is handled here.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever the previous call could not decode
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        pending = self.buffer
        return (pending, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state info
        # is ignored
        pending = state[0]
        self.buffer = pending
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors is a keyword argument selecting the error handling
            scheme. These parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further parameter values can be made available via
            register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        # the consumed length reported by the codec is not used
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the list of strings and send the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.
        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream instead.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        # context manager support: the writer is its own target
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # raw data read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already split lines cached by readline(); while it is
        # set, charbuffer is None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec; expected to return a
        # (decoded object, length consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # Can the request be satisfied from the character buffer?
            # Only a bounded request (chars or size given) can be; an
            # unbounded read must keep going until the stream is exhausted,
            # otherwise read() after readline() would hand back only the
            # buffered leftovers.
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the data in front of the error and
                    # accept it if it holds at least one complete line.
                    newchars, decodedbytes = self.decode(data[:exc.start],
                                                         self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line terminator is stripped from
            the returned line.
        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # no line end seen yet: ask for more data next time, up to a cap
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries (unless
            keepends is false).

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.
        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.
        """
        self.bytebuffer = ""
        # Same empty value as in __init__, so that str->str codecs are not
        # forced into unicode after a reset.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        # Reposition first: if the stream refuses to seek, the buffered
        # data is still intact when the exception propagates.
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
633 class StreamReaderWriter:
635 """ StreamReaderWriter instances allow wrapping streams which
636 work in both read and write modes.
638 The design is such that one can use the factory functions
639 returned by the codec.lookup() function to construct the
640 instance.
643 # Optional attributes set by the file wrappers below
644 encoding = 'unknown'
646 def __init__(self, stream, Reader, Writer, errors='strict'):
648 """ Creates a StreamReaderWriter instance.
650 stream must be a Stream-like object.
652 Reader, Writer must be factory functions or classes
653 providing the StreamReader, StreamWriter interface resp.
655 Error handling is done in the same way as defined for the
656 StreamWriter/Readers.
659 self.stream = stream
660 self.reader = Reader(stream, errors)
661 self.writer = Writer(stream, errors)
662 self.errors = errors
664 def read(self, size=-1):
666 return self.reader.read(size)
668 def readline(self, size=None):
670 return self.reader.readline(size)
672 def readlines(self, sizehint=None):
674 return self.reader.readlines(sizehint)
676 def next(self):
678 """ Return the next decoded line from the input stream."""
679 return self.reader.next()
681 def __iter__(self):
682 return self
684 def write(self, data):
686 return self.writer.write(data)
688 def writelines(self, list):
690 return self.writer.writelines(list)
692 def reset(self):
694 self.reader.reset()
695 self.writer.reset()
697 def __getattr__(self, name,
698 getattr=getattr):
700 """ Inherit all other methods from the underlying stream.
702 return getattr(self.stream, name)
704 # these are needed to make "with codecs.open(...)" work properly
706 def __enter__(self):
707 return self
709 def __exit__(self, type, value, tb):
710 self.stream.close()
714 class StreamRecoder:
716 """ StreamRecoder instances provide a frontend - backend
717 view of encoding data.
719 They use the complete set of APIs returned by the
720 codecs.lookup() function to implement their task.
722 Data written to the stream is first decoded into an
723 intermediate format (which is dependent on the given codec
724 combination) and then written to the stream using an instance
725 of the provided Writer class.
727 In the other direction, data is read from the stream using a
728 Reader instance and then return encoded data to the caller.
731 # Optional attributes set by the file wrappers below
732 data_encoding = 'unknown'
733 file_encoding = 'unknown'
735 def __init__(self, stream, encode, decode, Reader, Writer,
736 errors='strict'):
738 """ Creates a StreamRecoder instance which implements a two-way
739 conversion: encode and decode work on the frontend (the
740 input to .read() and output of .write()) while
741 Reader and Writer work on the backend (reading and
742 writing to the stream).
744 You can use these objects to do transparent direct
745 recodings from e.g. latin-1 to utf-8 and back.
747 stream must be a file-like object.
749 encode, decode must adhere to the Codec interface, Reader,
750 Writer must be factory functions or classes providing the
751 StreamReader, StreamWriter interface resp.
753 encode and decode are needed for the frontend translation,
754 Reader and Writer for the backend translation. Unicode is
755 used as intermediate encoding.
757 Error handling is done in the same way as defined for the
758 StreamWriter/Readers.
761 self.stream = stream
762 self.encode = encode
763 self.decode = decode
764 self.reader = Reader(stream, errors)
765 self.writer = Writer(stream, errors)
766 self.errors = errors
768 def read(self, size=-1):
770 data = self.reader.read(size)
771 data, bytesencoded = self.encode(data, self.errors)
772 return data
774 def readline(self, size=None):
776 if size is None:
777 data = self.reader.readline()
778 else:
779 data = self.reader.readline(size)
780 data, bytesencoded = self.encode(data, self.errors)
781 return data
783 def readlines(self, sizehint=None):
785 data = self.reader.read()
786 data, bytesencoded = self.encode(data, self.errors)
787 return data.splitlines(1)
789 def next(self):
791 """ Return the next decoded line from the input stream."""
792 data = self.reader.next()
793 data, bytesencoded = self.encode(data, self.errors)
794 return data
796 def __iter__(self):
797 return self
799 def write(self, data):
801 data, bytesdecoded = self.decode(data, self.errors)
802 return self.writer.write(data)
804 def writelines(self, list):
806 data = ''.join(list)
807 data, bytesdecoded = self.decode(data, self.errors)
808 return self.writer.write(data)
810 def reset(self):
812 self.reader.reset()
813 self.writer.reset()
815 def __getattr__(self, name,
816 getattr=getattr):
818 """ Inherit all other methods from the underlying stream.
820 return getattr(self.stream, name)
822 def __enter__(self):
823 return self
825 def __exit__(self, type, value, tb):
826 self.stream.close()
828 ### Shortcuts
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises whatever the builtin open() raises, and a LookupError
        if the encoding is unknown; in the latter case the file that
        was just opened is closed again before the error propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately broad: whatever went wrong while building the
        # wrapper, the file opened above must not leak.  The original
        # exception is re-raised unchanged.
        file.close()
        raise
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that it provides transparent encoding
        translation, and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec: what the caller reads and writes
    frontend = lookup(data_encoding)
    # backend codec: what is stored in the file
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
910 ### Helpers for codec lookup
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # the codec exists but has no incremental encoder
    raise LookupError(encoding)
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # the codec exists but has no incremental decoder
    raise LookupError(encoding)
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds the strings produced by iterator through an IncrementalEncoder
    for encoding and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # final call so the encoder can emit anything it still holds back
    tail = encoder.encode("", True)
    if tail:
        yield tail
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds the strings produced by iterator through an IncrementalDecoder
    for encoding and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # final call so the decoder can emit anything it still holds back
    tail = decoder.decode("", True)
    if tail:
        yield tail
1016 ### Helpers for charmap-based codecs
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is mapped to itself.

    """
    return dict((item, item) for item in rng)
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps each target of the decoding map back to its
        source. If a target occurs multiple times in the decoding
        map, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # ambiguous reverse mapping: mark it as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
1052 ### error handlers
# Bind the predefined error handling callbacks to module level names.
# map() is used rather than a list comprehension so that no loop
# variable ends up in the module namespace.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error,
                                    ("strict",
                                     "ignore",
                                     "replace",
                                     "xmlcharrefreplace",
                                     "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed because _false is 0; the import statement is here
    # only so that tools scanning the source for imports find it.
    import encodings
1074 ### Tests
# When run as a script, replace the standard streams with recoding
# wrappers built by EncodedFile (data_encoding first, file_encoding
# second).
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # NOTE(review): the arguments here are data_encoding='utf-8',
    # file_encoding='latin-1', i.e. the reverse of the stdout wrapper;
    # confirm the comment above describes the intended direction.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')