1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import builtins, sys
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError as why:
17 raise SystemError('Failed to load the builtin codecs: %s' % why)
19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
### Constants

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
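
# Illustrative example (not part of the original module): these constants can
# be used to detect and strip a byte order mark from raw input, e.g.:
#
#     raw = b'\xef\xbb\xbfhello'
#     if raw.startswith(BOM_UTF8):
#         raw = raw[len(BOM_UTF8):]
#     text = raw.decode('utf-8')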
### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))
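
# Illustrative example (not part of the original module): a codec search
# function registered via register() returns a CodecInfo bundling the codec's
# entry points for a given encoding name, e.g.:
#
#     def search(name):                      # hypothetical search function
#         if name == 'myalias':              # hypothetical encoding name
#             return lookup('iso-8859-15')   # reuse an existing CodecInfo
#         return None
#     register(search)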
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
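
# Illustrative example (not part of the original module): an incremental
# decoder keeps incomplete byte sequences in its buffer between calls, e.g.:
#
#     dec = getincrementaldecoder('utf-8')()
#     dec.decode(b'\xe2\x82')          # returns '' (incomplete sequence buffered)
#     dec.decode(b'\xac', final=True)  # returns '\u20ac'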
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
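
# Illustrative example (not part of the original module): wrapping a binary
# stream with a StreamWriter obtained via getwriter(), assuming 'utf-8':
#
#     import io
#     buf = io.BytesIO()
#     writer = getwriter('utf-8')(buf)
#     writer.write('Grüße')
#     buf.getvalue()    # b'Gr\xc3\xbc\xc3\x9fe'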
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError
    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result
    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line
    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
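
# Illustrative example (not part of the original module): a StreamReader
# obtained via getreader() decodes bytes read from a binary stream, e.g.:
#
#     import io
#     reader = getreader('utf-8')(io.BytesIO(b'line one\nline two\n'))
#     reader.readline()   # 'line one\n'
#     reader.read()       # 'line two\n'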
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and the encoded data is then returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb', meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
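
# Illustrative example (not part of the original module): open() returns a
# StreamReaderWriter that decodes on read and encodes on write:
#
#     with open('data.txt', 'w', encoding='utf-8') as f:   # hypothetical file
#         f.write('Grüße\n')
#     with open('data.txt', encoding='utf-8') as f:
#         f.read()   # 'Grüße\n'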
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
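
# Illustrative example (not part of the original module): EncodedFile() can
# transcode between two byte encodings over a binary stream, e.g.:
#
#     import io
#     backend = io.BytesIO()
#     f = EncodedFile(backend, 'utf-8', 'latin-1')
#     f.write(b'caf\xc3\xa9')    # UTF-8 bytes on the caller's side
#     backend.getvalue()         # b'caf\xe9' (Latin-1 on the backend)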
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output
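
# Illustrative example (not part of the original module): iterdecode() lazily
# decodes an iterable of byte chunks, even when a chunk boundary splits a
# multi-byte sequence:
#
#     chunks = [b'caf', b'\xc3', b'\xa9']
#     ''.join(iterdecode(chunks, 'utf-8'))   # 'café'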
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
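
# Illustrative example (not part of the original module): building the two
# maps for a tiny hypothetical charmap codec; duplicate decode targets
# collapse to None in the encoding map:
#
#     decoding_map = make_identity_dict(range(32, 127))
#     decoding_map.update({0x80: 0x20ac, 0x81: 0x20ac})   # both decode to U+20AC
#     encoding_map = make_encoding_map(decoding_map)
#     encoding_map[0x20ac]   # None (ambiguous reverse mapping)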
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
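
# Illustrative example (not part of the original module): custom handlers are
# added with register_error(); a handler receives the UnicodeError and returns
# a (replacement, resume_position) tuple:
#
#     def uescape(exc):                      # hypothetical handler
#         if isinstance(exc, UnicodeEncodeError):
#             bad = exc.object[exc.start:exc.end]
#             return ''.join('\\u%04x' % ord(c) for c in bad), exc.end
#         raise exc
#     register_error('uescape', uescape)
#     '5\u20ac'.encode('ascii', 'uescape')   # b'5\\u20ac'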
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')