1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import builtins, sys
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError as why:
17 raise SystemError('Failed to load the builtin codecs: %s' % why)
19 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
### Constants

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
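
# Illustrative example (not part of the original module): these constants can
# be used to detect and strip a byte order mark from raw input, e.g.:
#
#     raw = b'\xef\xbb\xbfhello'
#     if raw.startswith(BOM_UTF8):
#         raw = raw[len(BOM_UTF8):]
#     text = raw.decode('utf-8')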
### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))
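
# Illustrative example (not part of the original module): a codec search
# function registered via register() returns a CodecInfo bundling the codec's
# entry points for a given encoding name, e.g.:
#
#     def search(name):                      # hypothetical search function
#         if name == 'myalias':              # hypothetical encoding name
#             return lookup('iso-8859-15')   # reuse an existing CodecInfo
#         return None
#     register(search)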
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
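
# Illustrative example (not part of the original module): an incremental
# decoder keeps incomplete byte sequences in its buffer between calls, e.g.:
#
#     dec = getincrementaldecoder('utf-8')()
#     dec.decode(b'\xe2\x82')          # returns '' (incomplete sequence buffered)
#     dec.decode(b'\xac', final=True)  # returns '\u20ac'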
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
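
# Illustrative example (not part of the original module): wrapping a binary
# stream with a StreamWriter obtained via getwriter(), assuming 'utf-8':
#
#     import io
#     buf = io.BytesIO()
#     writer = getwriter('utf-8')(buf)
#     writer.write('Grüße')
#     buf.getvalue()    # b'Gr\xc3\xbc\xc3\x9fe'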
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError
    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result
    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line
    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
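
# Illustrative example (not part of the original module): a StreamReader
# obtained via getreader() decodes bytes read from a binary stream, e.g.:
#
#     import io
#     reader = getreader('utf-8')(io.BytesIO(b'line one\nline two\n'))
#     reader.readline()   # 'line one\n'
#     reader.read()       # 'line two\n'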
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and the encoded data is then returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb', meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
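
# Illustrative example (not part of the original module): open() returns a
# StreamReaderWriter that decodes on read and encodes on write:
#
#     with open('data.txt', 'w', encoding='utf-8') as f:   # hypothetical file
#         f.write('Grüße\n')
#     with open('data.txt', encoding='utf-8') as f:
#         f.read()   # 'Grüße\n'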
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
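
# Illustrative example (not part of the original module): EncodedFile() can
# transcode between two byte encodings over a binary stream, e.g.:
#
#     import io
#     backend = io.BytesIO()
#     f = EncodedFile(backend, 'utf-8', 'latin-1')
#     f.write(b'caf\xc3\xa9')    # UTF-8 bytes on the caller's side
#     backend.getvalue()         # b'caf\xe9' (Latin-1 on the backend)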
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output
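
# Illustrative example (not part of the original module): iterdecode() lazily
# decodes an iterable of byte chunks, even when a chunk boundary splits a
# multi-byte sequence:
#
#     chunks = [b'caf', b'\xc3', b'\xa9']
#     ''.join(iterdecode(chunks, 'utf-8'))   # 'café'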
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
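
# Illustrative example (not part of the original module): building the two
# maps for a tiny hypothetical charmap codec; duplicate decode targets
# collapse to None in the encoding map:
#
#     decoding_map = make_identity_dict(range(32, 127))
#     decoding_map.update({0x80: 0x20ac, 0x81: 0x20ac})   # both decode to U+20AC
#     encoding_map = make_encoding_map(decoding_map)
#     encoding_map[0x20ac]   # None (ambiguous reverse mapping)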
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
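
# Illustrative example (not part of the original module): custom handlers are
# added with register_error(); a handler receives the UnicodeError and returns
# a (replacement, resume_position) tuple:
#
#     def uescape(exc):                      # hypothetical handler
#         if isinstance(exc, UnicodeEncodeError):
#             bad = exc.object[exc.start:exc.end]
#             return ''.join('\\u%04x' % ord(c) for c in bad), exc.end
#         raise exc
#     register_error('uescape', uescape)
#     '5\u20ac'.encode('ascii', 'uescape')   # b'5\\u20ac'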
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')