1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import __builtin__, sys
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError, why:
17 raise SystemError('Failed to load the builtin codecs: %s' % why)
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
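
# A minimal caller-side sketch (assumes "import codecs"; the sample byte
# string is purely illustrative) of using these constants to sniff a byte
# order mark off raw data:
#
#   raw = '\xef\xbb\xbfhello'
#   if raw.startswith(codecs.BOM_UTF8):
#       text = raw[len(codecs.BOM_UTF8):].decode('utf-8')
#   elif raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
#       text = raw.decode('utf-16')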

### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % (
            self.__class__.__module__, self.__class__.__name__,
            self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
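
# A short caller-side illustration (not part of the module) of the error
# handling schemes listed above, using the builtin codecs:
#
#   u'\u20ac'.encode('ascii', 'xmlcharrefreplace')  # -> '&#8364;'
#   u'\u20ac'.encode('ascii', 'backslashreplace')   # -> '\\u20ac'
#   u'\u20ac'.encode('ascii', 'replace')            # -> '?'
#   '\xff'.decode('utf-8', 'replace')               # -> u'\ufffd'
#   '\xff'.decode('utf-8', 'ignore')                # -> u''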

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = "" # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = "" # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
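
# A minimal caller-side sketch (assumes "import codecs") of an incremental
# decoder consuming data that arrives in arbitrary chunks; the UTF-8
# sequence for U+20AC is deliberately split across the two calls:
#
#   dec = codecs.getincrementaldecoder('utf-8')()
#   dec.decode('\xe2\x82')              # -> u'' (incomplete, buffered)
#   dec.decode('\xacok', final=True)    # -> u'\u20acok'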

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
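
# A minimal caller-side sketch (assumes "import codecs"; the file name is
# hypothetical) of wrapping a binary stream with a StreamWriter:
#
#   f = codecs.getwriter('utf-8')(open('out.txt', 'wb'))
#   f.write(u'caf\xe9\n')    # unicode in, UTF-8 bytes out on the stream
#   f.close()                # forwarded to the underlying stream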

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return fewer, if there are not
            enough characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to find the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
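
# A minimal caller-side sketch (assumes "import codecs"; the file name is
# hypothetical) of wrapping a binary stream with a StreamReader:
#
#   f = codecs.getreader('utf-8')(open('in.txt', 'rb'))
#   first = f.readline()     # a unicode object, decoded from UTF-8
#   rest = f.read()          # greedy read of the remaining data
#   f.close()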

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and the encoded data is then returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as the intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
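
# A minimal caller-side sketch (assumes "import codecs"; the file name is
# hypothetical) of reading and writing an encoded file:
#
#   f = codecs.open('data.txt', 'wb', encoding='utf-8', errors='replace')
#   f.write(u'caf\xe9\n')
#   f.close()
#   f = codecs.open('data.txt', 'rb', encoding='utf-8')
#   text = f.read()          # a unicode object
#   f.close()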

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as a string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as a string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
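
# A minimal caller-side sketch (assumes "import codecs"; the file name is
# hypothetical): wrap a UTF-8 encoded file so the caller reads and writes
# Latin-1 byte strings instead:
#
#   f = codecs.EncodedFile(open('data.txt', 'rb+'), 'latin-1', 'utf-8')
#   latin1 = f.read()        # file bytes decoded from UTF-8, returned as Latin-1
#   f.write('caf\xe9')       # Latin-1 input, stored as UTF-8 in the file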

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode("", True)
    if output:
        yield output
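
# A minimal caller-side sketch (assumes "import codecs") round-tripping a
# couple of chunks through the incremental codec machinery:
#
#   chunks = [u'caf\xe9 ', u'\u20ac5']
#   encoded = list(codecs.iterencode(chunks, 'utf-8'))
#   decoded = u''.join(codecs.iterdecode(encoded, 'utf-8'))  # -> u'caf\xe9 \u20ac5'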

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
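
# A minimal caller-side sketch (assumes "import codecs") of how these
# helpers are used by charmap codecs; the tiny decoding map is purely
# illustrative:
#
#   decoding_map = codecs.make_identity_dict(range(32, 127))
#   decoding_map.update({0x80: 0x20ac})    # map byte 0x80 to U+20AC
#   encoding_map = codecs.make_encoding_map(decoding_map)
#   encoding_map[0x20ac]                   # -> 0x80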

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')