Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import __builtin__, sys
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError, why:
  17     raise SystemError('Failed to load the builtin codecs: %s' % why)
  18
# Names exported by "from codecs import *".
# NOTE(review): register, lookup, the *_errors handlers, register_error and
# lookup_error are not defined in this part of the file; presumably they are
# provided by the "from _codecs import *" above -- confirm.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
  26
  27 ### Constants
  28
  29 #
  30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
  31 # and its possible byte string values
  32 # for UTF8/UTF16/UTF32 output and little/big endian machines
  33 #
  34
  35 # UTF-8
  36 BOM_UTF8 = '\xef\xbb\xbf'
  37
  38 # UTF-16, little endian
  39 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
  40
  41 # UTF-16, big endian
  42 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
  43
  44 # UTF-32, little endian
  45 BOM_UTF32_LE = '\xff\xfe\x00\x00'
  46
  47 # UTF-32, big endian
  48 BOM_UTF32_BE = '\x00\x00\xfe\xff'
  49
  50 if sys.byteorder == 'little':
  51
  52     # UTF-16, native endianness
  53     BOM = BOM_UTF16 = BOM_UTF16_LE
  54
  55     # UTF-32, native endianness
  56     BOM_UTF32 = BOM_UTF32_LE
  57
  58 else:
  59
  60     # UTF-16, native endianness
  61     BOM = BOM_UTF16 = BOM_UTF16_BE
  62
  63     # UTF-32, native endianness
  64     BOM_UTF32 = BOM_UTF32_BE
  65
  66 # Old broken names (don't use in new code)
  67 BOM32_LE = BOM_UTF16_LE
  68 BOM32_BE = BOM_UTF16_BE
  69 BOM64_LE = BOM_UTF32_LE
  70 BOM64_BE = BOM_UTF32_BE
  71
  72
  73 ### Codec base classes (defining the API)
  74
  75 class CodecInfo(tuple):
  76
  77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
  78         incrementalencoder=None, incrementaldecoder=None, name=None):
  79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
  80         self.name = name
  81         self.encode = encode
  82         self.decode = decode
  83         self.incrementalencoder = incrementalencoder
  84         self.incrementaldecoder = incrementaldecoder
  85         self.streamwriter = streamwriter
  86         self.streamreader = streamreader
  87         return self
  88
  89     def __repr__(self):
  90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
  91
  92 class Codec:
  93
  94     """ Defines the interface for stateless encoders/decoders.
  95
  96         The .encode()/.decode() methods may use different error
  97         handling schemes by providing the errors argument. These
  98         string values are predefined:
  99
 100          'strict' - raise a ValueError error (or a subclass)
 101          'ignore' - ignore the character and continue with the next
 102          'replace' - replace with a suitable replacement character;
 103                     Python will use the official U+FFFD REPLACEMENT
 104                     CHARACTER for the builtin Unicode codecs on
 105                     decoding and '?' on encoding.
 106          'xmlcharrefreplace' - Replace with the appropriate XML
 107                                character reference (only for encoding).
 108          'backslashreplace'  - Replace with backslashed escape sequences
 109                                (only for encoding).
 110
 111         The set of allowed values can be extended via register_error.
 112
 113     """
 114     def encode(self, input, errors='strict'):
 115
 116         """ Encodes the object input and returns a tuple (output
 117             object, length consumed).
 118
 119             errors defines the error handling to apply. It defaults to
 120             'strict' handling.
 121
 122             The method may not store state in the Codec instance. Use
 123             StreamCodec for codecs which have to keep state in order to
 124             make encoding/decoding efficient.
 125
 126             The encoder must be able to handle zero length input and
 127             return an empty object of the output object type in this
 128             situation.
 129
 130         """
 131         raise NotImplementedError
 132
 133     def decode(self, input, errors='strict'):
 134
 135         """ Decodes the object input and returns a tuple (output
 136             object, length consumed).
 137
 138             input must be an object which provides the bf_getreadbuf
 139             buffer slot. Python strings, buffer objects and memory
 140             mapped files are examples of objects providing this slot.
 141
 142             errors defines the error handling to apply. It defaults to
 143             'strict' handling.
 144
 145             The method may not store state in the Codec instance. Use
 146             StreamCodec for codecs which have to keep state in order to
 147             make encoding/decoding efficient.
 148
 149             The decoder must be able to handle zero length input and
 150             return an empty object of the output object type in this
 151             situation.
 152
 153         """
 154         raise NotImplementedError
 155
 156 class IncrementalEncoder(object):
 157     """
 158     An IncrementalEncoder encodes an input in multiple steps. The input can be
 159     passed piece by piece to the encode() method. The IncrementalEncoder remembers
 160     the state of the Encoding process between calls to encode().
 161     """
 162     def __init__(self, errors='strict'):
 163         """
 164         Creates an IncrementalEncoder instance.
 165
 166         The IncrementalEncoder may use different error handling schemes by
 167         providing the errors keyword argument. See the module docstring
 168         for a list of possible values.
 169         """
 170         self.errors = errors
 171         self.buffer = ""
 172
 173     def encode(self, input, final=False):
 174         """
 175         Encodes input and returns the resulting object.
 176         """
 177         raise NotImplementedError
 178
 179     def reset(self):
 180         """
 181         Resets the encoder to the initial state.
 182         """
 183
 184     def getstate(self):
 185         """
 186         Return the current state of the encoder.
 187         """
 188         return 0
 189
 190     def setstate(self, state):
 191         """
 192         Set the current state of the encoder. state must have been
 193         returned by getstate().
 194         """
 195
 196 class BufferedIncrementalEncoder(IncrementalEncoder):
 197     """
 198     This subclass of IncrementalEncoder can be used as the baseclass for an
 199     incremental encoder if the encoder must keep some of the output in a
 200     buffer between calls to encode().
 201     """
 202     def __init__(self, errors='strict'):
 203         IncrementalEncoder.__init__(self, errors)
 204         self.buffer = "" # unencoded input that is kept between calls to encode()
 205
 206     def _buffer_encode(self, input, errors, final):
 207         # Overwrite this method in subclasses: It must encode input
 208         # and return an (output, length consumed) tuple
 209         raise NotImplementedError
 210
 211     def encode(self, input, final=False):
 212         # encode input (taking the buffer into account)
 213         data = self.buffer + input
 214         (result, consumed) = self._buffer_encode(data, self.errors, final)
 215         # keep unencoded input until the next call
 216         self.buffer = data[consumed:]
 217         return result
 218
 219     def reset(self):
 220         IncrementalEncoder.reset(self)
 221         self.buffer = ""
 222
 223     def getstate(self):
 224         return self.buffer or 0
 225
 226     def setstate(self, state):
 227         self.buffer = state or ""
 228
 229 class IncrementalDecoder(object):
 230     """
 231     An IncrementalDecoder decodes an input in multiple steps. The input can be
 232     passed piece by piece to the decode() method. The IncrementalDecoder
 233     remembers the state of the decoding process between calls to decode().
 234     """
 235     def __init__(self, errors='strict'):
 236         """
 237         Creates a IncrementalDecoder instance.
 238
 239         The IncrementalDecoder may use different error handling schemes by
 240         providing the errors keyword argument. See the module docstring
 241         for a list of possible values.
 242         """
 243         self.errors = errors
 244
 245     def decode(self, input, final=False):
 246         """
 247         Decodes input and returns the resulting object.
 248         """
 249         raise NotImplementedError
 250
 251     def reset(self):
 252         """
 253         Resets the decoder to the initial state.
 254         """
 255
 256     def getstate(self):
 257         """
 258         Return the current state of the decoder.
 259
 260         This must be a (buffered_input, additional_state_info) tuple.
 261         buffered_input must be a bytes object containing bytes that
 262         were passed to decode() that have not yet been converted.
 263         additional_state_info must be a non-negative integer
 264         representing the state of the decoder WITHOUT yet having
 265         processed the contents of buffered_input.  In the initial state
 266         and after reset(), getstate() must return (b"", 0).
 267         """
 268         return (b"", 0)
 269
 270     def setstate(self, state):
 271         """
 272         Set the current state of the decoder.
 273
 274         state must have been returned by getstate().  The effect of
 275         setstate((b"", 0)) must be equivalent to reset().
 276         """
 277
 278 class BufferedIncrementalDecoder(IncrementalDecoder):
 279     """
 280     This subclass of IncrementalDecoder can be used as the baseclass for an
 281     incremental decoder if the decoder must be able to handle incomplete byte
 282     sequences.
 283     """
 284     def __init__(self, errors='strict'):
 285         IncrementalDecoder.__init__(self, errors)
 286         self.buffer = "" # undecoded input that is kept between calls to decode()
 287
 288     def _buffer_decode(self, input, errors, final):
 289         # Overwrite this method in subclasses: It must decode input
 290         # and return an (output, length consumed) tuple
 291         raise NotImplementedError
 292
 293     def decode(self, input, final=False):
 294         # decode input (taking the buffer into account)
 295         data = self.buffer + input
 296         (result, consumed) = self._buffer_decode(data, self.errors, final)
 297         # keep undecoded input until the next call
 298         self.buffer = data[consumed:]
 299         return result
 300
 301     def reset(self):
 302         IncrementalDecoder.reset(self)
 303         self.buffer = ""
 304
 305     def getstate(self):
 306         # additional state info is always 0
 307         return (self.buffer, 0)
 308
 309     def setstate(self, state):
 310         # ignore additional state info
 311         self.buffer = state[0]
 312
 313 #
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
 318 #
 319
 320 class StreamWriter(Codec):
 321
 322     def __init__(self, stream, errors='strict'):
 323
 324         """ Creates a StreamWriter instance.
 325
 326             stream must be a file-like object open for writing
 327             (binary) data.
 328
 329             The StreamWriter may use different error handling
 330             schemes by providing the errors keyword argument. These
 331             parameters are predefined:
 332
 333              'strict' - raise a ValueError (or a subclass)
 334              'ignore' - ignore the character and continue with the next
 335              'replace'- replace with a suitable replacement character
 336              'xmlcharrefreplace' - Replace with the appropriate XML
 337                                    character reference.
 338              'backslashreplace'  - Replace with backslashed escape
 339                                    sequences (only for encoding).
 340
 341             The set of allowed parameter values can be extended via
 342             register_error.
 343         """
 344         self.stream = stream
 345         self.errors = errors
 346
 347     def write(self, object):
 348
 349         """ Writes the object's contents encoded to self.stream.
 350         """
 351         data, consumed = self.encode(object, self.errors)
 352         self.stream.write(data)
 353
 354     def writelines(self, list):
 355
 356         """ Writes the concatenated list of strings to the stream
 357             using .write().
 358         """
 359         self.write(''.join(list))
 360
 361     def reset(self):
 362
 363         """ Flushes and resets the codec buffers used for keeping state.
 364
 365             Calling this method should ensure that the data on the
 366             output is put into a clean state, that allows appending
 367             of new fresh data without having to rescan the whole
 368             stream to recover state.
 369
 370         """
 371         pass
 372
 373     def __getattr__(self, name,
 374                     getattr=getattr):
 375
 376         """ Inherit all other methods from the underlying stream.
 377         """
 378         return getattr(self.stream, name)
 379
 380     def __enter__(self):
 381         return self
 382
 383     def __exit__(self, type, value, tb):
 384         self.stream.close()
 385
 386 ###
 387
 388 class StreamReader(Codec):
 389
 390     def __init__(self, stream, errors='strict'):
 391
 392         """ Creates a StreamReader instance.
 393
 394             stream must be a file-like object open for reading
 395             (binary) data.
 396
 397             The StreamReader may use different error handling
 398             schemes by providing the errors keyword argument. These
 399             parameters are predefined:
 400
 401              'strict' - raise a ValueError (or a subclass)
 402              'ignore' - ignore the character and continue with the next
 403              'replace'- replace with a suitable replacement character;
 404
 405             The set of allowed parameter values can be extended via
 406             register_error.
 407         """
 408         self.stream = stream
 409         self.errors = errors
 410         self.bytebuffer = ""
 411         # For str->str decoding this will stay a str
 412         # For str->unicode decoding the first read will promote it to unicode
 413         self.charbuffer = ""
 414         self.linebuffer = None
 415
 416     def decode(self, input, errors='strict'):
 417         raise NotImplementedError
 418
 419     def read(self, size=-1, chars=-1, firstline=False):
 420
 421         """ Decodes data from the stream self.stream and returns the
 422             resulting object.
 423
 424             chars indicates the number of characters to read from the
 425             stream. read() will never return more than chars
 426             characters, but it might return less, if there are not enough
 427             characters available.
 428
 429             size indicates the approximate maximum number of bytes to
 430             read from the stream for decoding purposes. The decoder
 431             can modify this setting as appropriate. The default value
 432             -1 indicates to read and decode as much as possible.  size
 433             is intended to prevent having to decode huge files in one
 434             step.
 435
 436             If firstline is true, and a UnicodeDecodeError happens
 437             after the first line terminator in the input only the first line
 438             will be returned, the rest of the input will be kept until the
 439             next call to read().
 440
 441             The method should use a greedy read strategy meaning that
 442             it should read as much data as is allowed within the
 443             definition of the encoding and the given size, e.g.  if
 444             optional encoding endings or state markers are available
 445             on the stream, these should be read too.
 446         """
 447         # If we have lines cached, first merge them back into characters
 448         if self.linebuffer:
 449             self.charbuffer = "".join(self.linebuffer)
 450             self.linebuffer = None
 451
 452         # read until we get the required number of characters (if available)
 453         while True:
 454             # can the request can be satisfied from the character buffer?
 455             if chars < 0:
 456                 if size < 0:
 457                     if self.charbuffer:
 458                         break
 459                 elif len(self.charbuffer) >= size:
 460                     break
 461             else:
 462                 if len(self.charbuffer) >= chars:
 463                     break
 464             # we need more data
 465             if size < 0:
 466                 newdata = self.stream.read()
 467             else:
 468                 newdata = self.stream.read(size)
 469             # decode bytes (those remaining from the last call included)
 470             data = self.bytebuffer + newdata
 471             try:
 472                 newchars, decodedbytes = self.decode(data, self.errors)
 473             except UnicodeDecodeError, exc:
 474                 if firstline:
 475                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
 476                     lines = newchars.splitlines(True)
 477                     if len(lines)<=1:
 478                         raise
 479                 else:
 480                     raise
 481             # keep undecoded bytes until the next call
 482             self.bytebuffer = data[decodedbytes:]
 483             # put new characters in the character buffer
 484             self.charbuffer += newchars
 485             # there was no data available
 486             if not newdata:
 487                 break
 488         if chars < 0:
 489             # Return everything we've got
 490             result = self.charbuffer
 491             self.charbuffer = ""
 492         else:
 493             # Return the first chars characters
 494             result = self.charbuffer[:chars]
 495             self.charbuffer = self.charbuffer[chars:]
 496         return result
 497
 498     def readline(self, size=None, keepends=True):
 499
 500         """ Read one line from the input stream and return the
 501             decoded data.
 502
 503             size, if given, is passed as size argument to the
 504             read() method.
 505
 506         """
 507         # If we have lines cached from an earlier read, return
 508         # them unconditionally
 509         if self.linebuffer:
 510             line = self.linebuffer[0]
 511             del self.linebuffer[0]
 512             if len(self.linebuffer) == 1:
 513                 # revert to charbuffer mode; we might need more data
 514                 # next time
 515                 self.charbuffer = self.linebuffer[0]
 516                 self.linebuffer = None
 517             if not keepends:
 518                 line = line.splitlines(False)[0]
 519             return line
 520
 521         readsize = size or 72
 522         line = ""
 523         # If size is given, we call read() only once
 524         while True:
 525             data = self.read(readsize, firstline=True)
 526             if data:
 527                 # If we're at a "\r" read one extra character (which might
 528                 # be a "\n") to get a proper line ending. If the stream is
 529                 # temporarily exhausted we return the wrong line ending.
 530                 if data.endswith("\r"):
 531                     data += self.read(size=1, chars=1)
 532
 533             line += data
 534             lines = line.splitlines(True)
 535             if lines:
 536                 if len(lines) > 1:
 537                     # More than one line result; the first line is a full line
 538                     # to return
 539                     line = lines[0]
 540                     del lines[0]
 541                     if len(lines) > 1:
 542                         # cache the remaining lines
 543                         lines[-1] += self.charbuffer
 544                         self.linebuffer = lines
 545                         self.charbuffer = None
 546                     else:
 547                         # only one remaining line, put it back into charbuffer
 548                         self.charbuffer = lines[0] + self.charbuffer
 549                     if not keepends:
 550                         line = line.splitlines(False)[0]
 551                     break
 552                 line0withend = lines[0]
 553                 line0withoutend = lines[0].splitlines(False)[0]
 554                 if line0withend != line0withoutend: # We really have a line end
 555                     # Put the rest back together and keep it until the next call
 556                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
 557                     if keepends:
 558                         line = line0withend
 559                     else:
 560                         line = line0withoutend
 561                     break
 562             # we didn't get anything or this was our only try
 563             if not data or size is not None:
 564                 if line and not keepends:
 565                     line = line.splitlines(False)[0]
 566                 break
 567             if readsize<8000:
 568                 readsize *= 2
 569         return line
 570
 571     def readlines(self, sizehint=None, keepends=True):
 572
 573         """ Read all lines available on the input stream
 574             and return them as list of lines.
 575
 576             Line breaks are implemented using the codec's decoder
 577             method and are included in the list entries.
 578
 579             sizehint, if given, is ignored since there is no efficient
 580             way to finding the true end-of-line.
 581
 582         """
 583         data = self.read()
 584         return data.splitlines(keepends)
 585
 586     def reset(self):
 587
 588         """ Resets the codec buffers used for keeping state.
 589
 590             Note that no stream repositioning should take place.
 591             This method is primarily intended to be able to recover
 592             from decoding errors.
 593
 594         """
 595         self.bytebuffer = ""
 596         self.charbuffer = u""
 597         self.linebuffer = None
 598
 599     def seek(self, offset, whence=0):
 600         """ Set the input stream's current position.
 601
 602             Resets the codec buffers used for keeping state.
 603         """
 604         self.reset()
 605         self.stream.seek(offset, whence)
 606
 607     def next(self):
 608
 609         """ Return the next decoded line from the input stream."""
 610         line = self.readline()
 611         if line:
 612             return line
 613         raise StopIteration
 614
 615     def __iter__(self):
 616         return self
 617
 618     def __getattr__(self, name,
 619                     getattr=getattr):
 620
 621         """ Inherit all other methods from the underlying stream.
 622         """
 623         return getattr(self.stream, name)
 624
 625     def __enter__(self):
 626         return self
 627
 628     def __exit__(self, type, value, tb):
 629         self.stream.close()
 630
 631 ###
 632
 633 class StreamReaderWriter:
 634
 635     """ StreamReaderWriter instances allow wrapping streams which
 636         work in both read and write modes.
 637
 638         The design is such that one can use the factory functions
 639         returned by the codec.lookup() function to construct the
 640         instance.
 641
 642     """
 643     # Optional attributes set by the file wrappers below
 644     encoding = 'unknown'
 645
 646     def __init__(self, stream, Reader, Writer, errors='strict'):
 647
 648         """ Creates a StreamReaderWriter instance.
 649
 650             stream must be a Stream-like object.
 651
 652             Reader, Writer must be factory functions or classes
 653             providing the StreamReader, StreamWriter interface resp.
 654
 655             Error handling is done in the same way as defined for the
 656             StreamWriter/Readers.
 657
 658         """
 659         self.stream = stream
 660         self.reader = Reader(stream, errors)
 661         self.writer = Writer(stream, errors)
 662         self.errors = errors
 663
 664     def read(self, size=-1):
 665
 666         return self.reader.read(size)
 667
 668     def readline(self, size=None):
 669
 670         return self.reader.readline(size)
 671
 672     def readlines(self, sizehint=None):
 673
 674         return self.reader.readlines(sizehint)
 675
 676     def next(self):
 677
 678         """ Return the next decoded line from the input stream."""
 679         return self.reader.next()
 680
 681     def __iter__(self):
 682         return self
 683
 684     def write(self, data):
 685
 686         return self.writer.write(data)
 687
 688     def writelines(self, list):
 689
 690         return self.writer.writelines(list)
 691
 692     def reset(self):
 693
 694         self.reader.reset()
 695         self.writer.reset()
 696
 697     def __getattr__(self, name,
 698                     getattr=getattr):
 699
 700         """ Inherit all other methods from the underlying stream.
 701         """
 702         return getattr(self.stream, name)
 703
 704     # these are needed to make "with codecs.open(...)" work properly
 705
 706     def __enter__(self):
 707         return self
 708
 709     def __exit__(self, type, value, tb):
 710         self.stream.close()
 711
 712 ###
 713
 714 class StreamRecoder:
 715
 716     """ StreamRecoder instances provide a frontend - backend
 717         view of encoding data.
 718
 719         They use the complete set of APIs returned by the
 720         codecs.lookup() function to implement their task.
 721
 722         Data written to the stream is first decoded into an
 723         intermediate format (which is dependent on the given codec
 724         combination) and then written to the stream using an instance
 725         of the provided Writer class.
 726
 727         In the other direction, data is read from the stream using a
 728         Reader instance and then return encoded data to the caller.
 729
 730     """
 731     # Optional attributes set by the file wrappers below
 732     data_encoding = 'unknown'
 733     file_encoding = 'unknown'
 734
 735     def __init__(self, stream, encode, decode, Reader, Writer,
 736                  errors='strict'):
 737
 738         """ Creates a StreamRecoder instance which implements a two-way
 739             conversion: encode and decode work on the frontend (the
 740             input to .read() and output of .write()) while
 741             Reader and Writer work on the backend (reading and
 742             writing to the stream).
 743
 744             You can use these objects to do transparent direct
 745             recodings from e.g. latin-1 to utf-8 and back.
 746
 747             stream must be a file-like object.
 748
 749             encode, decode must adhere to the Codec interface, Reader,
 750             Writer must be factory functions or classes providing the
 751             StreamReader, StreamWriter interface resp.
 752
 753             encode and decode are needed for the frontend translation,
 754             Reader and Writer for the backend translation. Unicode is
 755             used as intermediate encoding.
 756
 757             Error handling is done in the same way as defined for the
 758             StreamWriter/Readers.
 759
 760         """
 761         self.stream = stream
 762         self.encode = encode
 763         self.decode = decode
 764         self.reader = Reader(stream, errors)
 765         self.writer = Writer(stream, errors)
 766         self.errors = errors
 767
 768     def read(self, size=-1):
 769
 770         data = self.reader.read(size)
 771         data, bytesencoded = self.encode(data, self.errors)
 772         return data
 773
 774     def readline(self, size=None):
 775
 776         if size is None:
 777             data = self.reader.readline()
 778         else:
 779             data = self.reader.readline(size)
 780         data, bytesencoded = self.encode(data, self.errors)
 781         return data
 782
 783     def readlines(self, sizehint=None):
 784
 785         data = self.reader.read()
 786         data, bytesencoded = self.encode(data, self.errors)
 787         return data.splitlines(1)
 788
 789     def next(self):
 790
 791         """ Return the next decoded line from the input stream."""
 792         data = self.reader.next()
 793         data, bytesencoded = self.encode(data, self.errors)
 794         return data
 795
 796     def __iter__(self):
 797         return self
 798
 799     def write(self, data):
 800
 801         data, bytesdecoded = self.decode(data, self.errors)
 802         return self.writer.write(data)
 803
 804     def writelines(self, list):
 805
 806         data = ''.join(list)
 807         data, bytesdecoded = self.decode(data, self.errors)
 808         return self.writer.write(data)
 809
 810     def reset(self):
 811
 812         self.reader.reset()
 813         self.writer.reset()
 814
 815     def __getattr__(self, name,
 816                     getattr=getattr):
 817
 818         """ Inherit all other methods from the underlying stream.
 819         """
 820         return getattr(self.stream, name)
 821
 822     def __enter__(self):
 823         return self
 824
 825     def __exit__(self, type, value, tb):
 826         self.stream.close()
 827
 828 ### Shortcuts
 829
 830 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 831
 832     """ Open an encoded file using the given mode and return
 833         a wrapped version providing transparent encoding/decoding.
 834
 835         Note: The wrapped version will only accept the object format
 836         defined by the codecs, i.e. Unicode objects for most builtin
 837         codecs. Output is also codec dependent and will usually be
 838         Unicode as well.
 839
 840         Files are always opened in binary mode, even if no binary mode
 841         was specified. This is done to avoid data loss due to encodings
 842         using 8-bit values. The default file mode is 'rb' meaning to
 843         open the file in binary read mode.
 844
 845         encoding specifies the encoding which is to be used for the
 846         file.
 847
 848         errors may be given to define the error handling. It defaults
 849         to 'strict' which causes ValueErrors to be raised in case an
 850         encoding error occurs.
 851
 852         buffering has the same meaning as for the builtin open() API.
 853         It defaults to line buffered.
 854
 855         The returned wrapped file object provides an extra attribute
 856         .encoding which allows querying the used encoding. This
 857         attribute is only available if an encoding was specified as
 858         parameter.
 859
 860     """
 861     if encoding is not None and \
 862        'b' not in mode:
 863         # Force opening of the file in binary mode
 864         mode = mode + 'b'
 865     file = __builtin__.open(filename, mode, buffering)
 866     if encoding is None:
 867         return file
 868     info = lookup(encoding)
 869     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
 870     # Add attributes to simplify introspection
 871     srw.encoding = encoding
 872     return srw
 873
 874 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 875
 876     """ Return a wrapped version of file which provides transparent
 877         encoding translation.
 878
 879         Strings written to the wrapped file are interpreted according
 880         to the given data_encoding and then written to the original
 881         file as string using file_encoding. The intermediate encoding
 882         will usually be Unicode but depends on the specified codecs.
 883
 884         Strings are read from the file using file_encoding and then
 885         passed back to the caller as string using data_encoding.
 886
 887         If file_encoding is not given, it defaults to data_encoding.
 888
 889         errors may be given to define the error handling. It defaults
 890         to 'strict' which causes ValueErrors to be raised in case an
 891         encoding error occurs.
 892
 893         The returned wrapped file object provides two extra attributes
 894         .data_encoding and .file_encoding which reflect the given
 895         parameters of the same name. The attributes can be used for
 896         introspection by Python programs.
 897
 898     """
 899     if file_encoding is None:
 900         file_encoding = data_encoding
 901     data_info = lookup(data_encoding)
 902     file_info = lookup(file_encoding)
 903     sr = StreamRecoder(file, data_info.encode, data_info.decode,
 904                        file_info.streamreader, file_info.streamwriter, errors)
 905     # Add attributes to simplify introspection
 906     sr.data_encoding = data_encoding
 907     sr.file_encoding = file_encoding
 908     return sr
 909
 910 ### Helpers for codec lookup
 911
 912 def getencoder(encoding):
 913
 914     """ Lookup up the codec for the given encoding and return
 915         its encoder function.
 916
 917         Raises a LookupError in case the encoding cannot be found.
 918
 919     """
 920     return lookup(encoding).encode
 921
 922 def getdecoder(encoding):
 923
 924     """ Lookup up the codec for the given encoding and return
 925         its decoder function.
 926
 927         Raises a LookupError in case the encoding cannot be found.
 928
 929     """
 930     return lookup(encoding).decode
 931
 932 def getincrementalencoder(encoding):
 933
 934     """ Lookup up the codec for the given encoding and return
 935         its IncrementalEncoder class or factory function.
 936
 937         Raises a LookupError in case the encoding cannot be found
 938         or the codecs doesn't provide an incremental encoder.
 939
 940     """
 941     encoder = lookup(encoding).incrementalencoder
 942     if encoder is None:
 943         raise LookupError(encoding)
 944     return encoder
 945
 946 def getincrementaldecoder(encoding):
 947
 948     """ Lookup up the codec for the given encoding and return
 949         its IncrementalDecoder class or factory function.
 950
 951         Raises a LookupError in case the encoding cannot be found
 952         or the codecs doesn't provide an incremental decoder.
 953
 954     """
 955     decoder = lookup(encoding).incrementaldecoder
 956     if decoder is None:
 957         raise LookupError(encoding)
 958     return decoder
 959
 960 def getreader(encoding):
 961
 962     """ Lookup up the codec for the given encoding and return
 963         its StreamReader class or factory function.
 964
 965         Raises a LookupError in case the encoding cannot be found.
 966
 967     """
 968     return lookup(encoding).streamreader
 969
 970 def getwriter(encoding):
 971
 972     """ Lookup up the codec for the given encoding and return
 973         its StreamWriter class or factory function.
 974
 975         Raises a LookupError in case the encoding cannot be found.
 976
 977     """
 978     return lookup(encoding).streamwriter
 979
 980 def iterencode(iterator, encoding, errors='strict', **kwargs):
 981     """
 982     Encoding iterator.
 983
 984     Encodes the input strings from the iterator using a IncrementalEncoder.
 985
 986     errors and kwargs are passed through to the IncrementalEncoder
 987     constructor.
 988     """
 989     encoder = getincrementalencoder(encoding)(errors, **kwargs)
 990     for input in iterator:
 991         output = encoder.encode(input)
 992         if output:
 993             yield output
 994     output = encoder.encode("", True)
 995     if output:
 996         yield output
 997
 998 def iterdecode(iterator, encoding, errors='strict', **kwargs):
 999     """
1000     Decoding iterator.
1001
1002     Decodes the input strings from the iterator using a IncrementalDecoder.
1003
1004     errors and kwargs are passed through to the IncrementalDecoder
1005     constructor.
1006     """
1007     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
1008     for input in iterator:
1009         output = decoder.decode(input)
1010         if output:
1011             yield output
1012     output = decoder.decode("", True)
1013     if output:
1014         yield output
1015
1016 ### Helpers for charmap-based codecs
1017
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict((item, item) for item in rng)
1030
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target that appears more than once in the decoding map
        has no unique source, so it is mapped to None (undefined
        mapping), causing an exception when encountered by the
        charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
1051
1052 ### error handlers
1053
try:
    # Bind the error handlers looked up under their names to
    # module-level names.  A single failing lookup sends control to
    # the except branch, which rebinds all five names to None.
    # NOTE(review): backslashreplace_errors is bound here but is not
    # listed in __all__ at the top of the file.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
1067
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below never runs because _false is 0; it is
# only here so that an import statement for encodings appears in
# this module's code.
_false = 0
if _false:
    import encodings
1073
1074 ### Tests
1075
if __name__ == '__main__':

    # When run as a script, replace the standard streams with
    # recoding wrappers built by EncodedFile.

    # Make stdout translate Latin-1 output into UTF-8 output:
    # strings written are interpreted as Latin-1 and reach the real
    # stdout as UTF-8.
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data is read from the real stdin as Latin-1 and handed to the
    # caller as UTF-8.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')