Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import __builtin__, sys
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError, why:
  17     raise SystemError('Failed to load the builtin codecs: %s' % why)
  18
# Names exported by "from codecs import *". The registry functions and
# error handlers listed here are not defined in this file; they come from
# the "from _codecs import *" above.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
  26
  27 ### Constants
  28
  29 #
  30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
  31 # and its possible byte string values
  32 # for UTF8/UTF16/UTF32 output and little/big endian machines
  33 #
  34
# UTF-8 encoded BOM (U+FEFF)
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian (BOM_LE is the short alias)
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian (BOM_BE is the short alias)
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# The "native" names are bound to whichever variant matches the byte
# order reported by sys.byteorder for the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).
# Despite the "32"/"64" in their names, BOM32_* are aliases for the UTF-16
# marks and BOM64_* are aliases for the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
  71
  72
  73 ### Codec base classes (defining the API)
  74
  75 class CodecInfo(tuple):
  76
  77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
  78         incrementalencoder=None, incrementaldecoder=None, name=None):
  79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
  80         self.name = name
  81         self.encode = encode
  82         self.decode = decode
  83         self.incrementalencoder = incrementalencoder
  84         self.incrementaldecoder = incrementaldecoder
  85         self.streamwriter = streamwriter
  86         self.streamreader = streamreader
  87         return self
  88
  89     def __repr__(self):
  90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
  91
  92 class Codec:
  93
  94     """ Defines the interface for stateless encoders/decoders.
  95
  96         The .encode()/.decode() methods may use different error
  97         handling schemes by providing the errors argument. These
  98         string values are predefined:
  99
 100          'strict' - raise a ValueError error (or a subclass)
 101          'ignore' - ignore the character and continue with the next
 102          'replace' - replace with a suitable replacement character;
 103                     Python will use the official U+FFFD REPLACEMENT
 104                     CHARACTER for the builtin Unicode codecs on
 105                     decoding and '?' on encoding.
 106          'xmlcharrefreplace' - Replace with the appropriate XML
 107                                character reference (only for encoding).
 108          'backslashreplace'  - Replace with backslashed escape sequences
 109                                (only for encoding).
 110
 111         The set of allowed values can be extended via register_error.
 112
 113     """
 114     def encode(self, input, errors='strict'):
 115
 116         """ Encodes the object input and returns a tuple (output
 117             object, length consumed).
 118
 119             errors defines the error handling to apply. It defaults to
 120             'strict' handling.
 121
 122             The method may not store state in the Codec instance. Use
 123             StreamCodec for codecs which have to keep state in order to
 124             make encoding/decoding efficient.
 125
 126             The encoder must be able to handle zero length input and
 127             return an empty object of the output object type in this
 128             situation.
 129
 130         """
 131         raise NotImplementedError
 132
 133     def decode(self, input, errors='strict'):
 134
 135         """ Decodes the object input and returns a tuple (output
 136             object, length consumed).
 137
 138             input must be an object which provides the bf_getreadbuf
 139             buffer slot. Python strings, buffer objects and memory
 140             mapped files are examples of objects providing this slot.
 141
 142             errors defines the error handling to apply. It defaults to
 143             'strict' handling.
 144
 145             The method may not store state in the Codec instance. Use
 146             StreamCodec for codecs which have to keep state in order to
 147             make encoding/decoding efficient.
 148
 149             The decoder must be able to handle zero length input and
 150             return an empty object of the output object type in this
 151             situation.
 152
 153         """
 154         raise NotImplementedError
 155
 156 class IncrementalEncoder(object):
 157     """
 158     An IncrementalEncoder encodes an input in multiple steps. The input can be
 159     passed piece by piece to the encode() method. The IncrementalEncoder remembers
 160     the state of the Encoding process between calls to encode().
 161     """
 162     def __init__(self, errors='strict'):
 163         """
 164         Creates an IncrementalEncoder instance.
 165
 166         The IncrementalEncoder may use different error handling schemes by
 167         providing the errors keyword argument. See the module docstring
 168         for a list of possible values.
 169         """
 170         self.errors = errors
 171         self.buffer = ""
 172
 173     def encode(self, input, final=False):
 174         """
 175         Encodes input and returns the resulting object.
 176         """
 177         raise NotImplementedError
 178
 179     def reset(self):
 180         """
 181         Resets the encoder to the initial state.
 182         """
 183
 184 class BufferedIncrementalEncoder(IncrementalEncoder):
 185     """
 186     This subclass of IncrementalEncoder can be used as the baseclass for an
 187     incremental encoder if the encoder must keep some of the output in a
 188     buffer between calls to encode().
 189     """
 190     def __init__(self, errors='strict'):
 191         IncrementalEncoder.__init__(self, errors)
 192         self.buffer = "" # unencoded input that is kept between calls to encode()
 193
 194     def _buffer_encode(self, input, errors, final):
 195         # Overwrite this method in subclasses: It must encode input
 196         # and return an (output, length consumed) tuple
 197         raise NotImplementedError
 198
 199     def encode(self, input, final=False):
 200         # encode input (taking the buffer into account)
 201         data = self.buffer + input
 202         (result, consumed) = self._buffer_encode(data, self.errors, final)
 203         # keep unencoded input until the next call
 204         self.buffer = data[consumed:]
 205         return result
 206
 207     def reset(self):
 208         IncrementalEncoder.reset(self)
 209         self.buffer = ""
 210
 211 class IncrementalDecoder(object):
 212     """
 213     An IncrementalDecoder decodes an input in multiple steps. The input can be
 214     passed piece by piece to the decode() method. The IncrementalDecoder
 215     remembers the state of the decoding process between calls to decode().
 216     """
 217     def __init__(self, errors='strict'):
 218         """
 219         Creates a IncrementalDecoder instance.
 220
 221         The IncrementalDecoder may use different error handling schemes by
 222         providing the errors keyword argument. See the module docstring
 223         for a list of possible values.
 224         """
 225         self.errors = errors
 226
 227     def decode(self, input, final=False):
 228         """
 229         Decodes input and returns the resulting object.
 230         """
 231         raise NotImplementedError
 232
 233     def reset(self):
 234         """
 235         Resets the decoder to the initial state.
 236         """
 237
 238 class BufferedIncrementalDecoder(IncrementalDecoder):
 239     """
 240     This subclass of IncrementalDecoder can be used as the baseclass for an
 241     incremental decoder if the decoder must be able to handle incomplete byte
 242     sequences.
 243     """
 244     def __init__(self, errors='strict'):
 245         IncrementalDecoder.__init__(self, errors)
 246         self.buffer = "" # undecoded input that is kept between calls to decode()
 247
 248     def _buffer_decode(self, input, errors, final):
 249         # Overwrite this method in subclasses: It must decode input
 250         # and return an (output, length consumed) tuple
 251         raise NotImplementedError
 252
 253     def decode(self, input, final=False):
 254         # decode input (taking the buffer into account)
 255         data = self.buffer + input
 256         (result, consumed) = self._buffer_decode(data, self.errors, final)
 257         # keep undecoded input until the next call
 258         self.buffer = data[consumed:]
 259         return result
 260
 261     def reset(self):
 262         IncrementalDecoder.reset(self)
 263         self.buffer = ""
 264
 265 #
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
 270 #
 271
 272 class StreamWriter(Codec):
 273
 274     def __init__(self, stream, errors='strict'):
 275
 276         """ Creates a StreamWriter instance.
 277
 278             stream must be a file-like object open for writing
 279             (binary) data.
 280
 281             The StreamWriter may use different error handling
 282             schemes by providing the errors keyword argument. These
 283             parameters are predefined:
 284
 285              'strict' - raise a ValueError (or a subclass)
 286              'ignore' - ignore the character and continue with the next
 287              'replace'- replace with a suitable replacement character
 288              'xmlcharrefreplace' - Replace with the appropriate XML
 289                                    character reference.
 290              'backslashreplace'  - Replace with backslashed escape
 291                                    sequences (only for encoding).
 292
 293             The set of allowed parameter values can be extended via
 294             register_error.
 295         """
 296         self.stream = stream
 297         self.errors = errors
 298
 299     def write(self, object):
 300
 301         """ Writes the object's contents encoded to self.stream.
 302         """
 303         data, consumed = self.encode(object, self.errors)
 304         self.stream.write(data)
 305
 306     def writelines(self, list):
 307
 308         """ Writes the concatenated list of strings to the stream
 309             using .write().
 310         """
 311         self.write(''.join(list))
 312
 313     def reset(self):
 314
 315         """ Flushes and resets the codec buffers used for keeping state.
 316
 317             Calling this method should ensure that the data on the
 318             output is put into a clean state, that allows appending
 319             of new fresh data without having to rescan the whole
 320             stream to recover state.
 321
 322         """
 323         pass
 324
 325     def __getattr__(self, name,
 326                     getattr=getattr):
 327
 328         """ Inherit all other methods from the underlying stream.
 329         """
 330         return getattr(self.stream, name)
 331
 332 ###
 333
 334 class StreamReader(Codec):
 335
 336     def __init__(self, stream, errors='strict'):
 337
 338         """ Creates a StreamReader instance.
 339
 340             stream must be a file-like object open for reading
 341             (binary) data.
 342
 343             The StreamReader may use different error handling
 344             schemes by providing the errors keyword argument. These
 345             parameters are predefined:
 346
 347              'strict' - raise a ValueError (or a subclass)
 348              'ignore' - ignore the character and continue with the next
 349              'replace'- replace with a suitable replacement character;
 350
 351             The set of allowed parameter values can be extended via
 352             register_error.
 353         """
 354         self.stream = stream
 355         self.errors = errors
 356         self.bytebuffer = ""
 357         # For str->str decoding this will stay a str
 358         # For str->unicode decoding the first read will promote it to unicode
 359         self.charbuffer = ""
 360         self.linebuffer = None
 361
 362     def decode(self, input, errors='strict'):
 363         raise NotImplementedError
 364
 365     def read(self, size=-1, chars=-1, firstline=False):
 366
 367         """ Decodes data from the stream self.stream and returns the
 368             resulting object.
 369
 370             chars indicates the number of characters to read from the
 371             stream. read() will never return more than chars
 372             characters, but it might return less, if there are not enough
 373             characters available.
 374
 375             size indicates the approximate maximum number of bytes to
 376             read from the stream for decoding purposes. The decoder
 377             can modify this setting as appropriate. The default value
 378             -1 indicates to read and decode as much as possible.  size
 379             is intended to prevent having to decode huge files in one
 380             step.
 381
 382             If firstline is true, and a UnicodeDecodeError happens
 383             after the first line terminator in the input only the first line
 384             will be returned, the rest of the input will be kept until the
 385             next call to read().
 386
 387             The method should use a greedy read strategy meaning that
 388             it should read as much data as is allowed within the
 389             definition of the encoding and the given size, e.g.  if
 390             optional encoding endings or state markers are available
 391             on the stream, these should be read too.
 392         """
 393         # If we have lines cached, first merge them back into characters
 394         if self.linebuffer:
 395             self.charbuffer = "".join(self.linebuffer)
 396             self.linebuffer = None
 397
 398         # read until we get the required number of characters (if available)
 399         while True:
 400             # can the request can be satisfied from the character buffer?
 401             if chars < 0:
 402                 if size < 0:
 403                     if self.charbuffer:
 404                         break
 405                 elif len(self.charbuffer) >= size:
 406                     break
 407             else:
 408                 if len(self.charbuffer) >= chars:
 409                     break
 410             # we need more data
 411             if size < 0:
 412                 newdata = self.stream.read()
 413             else:
 414                 newdata = self.stream.read(size)
 415             # decode bytes (those remaining from the last call included)
 416             data = self.bytebuffer + newdata
 417             try:
 418                 newchars, decodedbytes = self.decode(data, self.errors)
 419             except UnicodeDecodeError, exc:
 420                 if firstline:
 421                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
 422                     lines = newchars.splitlines(True)
 423                     if len(lines)<=1:
 424                         raise
 425                 else:
 426                     raise
 427             # keep undecoded bytes until the next call
 428             self.bytebuffer = data[decodedbytes:]
 429             # put new characters in the character buffer
 430             self.charbuffer += newchars
 431             # there was no data available
 432             if not newdata:
 433                 break
 434         if chars < 0:
 435             # Return everything we've got
 436             result = self.charbuffer
 437             self.charbuffer = ""
 438         else:
 439             # Return the first chars characters
 440             result = self.charbuffer[:chars]
 441             self.charbuffer = self.charbuffer[chars:]
 442         return result
 443
 444     def readline(self, size=None, keepends=True):
 445
 446         """ Read one line from the input stream and return the
 447             decoded data.
 448
 449             size, if given, is passed as size argument to the
 450             read() method.
 451
 452         """
 453         # If we have lines cached from an earlier read, return
 454         # them unconditionally
 455         if self.linebuffer:
 456             line = self.linebuffer[0]
 457             del self.linebuffer[0]
 458             if len(self.linebuffer) == 1:
 459                 # revert to charbuffer mode; we might need more data
 460                 # next time
 461                 self.charbuffer = self.linebuffer[0]
 462                 self.linebuffer = None
 463             if not keepends:
 464                 line = line.splitlines(False)[0]
 465             return line
 466
 467         readsize = size or 72
 468         line = ""
 469         # If size is given, we call read() only once
 470         while True:
 471             data = self.read(readsize, firstline=True)
 472             if data:
 473                 # If we're at a "\r" read one extra character (which might
 474                 # be a "\n") to get a proper line ending. If the stream is
 475                 # temporarily exhausted we return the wrong line ending.
 476                 if data.endswith("\r"):
 477                     data += self.read(size=1, chars=1)
 478
 479             line += data
 480             lines = line.splitlines(True)
 481             if lines:
 482                 if len(lines) > 1:
 483                     # More than one line result; the first line is a full line
 484                     # to return
 485                     line = lines[0]
 486                     del lines[0]
 487                     if len(lines) > 1:
 488                         # cache the remaining lines
 489                         lines[-1] += self.charbuffer
 490                         self.linebuffer = lines
 491                         self.charbuffer = None
 492                     else:
 493                         # only one remaining line, put it back into charbuffer
 494                         self.charbuffer = lines[0] + self.charbuffer
 495                     if not keepends:
 496                         line = line.splitlines(False)[0]
 497                     break
 498                 line0withend = lines[0]
 499                 line0withoutend = lines[0].splitlines(False)[0]
 500                 if line0withend != line0withoutend: # We really have a line end
 501                     # Put the rest back together and keep it until the next call
 502                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
 503                     if keepends:
 504                         line = line0withend
 505                     else:
 506                         line = line0withoutend
 507                     break
 508             # we didn't get anything or this was our only try
 509             if not data or size is not None:
 510                 if line and not keepends:
 511                     line = line.splitlines(False)[0]
 512                 break
 513             if readsize<8000:
 514                 readsize *= 2
 515         return line
 516
 517     def readlines(self, sizehint=None, keepends=True):
 518
 519         """ Read all lines available on the input stream
 520             and return them as list of lines.
 521
 522             Line breaks are implemented using the codec's decoder
 523             method and are included in the list entries.
 524
 525             sizehint, if given, is ignored since there is no efficient
 526             way to finding the true end-of-line.
 527
 528         """
 529         data = self.read()
 530         return data.splitlines(keepends)
 531
 532     def reset(self):
 533
 534         """ Resets the codec buffers used for keeping state.
 535
 536             Note that no stream repositioning should take place.
 537             This method is primarily intended to be able to recover
 538             from decoding errors.
 539
 540         """
 541         self.bytebuffer = ""
 542         self.charbuffer = u""
 543         self.linebuffer = None
 544
 545     def seek(self, offset, whence=0):
 546         """ Set the input stream's current position.
 547
 548             Resets the codec buffers used for keeping state.
 549         """
 550         self.reset()
 551         self.stream.seek(offset, whence)
 552
 553     def next(self):
 554
 555         """ Return the next decoded line from the input stream."""
 556         line = self.readline()
 557         if line:
 558             return line
 559         raise StopIteration
 560
 561     def __iter__(self):
 562         return self
 563
 564     def __getattr__(self, name,
 565                     getattr=getattr):
 566
 567         """ Inherit all other methods from the underlying stream.
 568         """
 569         return getattr(self.stream, name)
 570
 571 ###
 572
 573 class StreamReaderWriter:
 574
 575     """ StreamReaderWriter instances allow wrapping streams which
 576         work in both read and write modes.
 577
 578         The design is such that one can use the factory functions
 579         returned by the codec.lookup() function to construct the
 580         instance.
 581
 582     """
 583     # Optional attributes set by the file wrappers below
 584     encoding = 'unknown'
 585
 586     def __init__(self, stream, Reader, Writer, errors='strict'):
 587
 588         """ Creates a StreamReaderWriter instance.
 589
 590             stream must be a Stream-like object.
 591
 592             Reader, Writer must be factory functions or classes
 593             providing the StreamReader, StreamWriter interface resp.
 594
 595             Error handling is done in the same way as defined for the
 596             StreamWriter/Readers.
 597
 598         """
 599         self.stream = stream
 600         self.reader = Reader(stream, errors)
 601         self.writer = Writer(stream, errors)
 602         self.errors = errors
 603
 604     def read(self, size=-1):
 605
 606         return self.reader.read(size)
 607
 608     def readline(self, size=None):
 609
 610         return self.reader.readline(size)
 611
 612     def readlines(self, sizehint=None):
 613
 614         return self.reader.readlines(sizehint)
 615
 616     def next(self):
 617
 618         """ Return the next decoded line from the input stream."""
 619         return self.reader.next()
 620
 621     def __iter__(self):
 622         return self
 623
 624     def write(self, data):
 625
 626         return self.writer.write(data)
 627
 628     def writelines(self, list):
 629
 630         return self.writer.writelines(list)
 631
 632     def reset(self):
 633
 634         self.reader.reset()
 635         self.writer.reset()
 636
 637     def __getattr__(self, name,
 638                     getattr=getattr):
 639
 640         """ Inherit all other methods from the underlying stream.
 641         """
 642         return getattr(self.stream, name)
 643
 644 ###
 645
 646 class StreamRecoder:
 647
 648     """ StreamRecoder instances provide a frontend - backend
 649         view of encoding data.
 650
 651         They use the complete set of APIs returned by the
 652         codecs.lookup() function to implement their task.
 653
 654         Data written to the stream is first decoded into an
 655         intermediate format (which is dependent on the given codec
 656         combination) and then written to the stream using an instance
 657         of the provided Writer class.
 658
 659         In the other direction, data is read from the stream using a
 660         Reader instance and then return encoded data to the caller.
 661
 662     """
 663     # Optional attributes set by the file wrappers below
 664     data_encoding = 'unknown'
 665     file_encoding = 'unknown'
 666
 667     def __init__(self, stream, encode, decode, Reader, Writer,
 668                  errors='strict'):
 669
 670         """ Creates a StreamRecoder instance which implements a two-way
 671             conversion: encode and decode work on the frontend (the
 672             input to .read() and output of .write()) while
 673             Reader and Writer work on the backend (reading and
 674             writing to the stream).
 675
 676             You can use these objects to do transparent direct
 677             recodings from e.g. latin-1 to utf-8 and back.
 678
 679             stream must be a file-like object.
 680
 681             encode, decode must adhere to the Codec interface, Reader,
 682             Writer must be factory functions or classes providing the
 683             StreamReader, StreamWriter interface resp.
 684
 685             encode and decode are needed for the frontend translation,
 686             Reader and Writer for the backend translation. Unicode is
 687             used as intermediate encoding.
 688
 689             Error handling is done in the same way as defined for the
 690             StreamWriter/Readers.
 691
 692         """
 693         self.stream = stream
 694         self.encode = encode
 695         self.decode = decode
 696         self.reader = Reader(stream, errors)
 697         self.writer = Writer(stream, errors)
 698         self.errors = errors
 699
 700     def read(self, size=-1):
 701
 702         data = self.reader.read(size)
 703         data, bytesencoded = self.encode(data, self.errors)
 704         return data
 705
 706     def readline(self, size=None):
 707
 708         if size is None:
 709             data = self.reader.readline()
 710         else:
 711             data = self.reader.readline(size)
 712         data, bytesencoded = self.encode(data, self.errors)
 713         return data
 714
 715     def readlines(self, sizehint=None):
 716
 717         data = self.reader.read()
 718         data, bytesencoded = self.encode(data, self.errors)
 719         return data.splitlines(1)
 720
 721     def next(self):
 722
 723         """ Return the next decoded line from the input stream."""
 724         data = self.reader.next()
 725         data, bytesencoded = self.encode(data, self.errors)
 726         return data
 727
 728     def __iter__(self):
 729         return self
 730
 731     def write(self, data):
 732
 733         data, bytesdecoded = self.decode(data, self.errors)
 734         return self.writer.write(data)
 735
 736     def writelines(self, list):
 737
 738         data = ''.join(list)
 739         data, bytesdecoded = self.decode(data, self.errors)
 740         return self.writer.write(data)
 741
 742     def reset(self):
 743
 744         self.reader.reset()
 745         self.writer.reset()
 746
 747     def __getattr__(self, name,
 748                     getattr=getattr):
 749
 750         """ Inherit all other methods from the underlying stream.
 751         """
 752         return getattr(self.stream, name)
 753
 754 ### Shortcuts
 755
 756 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 757
 758     """ Open an encoded file using the given mode and return
 759         a wrapped version providing transparent encoding/decoding.
 760
 761         Note: The wrapped version will only accept the object format
 762         defined by the codecs, i.e. Unicode objects for most builtin
 763         codecs. Output is also codec dependent and will usually be
 764         Unicode as well.
 765
 766         Files are always opened in binary mode, even if no binary mode
 767         was specified. This is done to avoid data loss due to encodings
 768         using 8-bit values. The default file mode is 'rb' meaning to
 769         open the file in binary read mode.
 770
 771         encoding specifies the encoding which is to be used for the
 772         file.
 773
 774         errors may be given to define the error handling. It defaults
 775         to 'strict' which causes ValueErrors to be raised in case an
 776         encoding error occurs.
 777
 778         buffering has the same meaning as for the builtin open() API.
 779         It defaults to line buffered.
 780
 781         The returned wrapped file object provides an extra attribute
 782         .encoding which allows querying the used encoding. This
 783         attribute is only available if an encoding was specified as
 784         parameter.
 785
 786     """
 787     if encoding is not None and \
 788        'b' not in mode:
 789         # Force opening of the file in binary mode
 790         mode = mode + 'b'
 791     file = __builtin__.open(filename, mode, buffering)
 792     if encoding is None:
 793         return file
 794     info = lookup(encoding)
 795     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
 796     # Add attributes to simplify introspection
 797     srw.encoding = encoding
 798     return srw
 799
 800 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 801
 802     """ Return a wrapped version of file which provides transparent
 803         encoding translation.
 804
 805         Strings written to the wrapped file are interpreted according
 806         to the given data_encoding and then written to the original
 807         file as string using file_encoding. The intermediate encoding
 808         will usually be Unicode but depends on the specified codecs.
 809
 810         Strings are read from the file using file_encoding and then
 811         passed back to the caller as string using data_encoding.
 812
 813         If file_encoding is not given, it defaults to data_encoding.
 814
 815         errors may be given to define the error handling. It defaults
 816         to 'strict' which causes ValueErrors to be raised in case an
 817         encoding error occurs.
 818
 819         The returned wrapped file object provides two extra attributes
 820         .data_encoding and .file_encoding which reflect the given
 821         parameters of the same name. The attributes can be used for
 822         introspection by Python programs.
 823
 824     """
 825     if file_encoding is None:
 826         file_encoding = data_encoding
 827     info = lookup(data_encoding)
 828     sr = StreamRecoder(file, info.encode, info.decode,
 829                        info.streamreader, info.streamwriter, errors)
 830     # Add attributes to simplify introspection
 831     sr.data_encoding = data_encoding
 832     sr.file_encoding = file_encoding
 833     return sr
 834
 835 ### Helpers for codec lookup
 836
 837 def getencoder(encoding):
 838
 839     """ Lookup up the codec for the given encoding and return
 840         its encoder function.
 841
 842         Raises a LookupError in case the encoding cannot be found.
 843
 844     """
 845     return lookup(encoding).encode
 846
 847 def getdecoder(encoding):
 848
 849     """ Lookup up the codec for the given encoding and return
 850         its decoder function.
 851
 852         Raises a LookupError in case the encoding cannot be found.
 853
 854     """
 855     return lookup(encoding).decode
 856
 857 def getincrementalencoder(encoding):
 858
 859     """ Lookup up the codec for the given encoding and return
 860         its IncrementalEncoder class or factory function.
 861
 862         Raises a LookupError in case the encoding cannot be found
 863         or the codecs doesn't provide an incremental encoder.
 864
 865     """
 866     encoder = lookup(encoding).incrementalencoder
 867     if encoder is None:
 868         raise LookupError(encoding)
 869     return encoder
 870
 871 def getincrementaldecoder(encoding):
 872
 873     """ Lookup up the codec for the given encoding and return
 874         its IncrementalDecoder class or factory function.
 875
 876         Raises a LookupError in case the encoding cannot be found
 877         or the codecs doesn't provide an incremental decoder.
 878
 879     """
 880     decoder = lookup(encoding).incrementaldecoder
 881     if decoder is None:
 882         raise LookupError(encoding)
 883     return decoder
 884
 885 def getreader(encoding):
 886
 887     """ Lookup up the codec for the given encoding and return
 888         its StreamReader class or factory function.
 889
 890         Raises a LookupError in case the encoding cannot be found.
 891
 892     """
 893     return lookup(encoding).streamreader
 894
 895 def getwriter(encoding):
 896
 897     """ Lookup up the codec for the given encoding and return
 898         its StreamWriter class or factory function.
 899
 900         Raises a LookupError in case the encoding cannot be found.
 901
 902     """
 903     return lookup(encoding).streamwriter
 904
 905 def iterencode(iterator, encoding, errors='strict', **kwargs):
 906     """
 907     Encoding iterator.
 908
 909     Encodes the input strings from the iterator using a IncrementalEncoder.
 910
 911     errors and kwargs are passed through to the IncrementalEncoder
 912     constructor.
 913     """
 914     encoder = getincrementalencoder(encoding)(errors, **kwargs)
 915     for input in iterator:
 916         output = encoder.encode(input)
 917         if output:
 918             yield output
 919     output = encoder.encode("", True)
 920     if output:
 921         yield output
 922
 923 def iterdecode(iterator, encoding, errors='strict', **kwargs):
 924     """
 925     Decoding iterator.
 926
 927     Decodes the input strings from the iterator using a IncrementalDecoder.
 928
 929     errors and kwargs are passed through to the IncrementalDecoder
 930     constructor.
 931     """
 932     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
 933     for input in iterator:
 934         output = decoder.decode(input)
 935         if output:
 936             yield output
 937     output = decoder.decode("", True)
 938     if output:
 939         yield output
 940
 941 ### Helpers for charmap-based codecs
 942
 943 def make_identity_dict(rng):
 944
 945     """ make_identity_dict(rng) -> dict
 946
 947         Return a dictionary where elements of the rng sequence are
 948         mapped to themselves.
 949
 950     """
 951     res = {}
 952     for i in rng:
 953         res[i]=i
 954     return res
 955
 956 def make_encoding_map(decoding_map):
 957
 958     """ Creates an encoding map from a decoding map.
 959
 960         If a target mapping in the decoding map occurs multiple
 961         times, then that target is mapped to None (undefined mapping),
 962         causing an exception when encountered by the charmap codec
 963         during translation.
 964
 965         One example where this happens is cp875.py which decodes
 966         multiple character to \u001a.
 967
 968     """
 969     m = {}
 970     for k,v in decoding_map.items():
 971         if not v in m:
 972             m[v] = k
 973         else:
 974             m[v] = None
 975     return m
 976
 977 ### error handlers
 978
 979 try:
 980     strict_errors = lookup_error("strict")
 981     ignore_errors = lookup_error("ignore")
 982     replace_errors = lookup_error("replace")
 983     xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
 984     backslashreplace_errors = lookup_error("backslashreplace")
 985 except LookupError:
 986     # In --disable-unicode builds, these error handler are missing
 987     strict_errors = None
 988     ignore_errors = None
 989     replace_errors = None
 990     xmlcharrefreplace_errors = None
 991     backslashreplace_errors = None
 992
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is guarded by a flag that is always false,
# so it is never executed when this module is loaded; it is only here to
# be seen by tools that look for import statements.
# NOTE(review): presumably a variable is tested instead of a literal 0 so
# that the guarded import is not optimized out of the module -- confirm
# before simplifying.
_false = 0
if _false:
    import encodings
 998
 999 ### Tests
1000
if __name__ == '__main__':

    # Wrap stdout: the program writes Latin-1, the stream receives UTF-8
    sys.stdout = EncodedFile(sys.stdout, data_encoding='latin-1',
                             file_encoding='utf-8')

    # Wrap stdin: the stream delivers Latin-1, the program reads UTF-8
    sys.stdin = EncodedFile(sys.stdin, data_encoding='utf-8',
                            file_encoding='latin-1')