Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import __builtin__, sys
  11
  12 ### Registry and builtin stateless codec functions
  13
# The registry functions and the builtin stateless codec functions live in
# the _codecs extension module; every public name it defines is pulled into
# this namespace.
try:
    from _codecs import *
except ImportError, why:
    # Report a missing/broken _codecs module as a SystemError that carries
    # the text of the original import failure.
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
  26
  27 ### Constants
  28
  29 #
  30 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
  31 # and its possible byte string values
  32 # for UTF8/UTF16/UTF32 output and little/big endian machines
  33 #
  34
  35 # UTF-8
  36 BOM_UTF8 = '\xef\xbb\xbf'
  37
  38 # UTF-16, little endian
  39 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
  40
  41 # UTF-16, big endian
  42 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
  43
  44 # UTF-32, little endian
  45 BOM_UTF32_LE = '\xff\xfe\x00\x00'
  46
  47 # UTF-32, big endian
  48 BOM_UTF32_BE = '\x00\x00\xfe\xff'
  49
  50 if sys.byteorder == 'little':
  51
  52     # UTF-16, native endianness
  53     BOM = BOM_UTF16 = BOM_UTF16_LE
  54
  55     # UTF-32, native endianness
  56     BOM_UTF32 = BOM_UTF32_LE
  57
  58 else:
  59
  60     # UTF-16, native endianness
  61     BOM = BOM_UTF16 = BOM_UTF16_BE
  62
  63     # UTF-32, native endianness
  64     BOM_UTF32 = BOM_UTF32_BE
  65
  66 # Old broken names (don't use in new code)
  67 BOM32_LE = BOM_UTF16_LE
  68 BOM32_BE = BOM_UTF16_BE
  69 BOM64_LE = BOM_UTF32_LE
  70 BOM64_BE = BOM_UTF32_BE
  71
  72
  73 ### Codec base classes (defining the API)
  74
  75 class CodecInfo(tuple):
  76
  77     def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
  78         incrementalencoder=None, incrementaldecoder=None, name=None):
  79         self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
  80         self.name = name
  81         self.encode = encode
  82         self.decode = decode
  83         self.incrementalencoder = incrementalencoder
  84         self.incrementaldecoder = incrementaldecoder
  85         self.streamwriter = streamwriter
  86         self.streamreader = streamreader
  87         return self
  88
  89     def __repr__(self):
  90         return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
  91
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

        Both methods raise NotImplementedError here; subclasses have to
        provide the actual implementations.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
 155
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder remembers
    the state of the encoding process between calls to encode().

    This base class only stores the configuration; encode() must be
    provided by a subclass.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        # starts out empty; this base class never touches it again
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        final defaults to False. This base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """
 183
 184 class BufferedIncrementalEncoder(IncrementalEncoder):
 185     """
 186     This subclass of IncrementalEncoder can be used as the baseclass for an
 187     incremental encoder if the encoder must keep some of the output in a
 188     buffer between calls to encode().
 189     """
 190     def __init__(self, errors='strict'):
 191         IncrementalEncoder.__init__(self, errors)
 192         self.buffer = "" # unencoded input that is kept between calls to encode()
 193
 194     def _buffer_encode(self, input, errors, final):
 195         # Overwrite this method in subclasses: It must encode input
 196         # and return an (output, length consumed) tuple
 197         raise NotImplementedError
 198
 199     def encode(self, input, final=False):
 200         # encode input (taking the buffer into account)
 201         data = self.buffer + input
 202         (result, consumed) = self._buffer_encode(data, self.errors, final)
 203         # keep unencoded input until the next call
 204         self.buffer = data[consumed:]
 205         return result
 206
 207     def reset(self):
 208         IncrementalEncoder.reset(self)
 209         self.buffer = ""
 210
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().

    This base class only stores the configuration; decode() must be
    provided by a subclass.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.

        final defaults to False. This base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.

        The base implementation does nothing.
        """
 237
 238 class BufferedIncrementalDecoder(IncrementalDecoder):
 239     """
 240     This subclass of IncrementalDecoder can be used as the baseclass for an
 241     incremental decoder if the decoder must be able to handle incomplete byte
 242     sequences.
 243     """
 244     def __init__(self, errors='strict'):
 245         IncrementalDecoder.__init__(self, errors)
 246         self.buffer = "" # undecoded input that is kept between calls to decode()
 247
 248     def _buffer_decode(self, input, errors, final):
 249         # Overwrite this method in subclasses: It must decode input
 250         # and return an (output, length consumed) tuple
 251         raise NotImplementedError
 252
 253     def decode(self, input, final=False):
 254         # decode input (taking the buffer into account)
 255         data = self.buffer + input
 256         (result, consumed) = self._buffer_decode(data, self.errors, final)
 257         # keep undecoded input until the next call
 258         self.buffer = data[consumed:]
 259         return result
 260
 261     def reset(self):
 262         IncrementalDecoder.reset(self)
 263         self.buffer = ""
 264
 265 #
 266 # The StreamWriter and StreamReader class provide generic working
 267 # interfaces which can be used to implement new encoding submodules
 268 # very easily. See encodings/utf_8.py for an example on how this is
 269 # done.
 270 #
 271
 272 class StreamWriter(Codec):
 273
 274     def __init__(self, stream, errors='strict'):
 275
 276         """ Creates a StreamWriter instance.
 277
 278             stream must be a file-like object open for writing
 279             (binary) data.
 280
 281             The StreamWriter may use different error handling
 282             schemes by providing the errors keyword argument. These
 283             parameters are predefined:
 284
 285              'strict' - raise a ValueError (or a subclass)
 286              'ignore' - ignore the character and continue with the next
 287              'replace'- replace with a suitable replacement character
 288              'xmlcharrefreplace' - Replace with the appropriate XML
 289                                    character reference.
 290              'backslashreplace'  - Replace with backslashed escape
 291                                    sequences (only for encoding).
 292
 293             The set of allowed parameter values can be extended via
 294             register_error.
 295         """
 296         self.stream = stream
 297         self.errors = errors
 298
 299     def write(self, object):
 300
 301         """ Writes the object's contents encoded to self.stream.
 302         """
 303         data, consumed = self.encode(object, self.errors)
 304         self.stream.write(data)
 305
 306     def writelines(self, list):
 307
 308         """ Writes the concatenated list of strings to the stream
 309             using .write().
 310         """
 311         self.write(''.join(list))
 312
 313     def reset(self):
 314
 315         """ Flushes and resets the codec buffers used for keeping state.
 316
 317             Calling this method should ensure that the data on the
 318             output is put into a clean state, that allows appending
 319             of new fresh data without having to rescan the whole
 320             stream to recover state.
 321
 322         """
 323         pass
 324
 325     def __getattr__(self, name,
 326                     getattr=getattr):
 327
 328         """ Inherit all other methods from the underlying stream.
 329         """
 330         return getattr(self.stream, name)
 331
 332     def __enter__(self):
 333         return self
 334
 335     def __exit__(self, type, value, tb):
 336         self.stream.close()
 337
 338 ###
 339
 340 class StreamReader(Codec):
 341
 342     def __init__(self, stream, errors='strict'):
 343
 344         """ Creates a StreamReader instance.
 345
 346             stream must be a file-like object open for reading
 347             (binary) data.
 348
 349             The StreamReader may use different error handling
 350             schemes by providing the errors keyword argument. These
 351             parameters are predefined:
 352
 353              'strict' - raise a ValueError (or a subclass)
 354              'ignore' - ignore the character and continue with the next
 355              'replace'- replace with a suitable replacement character;
 356
 357             The set of allowed parameter values can be extended via
 358             register_error.
 359         """
 360         self.stream = stream
 361         self.errors = errors
 362         self.bytebuffer = ""
 363         # For str->str decoding this will stay a str
 364         # For str->unicode decoding the first read will promote it to unicode
 365         self.charbuffer = ""
 366         self.linebuffer = None
 367
 368     def decode(self, input, errors='strict'):
 369         raise NotImplementedError
 370
 371     def read(self, size=-1, chars=-1, firstline=False):
 372
 373         """ Decodes data from the stream self.stream and returns the
 374             resulting object.
 375
 376             chars indicates the number of characters to read from the
 377             stream. read() will never return more than chars
 378             characters, but it might return less, if there are not enough
 379             characters available.
 380
 381             size indicates the approximate maximum number of bytes to
 382             read from the stream for decoding purposes. The decoder
 383             can modify this setting as appropriate. The default value
 384             -1 indicates to read and decode as much as possible.  size
 385             is intended to prevent having to decode huge files in one
 386             step.
 387
 388             If firstline is true, and a UnicodeDecodeError happens
 389             after the first line terminator in the input only the first line
 390             will be returned, the rest of the input will be kept until the
 391             next call to read().
 392
 393             The method should use a greedy read strategy meaning that
 394             it should read as much data as is allowed within the
 395             definition of the encoding and the given size, e.g.  if
 396             optional encoding endings or state markers are available
 397             on the stream, these should be read too.
 398         """
 399         # If we have lines cached, first merge them back into characters
 400         if self.linebuffer:
 401             self.charbuffer = "".join(self.linebuffer)
 402             self.linebuffer = None
 403
 404         # read until we get the required number of characters (if available)
 405         while True:
 406             # can the request can be satisfied from the character buffer?
 407             if chars < 0:
 408                 if size < 0:
 409                     if self.charbuffer:
 410                         break
 411                 elif len(self.charbuffer) >= size:
 412                     break
 413             else:
 414                 if len(self.charbuffer) >= chars:
 415                     break
 416             # we need more data
 417             if size < 0:
 418                 newdata = self.stream.read()
 419             else:
 420                 newdata = self.stream.read(size)
 421             # decode bytes (those remaining from the last call included)
 422             data = self.bytebuffer + newdata
 423             try:
 424                 newchars, decodedbytes = self.decode(data, self.errors)
 425             except UnicodeDecodeError, exc:
 426                 if firstline:
 427                     newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
 428                     lines = newchars.splitlines(True)
 429                     if len(lines)<=1:
 430                         raise
 431                 else:
 432                     raise
 433             # keep undecoded bytes until the next call
 434             self.bytebuffer = data[decodedbytes:]
 435             # put new characters in the character buffer
 436             self.charbuffer += newchars
 437             # there was no data available
 438             if not newdata:
 439                 break
 440         if chars < 0:
 441             # Return everything we've got
 442             result = self.charbuffer
 443             self.charbuffer = ""
 444         else:
 445             # Return the first chars characters
 446             result = self.charbuffer[:chars]
 447             self.charbuffer = self.charbuffer[chars:]
 448         return result
 449
 450     def readline(self, size=None, keepends=True):
 451
 452         """ Read one line from the input stream and return the
 453             decoded data.
 454
 455             size, if given, is passed as size argument to the
 456             read() method.
 457
 458         """
 459         # If we have lines cached from an earlier read, return
 460         # them unconditionally
 461         if self.linebuffer:
 462             line = self.linebuffer[0]
 463             del self.linebuffer[0]
 464             if len(self.linebuffer) == 1:
 465                 # revert to charbuffer mode; we might need more data
 466                 # next time
 467                 self.charbuffer = self.linebuffer[0]
 468                 self.linebuffer = None
 469             if not keepends:
 470                 line = line.splitlines(False)[0]
 471             return line
 472
 473         readsize = size or 72
 474         line = ""
 475         # If size is given, we call read() only once
 476         while True:
 477             data = self.read(readsize, firstline=True)
 478             if data:
 479                 # If we're at a "\r" read one extra character (which might
 480                 # be a "\n") to get a proper line ending. If the stream is
 481                 # temporarily exhausted we return the wrong line ending.
 482                 if data.endswith("\r"):
 483                     data += self.read(size=1, chars=1)
 484
 485             line += data
 486             lines = line.splitlines(True)
 487             if lines:
 488                 if len(lines) > 1:
 489                     # More than one line result; the first line is a full line
 490                     # to return
 491                     line = lines[0]
 492                     del lines[0]
 493                     if len(lines) > 1:
 494                         # cache the remaining lines
 495                         lines[-1] += self.charbuffer
 496                         self.linebuffer = lines
 497                         self.charbuffer = None
 498                     else:
 499                         # only one remaining line, put it back into charbuffer
 500                         self.charbuffer = lines[0] + self.charbuffer
 501                     if not keepends:
 502                         line = line.splitlines(False)[0]
 503                     break
 504                 line0withend = lines[0]
 505                 line0withoutend = lines[0].splitlines(False)[0]
 506                 if line0withend != line0withoutend: # We really have a line end
 507                     # Put the rest back together and keep it until the next call
 508                     self.charbuffer = "".join(lines[1:]) + self.charbuffer
 509                     if keepends:
 510                         line = line0withend
 511                     else:
 512                         line = line0withoutend
 513                     break
 514             # we didn't get anything or this was our only try
 515             if not data or size is not None:
 516                 if line and not keepends:
 517                     line = line.splitlines(False)[0]
 518                 break
 519             if readsize<8000:
 520                 readsize *= 2
 521         return line
 522
 523     def readlines(self, sizehint=None, keepends=True):
 524
 525         """ Read all lines available on the input stream
 526             and return them as list of lines.
 527
 528             Line breaks are implemented using the codec's decoder
 529             method and are included in the list entries.
 530
 531             sizehint, if given, is ignored since there is no efficient
 532             way to finding the true end-of-line.
 533
 534         """
 535         data = self.read()
 536         return data.splitlines(keepends)
 537
 538     def reset(self):
 539
 540         """ Resets the codec buffers used for keeping state.
 541
 542             Note that no stream repositioning should take place.
 543             This method is primarily intended to be able to recover
 544             from decoding errors.
 545
 546         """
 547         self.bytebuffer = ""
 548         self.charbuffer = u""
 549         self.linebuffer = None
 550
 551     def seek(self, offset, whence=0):
 552         """ Set the input stream's current position.
 553
 554             Resets the codec buffers used for keeping state.
 555         """
 556         self.reset()
 557         self.stream.seek(offset, whence)
 558
 559     def next(self):
 560
 561         """ Return the next decoded line from the input stream."""
 562         line = self.readline()
 563         if line:
 564             return line
 565         raise StopIteration
 566
 567     def __iter__(self):
 568         return self
 569
 570     def __getattr__(self, name,
 571                     getattr=getattr):
 572
 573         """ Inherit all other methods from the underlying stream.
 574         """
 575         return getattr(self.stream, name)
 576
 577     def __enter__(self):
 578         return self
 579
 580     def __exit__(self, type, value, tb):
 581         self.stream.close()
 582
 583 ###
 584
 585 class StreamReaderWriter:
 586
 587     """ StreamReaderWriter instances allow wrapping streams which
 588         work in both read and write modes.
 589
 590         The design is such that one can use the factory functions
 591         returned by the codec.lookup() function to construct the
 592         instance.
 593
 594     """
 595     # Optional attributes set by the file wrappers below
 596     encoding = 'unknown'
 597
 598     def __init__(self, stream, Reader, Writer, errors='strict'):
 599
 600         """ Creates a StreamReaderWriter instance.
 601
 602             stream must be a Stream-like object.
 603
 604             Reader, Writer must be factory functions or classes
 605             providing the StreamReader, StreamWriter interface resp.
 606
 607             Error handling is done in the same way as defined for the
 608             StreamWriter/Readers.
 609
 610         """
 611         self.stream = stream
 612         self.reader = Reader(stream, errors)
 613         self.writer = Writer(stream, errors)
 614         self.errors = errors
 615
 616     def read(self, size=-1):
 617
 618         return self.reader.read(size)
 619
 620     def readline(self, size=None):
 621
 622         return self.reader.readline(size)
 623
 624     def readlines(self, sizehint=None):
 625
 626         return self.reader.readlines(sizehint)
 627
 628     def next(self):
 629
 630         """ Return the next decoded line from the input stream."""
 631         return self.reader.next()
 632
 633     def __iter__(self):
 634         return self
 635
 636     def write(self, data):
 637
 638         return self.writer.write(data)
 639
 640     def writelines(self, list):
 641
 642         return self.writer.writelines(list)
 643
 644     def reset(self):
 645
 646         self.reader.reset()
 647         self.writer.reset()
 648
 649     def __getattr__(self, name,
 650                     getattr=getattr):
 651
 652         """ Inherit all other methods from the underlying stream.
 653         """
 654         return getattr(self.stream, name)
 655
 656     # these are needed to make "with codecs.open(...)" work properly
 657
 658     def __enter__(self):
 659         return self
 660
 661     def __exit__(self, type, value, tb):
 662         self.stream.close()
 663
 664 ###
 665
 666 class StreamRecoder:
 667
 668     """ StreamRecoder instances provide a frontend - backend
 669         view of encoding data.
 670
 671         They use the complete set of APIs returned by the
 672         codecs.lookup() function to implement their task.
 673
 674         Data written to the stream is first decoded into an
 675         intermediate format (which is dependent on the given codec
 676         combination) and then written to the stream using an instance
 677         of the provided Writer class.
 678
 679         In the other direction, data is read from the stream using a
 680         Reader instance and then return encoded data to the caller.
 681
 682     """
 683     # Optional attributes set by the file wrappers below
 684     data_encoding = 'unknown'
 685     file_encoding = 'unknown'
 686
 687     def __init__(self, stream, encode, decode, Reader, Writer,
 688                  errors='strict'):
 689
 690         """ Creates a StreamRecoder instance which implements a two-way
 691             conversion: encode and decode work on the frontend (the
 692             input to .read() and output of .write()) while
 693             Reader and Writer work on the backend (reading and
 694             writing to the stream).
 695
 696             You can use these objects to do transparent direct
 697             recodings from e.g. latin-1 to utf-8 and back.
 698
 699             stream must be a file-like object.
 700
 701             encode, decode must adhere to the Codec interface, Reader,
 702             Writer must be factory functions or classes providing the
 703             StreamReader, StreamWriter interface resp.
 704
 705             encode and decode are needed for the frontend translation,
 706             Reader and Writer for the backend translation. Unicode is
 707             used as intermediate encoding.
 708
 709             Error handling is done in the same way as defined for the
 710             StreamWriter/Readers.
 711
 712         """
 713         self.stream = stream
 714         self.encode = encode
 715         self.decode = decode
 716         self.reader = Reader(stream, errors)
 717         self.writer = Writer(stream, errors)
 718         self.errors = errors
 719
 720     def read(self, size=-1):
 721
 722         data = self.reader.read(size)
 723         data, bytesencoded = self.encode(data, self.errors)
 724         return data
 725
 726     def readline(self, size=None):
 727
 728         if size is None:
 729             data = self.reader.readline()
 730         else:
 731             data = self.reader.readline(size)
 732         data, bytesencoded = self.encode(data, self.errors)
 733         return data
 734
 735     def readlines(self, sizehint=None):
 736
 737         data = self.reader.read()
 738         data, bytesencoded = self.encode(data, self.errors)
 739         return data.splitlines(1)
 740
 741     def next(self):
 742
 743         """ Return the next decoded line from the input stream."""
 744         data = self.reader.next()
 745         data, bytesencoded = self.encode(data, self.errors)
 746         return data
 747
 748     def __iter__(self):
 749         return self
 750
 751     def write(self, data):
 752
 753         data, bytesdecoded = self.decode(data, self.errors)
 754         return self.writer.write(data)
 755
 756     def writelines(self, list):
 757
 758         data = ''.join(list)
 759         data, bytesdecoded = self.decode(data, self.errors)
 760         return self.writer.write(data)
 761
 762     def reset(self):
 763
 764         self.reader.reset()
 765         self.writer.reset()
 766
 767     def __getattr__(self, name,
 768                     getattr=getattr):
 769
 770         """ Inherit all other methods from the underlying stream.
 771         """
 772         return getattr(self.stream, name)
 773
 774     def __enter__(self):
 775         return self
 776
 777     def __exit__(self, type, value, tb):
 778         self.stream.close()
 779
 780 ### Shortcuts
 781
 782 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 783
 784     """ Open an encoded file using the given mode and return
 785         a wrapped version providing transparent encoding/decoding.
 786
 787         Note: The wrapped version will only accept the object format
 788         defined by the codecs, i.e. Unicode objects for most builtin
 789         codecs. Output is also codec dependent and will usually be
 790         Unicode as well.
 791
 792         Files are always opened in binary mode, even if no binary mode
 793         was specified. This is done to avoid data loss due to encodings
 794         using 8-bit values. The default file mode is 'rb' meaning to
 795         open the file in binary read mode.
 796
 797         encoding specifies the encoding which is to be used for the
 798         file.
 799
 800         errors may be given to define the error handling. It defaults
 801         to 'strict' which causes ValueErrors to be raised in case an
 802         encoding error occurs.
 803
 804         buffering has the same meaning as for the builtin open() API.
 805         It defaults to line buffered.
 806
 807         The returned wrapped file object provides an extra attribute
 808         .encoding which allows querying the used encoding. This
 809         attribute is only available if an encoding was specified as
 810         parameter.
 811
 812     """
 813     if encoding is not None and \
 814        'b' not in mode:
 815         # Force opening of the file in binary mode
 816         mode = mode + 'b'
 817     file = __builtin__.open(filename, mode, buffering)
 818     if encoding is None:
 819         return file
 820     info = lookup(encoding)
 821     srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
 822     # Add attributes to simplify introspection
 823     srw.encoding = encoding
 824     return srw
 825
 826 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 827
 828     """ Return a wrapped version of file which provides transparent
 829         encoding translation.
 830
 831         Strings written to the wrapped file are interpreted according
 832         to the given data_encoding and then written to the original
 833         file as string using file_encoding. The intermediate encoding
 834         will usually be Unicode but depends on the specified codecs.
 835
 836         Strings are read from the file using file_encoding and then
 837         passed back to the caller as string using data_encoding.
 838
 839         If file_encoding is not given, it defaults to data_encoding.
 840
 841         errors may be given to define the error handling. It defaults
 842         to 'strict' which causes ValueErrors to be raised in case an
 843         encoding error occurs.
 844
 845         The returned wrapped file object provides two extra attributes
 846         .data_encoding and .file_encoding which reflect the given
 847         parameters of the same name. The attributes can be used for
 848         introspection by Python programs.
 849
 850     """
 851     if file_encoding is None:
 852         file_encoding = data_encoding
 853     data_info = lookup(data_encoding)
 854     file_info = lookup(file_encoding)
 855     sr = StreamRecoder(file, data_info.encode, data_info.decode,
 856                        file_info.streamreader, file_info.streamwriter, errors)
 857     # Add attributes to simplify introspection
 858     sr.data_encoding = data_encoding
 859     sr.file_encoding = file_encoding
 860     return sr
 861
 862 ### Helpers for codec lookup
 863
 864 def getencoder(encoding):
 865
 866     """ Lookup up the codec for the given encoding and return
 867         its encoder function.
 868
 869         Raises a LookupError in case the encoding cannot be found.
 870
 871     """
 872     return lookup(encoding).encode
 873
 874 def getdecoder(encoding):
 875
 876     """ Lookup up the codec for the given encoding and return
 877         its decoder function.
 878
 879         Raises a LookupError in case the encoding cannot be found.
 880
 881     """
 882     return lookup(encoding).decode
 883
 884 def getincrementalencoder(encoding):
 885
 886     """ Lookup up the codec for the given encoding and return
 887         its IncrementalEncoder class or factory function.
 888
 889         Raises a LookupError in case the encoding cannot be found
 890         or the codecs doesn't provide an incremental encoder.
 891
 892     """
 893     encoder = lookup(encoding).incrementalencoder
 894     if encoder is None:
 895         raise LookupError(encoding)
 896     return encoder
 897
 898 def getincrementaldecoder(encoding):
 899
 900     """ Lookup up the codec for the given encoding and return
 901         its IncrementalDecoder class or factory function.
 902
 903         Raises a LookupError in case the encoding cannot be found
 904         or the codecs doesn't provide an incremental decoder.
 905
 906     """
 907     decoder = lookup(encoding).incrementaldecoder
 908     if decoder is None:
 909         raise LookupError(encoding)
 910     return decoder
 911
 912 def getreader(encoding):
 913
 914     """ Lookup up the codec for the given encoding and return
 915         its StreamReader class or factory function.
 916
 917         Raises a LookupError in case the encoding cannot be found.
 918
 919     """
 920     return lookup(encoding).streamreader
 921
 922 def getwriter(encoding):
 923
 924     """ Lookup up the codec for the given encoding and return
 925         its StreamWriter class or factory function.
 926
 927         Raises a LookupError in case the encoding cannot be found.
 928
 929     """
 930     return lookup(encoding).streamwriter
 931
 932 def iterencode(iterator, encoding, errors='strict', **kwargs):
 933     """
 934     Encoding iterator.
 935
 936     Encodes the input strings from the iterator using a IncrementalEncoder.
 937
 938     errors and kwargs are passed through to the IncrementalEncoder
 939     constructor.
 940     """
 941     encoder = getincrementalencoder(encoding)(errors, **kwargs)
 942     for input in iterator:
 943         output = encoder.encode(input)
 944         if output:
 945             yield output
 946     output = encoder.encode("", True)
 947     if output:
 948         yield output
 949
 950 def iterdecode(iterator, encoding, errors='strict', **kwargs):
 951     """
 952     Decoding iterator.
 953
 954     Decodes the input strings from the iterator using a IncrementalDecoder.
 955
 956     errors and kwargs are passed through to the IncrementalDecoder
 957     constructor.
 958     """
 959     decoder = getincrementaldecoder(encoding)(errors, **kwargs)
 960     for input in iterator:
 961         output = decoder.decode(input)
 962         if output:
 963             yield output
 964     output = decoder.decode("", True)
 965     if output:
 966         yield output
 967
 968 ### Helpers for charmap-based codecs
 969
 970 def make_identity_dict(rng):
 971
 972     """ make_identity_dict(rng) -> dict
 973
 974         Return a dictionary where elements of the rng sequence are
 975         mapped to themselves.
 976
 977     """
 978     res = {}
 979     for i in rng:
 980         res[i]=i
 981     return res
 982
 983 def make_encoding_map(decoding_map):
 984
 985     """ Creates an encoding map from a decoding map.
 986
 987         If a target mapping in the decoding map occurs multiple
 988         times, then that target is mapped to None (undefined mapping),
 989         causing an exception when encountered by the charmap codec
 990         during translation.
 991
 992         One example where this happens is cp875.py which decodes
 993         multiple character to \u001a.
 994
 995     """
 996     m = {}
 997     for k,v in decoding_map.items():
 998         if not v in m:
 999             m[v] = k
1000         else:
1001             m[v] = None
1002     return m
1003
1004 ### error handlers
1005
try:
    # Bind the standard error handling callbacks to module-level names.
    # The lookups run one after another during unpacking, so a missing
    # handler raises LookupError inside this try block.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1019
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# The import below is never executed: _false is 0, so the guard is
# always false at runtime.
# NOTE(review): presumably a module-level name is tested instead of a
# literal 0 so that the import statement is kept in the compiled module
# for modulefinder to see -- confirm before simplifying.
_false = 0
if _false:
    import encodings
1025
1026 ### Tests
1027
if __name__ == '__main__':

    # Wrap stdout: Latin-1 strings written by the program are sent to
    # the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Wrap stdin: Latin-1 input from the real stdin is handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')