From 76be9443c22bc1291ea01585c128e1951620f030 Mon Sep 17 00:00:00 2001
From: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>
Date: Thu, 20 Oct 2011 23:04:46 +0000
Subject: [PATCH] Work around encoding problems in Py3k. Fixes [ 3395948 ]

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7196 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
---
 HISTORY.txt                 |  7 ++--
 docutils/error_reporting.py | 13 +++++---
 docutils/frontend.py        |  2 +-
 docutils/io.py              | 78 ++++++++++++++++++++++++++++++++-------------
 4 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/HISTORY.txt b/HISTORY.txt
index 97a691d31..0c2432d12 100644
--- a/HISTORY.txt
+++ b/HISTORY.txt
@@ -22,11 +22,14 @@ Changes Since 0.8.1
   - reStructuredText "code" role and directive with syntax highlighting
     by Pygments_.
   - "code" option of the "include" directive.
-
-  - Fix parse_option_marker for options arguments containing ``=``.
+  - Fix parse_option_marker for option arguments containing ``=``.
 
 .. _Pygments: http://pygments.org/
 
+* docutils/io.py
+
+  - Fix [ 3395948 ] (Work around encoding problems in Py3k).
+
 * docutils/writers/latex2e/__init__.py
 
   - Support the `abbreviation` and `acronym` standard roles.
diff --git a/docutils/error_reporting.py b/docutils/error_reporting.py
index 85fa8cb1b..19ab24b74 100644
--- a/docutils/error_reporting.py
+++ b/docutils/error_reporting.py
@@ -4,12 +4,12 @@
 # :Id: $Id$
 # :Copyright: © 2011 Günter Milde.
 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
-# 
+#
 #    Copying and distribution of this file, with or without modification,
 #    are permitted in any medium without royalty provided the copyright
 #    notice and this notice are preserved.
 #    This file is offered as-is, without any warranty.
-# 
+#
 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
 
 """
@@ -184,13 +184,17 @@ class ErrorOutput(object):
         except UnicodeEncodeError:
             self.stream.write(data.encode(self.encoding, self.encoding_errors))
         except TypeError: # in Python 3, stderr expects unicode
-            self.stream.write(unicode(data, self.encoding, self.decoding_errors))
+            if self.stream in (sys.stderr, sys.stdout):
+                self.stream.buffer.write(data) # write bytes to raw stream
+            else:
+                self.stream.write(unicode(data, self.encoding,
+                                          self.decoding_errors))
 
     def close(self):
         """
         Close the error-output stream.
 
-        Ignored if the stream is` sys.stderr` or `sys.stdout` or has no 
+        Ignored if the stream is `sys.stderr` or `sys.stdout` or has no
         close() method.
         """
         if self.stream in (sys.stdout, sys.stderr):
@@ -199,4 +203,3 @@ class ErrorOutput(object):
             self.stream.close()
         except AttributeError:
             pass
-
diff --git a/docutils/frontend.py b/docutils/frontend.py
index b4cde7514..b3c62f0ee 100644
--- a/docutils/frontend.py
+++ b/docutils/frontend.py
@@ -38,7 +38,7 @@ from optparse import SUPPRESS_HELP
 import docutils
 import docutils.utils
 import docutils.nodes
-from docutils.error_reporting import locale_encoding, ErrorOutput
+from docutils.error_reporting import locale_encoding, ErrorOutput, ErrorString
 
 
 def store_multiple(option, opt, value, parser, *args, **kwargs):
diff --git a/docutils/io.py b/docutils/io.py
index 4330b2eed..40630af55 100644
--- a/docutils/io.py
+++ b/docutils/io.py
@@ -10,6 +10,7 @@ will exist for a variety of input/output mechanisms.
 __docformat__ = 'reStructuredText'
 
 import sys
+import os
 import re
 import codecs
 from docutils import TransformSpec
@@ -84,10 +85,9 @@ class Input(TransformSpec):
                 # Apply heuristics only if no encoding is explicitly given and
                 # no BOM found.  Start with UTF-8, because that only matches
                 # data that *IS* UTF-8:
-                encodings = [enc for enc in ('utf-8',
-                                             locale_encoding, # can be None
-                                             'latin-1') # fallback encoding
-                             if enc]
+                encodings = ['utf-8', 'latin-1']
+                if locale_encoding:
+                    encodings.insert(1, locale_encoding)
         for enc in encodings:
             try:
                 decoded = unicode(data, enc, self.error_handler)
@@ -105,7 +105,7 @@ class Input(TransformSpec):
     coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
     """Encoding declaration pattern."""
 
-    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig'
+    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
                         (codecs.BOM_UTF16_BE, 'utf-16-be'),
                         (codecs.BOM_UTF16_LE, 'utf-16-le'),)
     """Sequence of (start_bytes, encoding) tuples for encoding detection.
@@ -224,6 +224,15 @@ class FileInput(Input):
                     sys.exit(1)
             else:
                 self.source = sys.stdin
+        elif (sys.version_info >= (3,0) and
+              self.encoding and hasattr(self.source, 'encoding') and
+              self.encoding != self.source.encoding and
+              codecs.lookup(self.encoding) !=
+              codecs.lookup(self.source.encoding)):
+            # TODO: re-open, warn or raise error?
+            raise UnicodeError('Encoding clash: encoding given is "%s" '
+                               'but source is opened with encoding "%s".' %
+                               (self.encoding, self.source.encoding))
         if not source_path:
             try:
                 self.source_path = self.source.name
@@ -234,8 +243,25 @@ class FileInput(Input):
         """
         Read and decode a single file and return the data (Unicode string).
         """
-        try:
-            data = self.source.read()
+        try: # In Python < 2.5, try...except has to be nested in try...finally.
+            try:
+                if self.source is sys.stdin and sys.version_info >= (3,0):
+                    # read as binary data to circumvent auto-decoding
+                    data = self.source.buffer.read()
+                    # normalize newlines
+                    data = b('\n').join(data.splitlines()) + b('\n')
+                else:
+                    data = self.source.read()
+            except (UnicodeError, LookupError), err: # (in Py3k read() decodes)
+                if not self.encoding and self.source_path:
+                    # re-read in binary mode and decode with heuristics
+                    b_source = open(self.source_path, 'rb')
+                    data = b_source.read()
+                    b_source.close()
+                    # normalize newlines
+                    data = b('\n').join(data.splitlines()) + b('\n')
+                else:
+                    raise
         finally:
             if self.autoclose:
                 self.close()
@@ -245,12 +271,7 @@ class FileInput(Input):
         """
         Return lines of a single file as list of Unicode strings.
         """
-        try:
-            lines = self.source.readlines()
-        finally:
-            if self.autoclose:
-                self.close()
-        return [self.decode(line) for line in lines]
+        return self.read().splitlines(True)
 
     def close(self):
         if self.source is not sys.stdin:
@@ -302,7 +323,6 @@ class FileOutput(Output):
                       'errors': self.error_handler}
         else:
             kwargs = {}
-
         try:
             self.destination = open(self.destination_path, 'w', **kwargs)
         except IOError, error:
@@ -317,20 +337,34 @@ class FileOutput(Output):
     def write(self, data):
         """Encode `data`, write it to a single file, and return it.
 
-        In Python 3, a (unicode) string is returned.
+        In Python 3, `data` is returned unchanged.
         """
-        if sys.version_info >= (3,0):
-            output = data # in py3k, write expects a (Unicode) string
-        else:
-            output = self.encode(data)
+        if sys.version_info < (3,0):
+            data = self.encode(data)
         if not self.opened:
             self.open()
-        try:
-            self.destination.write(output)
+        try: # In Python < 2.5, try...except has to be nested in try...finally.
+            try:
+                if (sys.version_info >= (3,0) and self.encoding and
+                    hasattr(self.destination,'encoding') and
+                    self.encoding != self.destination.encoding and
+                    codecs.lookup(self.encoding) !=
+                    codecs.lookup(self.destination.encoding)):
+                    # encode the data ourselves, write bytes
+                    bdata = self.encode(data)
+                    if os.linesep != '\n':
+                        bdata = bdata.replace('\n', os.linesep)
+                    sys.stdout.buffer.write(bdata)
+                else:
+                    self.destination.write(data)
+            except (UnicodeError, LookupError), err: # can only happen in py3k
+                raise UnicodeError(
+                    'Unable to encode output data. output-encoding is: '
+                    '%s.\n(%s)' % (self.encoding, ErrorString(err)))
         finally:
             if self.autoclose:
                 self.close()
-        return output
+        return data
 
     def close(self):
         if self.destination not in (sys.stdout, sys.stderr):
-- 
2.11.4.GIT