From 76be9443c22bc1291ea01585c128e1951620f030 Mon Sep 17 00:00:00 2001 From: milde Date: Thu, 20 Oct 2011 23:04:46 +0000 Subject: [PATCH] Work around encoding problems in Py3k. Fixes [ 3395948 ] git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk/docutils@7196 929543f6-e4f2-0310-98a6-ba3bd3dd1d04 --- HISTORY.txt | 7 ++-- docutils/error_reporting.py | 13 +++++--- docutils/frontend.py | 2 +- docutils/io.py | 78 ++++++++++++++++++++++++++++++++------------- 4 files changed, 70 insertions(+), 30 deletions(-) diff --git a/HISTORY.txt b/HISTORY.txt index 97a691d31..0c2432d12 100644 --- a/HISTORY.txt +++ b/HISTORY.txt @@ -22,11 +22,14 @@ Changes Since 0.8.1 - reStructuredText "code" role and directive with syntax highlighting by Pygments_. - "code" option of the "include" directive. - - - Fix parse_option_marker for options arguments containing ``=``. + - Fix parse_option_marker for option arguments containing ``=``. .. _Pygments: http://pygments.org/ +* docutils/io.py + + - Fix [ 3395948 ] (Work around encoding problems in Py3k). + * docutils/writers/latex2e/__init__.py - Support the `abbreviation` and `acronym` standard roles. diff --git a/docutils/error_reporting.py b/docutils/error_reporting.py index 85fa8cb1b..19ab24b74 100644 --- a/docutils/error_reporting.py +++ b/docutils/error_reporting.py @@ -4,12 +4,12 @@ # :Id: $Id$ # :Copyright: © 2011 Günter Milde. # :License: Released under the terms of the `2-Clause BSD license`_, in short: -# +# # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright # notice and this notice are preserved. # This file is offered as-is, without any warranty. -# +# # .. 
_2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause """ @@ -184,13 +184,17 @@ class ErrorOutput(object): except UnicodeEncodeError: self.stream.write(data.encode(self.encoding, self.encoding_errors)) except TypeError: # in Python 3, stderr expects unicode - self.stream.write(unicode(data, self.encoding, self.decoding_errors)) + if self.stream in (sys.stderr, sys.stdout): + self.stream.buffer.write(data) # write bytes to raw stream + else: + self.stream.write(unicode(data, self.encoding, + self.decoding_errors)) def close(self): """ Close the error-output stream. - Ignored if the stream is` sys.stderr` or `sys.stdout` or has no + Ignored if the stream is `sys.stderr` or `sys.stdout` or has no close() method. """ if self.stream in (sys.stdout, sys.stderr): @@ -199,4 +203,3 @@ class ErrorOutput(object): self.stream.close() except AttributeError: pass - diff --git a/docutils/frontend.py b/docutils/frontend.py index b4cde7514..b3c62f0ee 100644 --- a/docutils/frontend.py +++ b/docutils/frontend.py @@ -38,7 +38,7 @@ from optparse import SUPPRESS_HELP import docutils import docutils.utils import docutils.nodes -from docutils.error_reporting import locale_encoding, ErrorOutput +from docutils.error_reporting import locale_encoding, ErrorOutput, ErrorString def store_multiple(option, opt, value, parser, *args, **kwargs): diff --git a/docutils/io.py b/docutils/io.py index 4330b2eed..40630af55 100644 --- a/docutils/io.py +++ b/docutils/io.py @@ -10,6 +10,7 @@ will exist for a variety of input/output mechanisms. __docformat__ = 'reStructuredText' import sys +import os import re import codecs from docutils import TransformSpec @@ -84,10 +85,9 @@ class Input(TransformSpec): # Apply heuristics only if no encoding is explicitly given and # no BOM found. 
Start with UTF-8, because that only matches # data that *IS* UTF-8: - encodings = [enc for enc in ('utf-8', - locale_encoding, # can be None - 'latin-1') # fallback encoding - if enc] + encodings = ['utf-8', 'latin-1'] + if locale_encoding: + encodings.insert(1, locale_encoding) for enc in encodings: try: decoded = unicode(data, enc, self.error_handler) @@ -105,7 +105,7 @@ class Input(TransformSpec): coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)")) """Encoding declaration pattern.""" - byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig' + byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5 (codecs.BOM_UTF16_BE, 'utf-16-be'), (codecs.BOM_UTF16_LE, 'utf-16-le'),) """Sequence of (start_bytes, encoding) tuples for encoding detection. @@ -224,6 +224,15 @@ class FileInput(Input): sys.exit(1) else: self.source = sys.stdin + elif (sys.version_info >= (3,0) and + self.encoding and hasattr(self.source, 'encoding') and + self.encoding != self.source.encoding and + codecs.lookup(self.encoding) != + codecs.lookup(self.source.encoding)): + # TODO: re-open, warn or raise error? + raise UnicodeError('Encoding clash: encoding given is "%s" ' + 'but source is opened with encoding "%s".' % + (self.encoding, self.source.encoding)) if not source_path: try: self.source_path = self.source.name @@ -234,8 +243,25 @@ class FileInput(Input): """ Read and decode a single file and return the data (Unicode string). """ - try: - data = self.source.read() + try: # In Python < 2.5, try...except has to be nested in try...finally. 
+ try: + if self.source is sys.stdin and sys.version_info >= (3,0): + # read as binary data to circumvent auto-decoding + data = self.source.buffer.read() + # normalize newlines + data = b('\n').join(data.splitlines()) + b('\n') + else: + data = self.source.read() + except (UnicodeError, LookupError), err: # (in Py3k read() decodes) + if not self.encoding and self.source_path: + # re-read in binary mode and decode with heuristics + b_source = open(self.source_path, 'rb') + data = b_source.read() + b_source.close() + # normalize newlines + data = b('\n').join(data.splitlines()) + b('\n') + else: + raise finally: if self.autoclose: self.close() @@ -245,12 +271,7 @@ class FileInput(Input): """ Return lines of a single file as list of Unicode strings. """ - try: - lines = self.source.readlines() - finally: - if self.autoclose: - self.close() - return [self.decode(line) for line in lines] + return self.read().splitlines(True) def close(self): if self.source is not sys.stdin: @@ -302,7 +323,6 @@ class FileOutput(Output): 'errors': self.error_handler} else: kwargs = {} - try: self.destination = open(self.destination_path, 'w', **kwargs) except IOError, error: @@ -317,20 +337,34 @@ class FileOutput(Output): def write(self, data): """Encode `data`, write it to a single file, and return it. - In Python 3, a (unicode) string is returned. + In Python 3, `data` is returned unchanged. """ - if sys.version_info >= (3,0): - output = data # in py3k, write expects a (Unicode) string - else: - output = self.encode(data) + if sys.version_info < (3,0): + data = self.encode(data) if not self.opened: self.open() - try: - self.destination.write(output) + try: # In Python < 2.5, try...except has to be nested in try...finally. 
+ try: + if (sys.version_info >= (3,0) and self.encoding and + hasattr(self.destination,'encoding') and + self.encoding != self.destination.encoding and + codecs.lookup(self.encoding) != + codecs.lookup(self.destination.encoding)): + # encode self, write bytes + bdata = self.encode(data) + if os.linesep != '\n': + bdata = bdata.replace('\n', os.linesep) + sys.stdout.buffer.write(bdata) + else: + self.destination.write(data) + except (UnicodeError, LookupError), err: # can only happen in py3k + raise UnicodeError( + 'Unable to encode output data. output-encoding is: ' + '%s.\n(%s)' % (self.encoding, ErrorString(err))) finally: if self.autoclose: self.close() - return output + return data def close(self): if self.destination not in (sys.stdout, sys.stderr): -- 2.11.4.GIT