Fix [ 348 ] Since Python 3.4, the 'U' universal newlines mode has been deprecated...
[docutils.git] / docutils / docutils / io.py
blobdfcb1a73f786990a2c37ed5573057db8c4db8db7
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import re
15 import codecs
16 from docutils import TransformSpec
17 from docutils.utils.error_reporting import locale_encoding, ErrorString, ErrorOutput
class InputError(IOError):
    """I/O error on the input side (e.g. a source file cannot be opened)."""
class OutputError(IOError):
    """I/O error on the output side (e.g. a destination cannot be opened)."""
def check_encoding(stream, encoding):
    """Tell whether `stream.encoding` and `encoding` name the same codec.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if both names resolve to the same codec,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        requested_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # Unknown codec name, no `encoding` attribute, or a non-string value.
        return None
    return stream_codec == requested_codec
class Input(TransformSpec):

    """
    Abstract base class for input wrappers.

    Subclasses provide `read()`; `decode()` converts raw input data to a
    Unicode string.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.source = source
        """The source of input data."""

        self.source_path = source_path or self.default_source_path
        """A text reference to the source (class default if none is given)."""

        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        details = (self.__class__, self.source, self.source_path)
        return '%s: source=%r, source_path=%r' % details

    def read(self):
        """Return the input data; must be implemented by subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode a string, `data`, heuristically.
        Raise UnicodeError if unsuccessful.

        Unicode input is returned as-is.  Otherwise the candidate encodings
        are tried in order and the first one that works is recorded in
        `self.successful_encoding`.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'input encoding is "unicode" '
                'but input is not a unicode object')
        if isinstance(data, unicode):
            # Unicode is accepted even if self.encoding != 'unicode'.
            return data
        candidates = self._candidate_encodings(data)
        for candidate in candidates:
            try:
                text = unicode(data, candidate, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # Keep the last failure for the final error report (the
                # exception name is local to the except clause in Python 3).
                error = err
            else:
                self.successful_encoding = candidate
                # Strip BOMs from the decoded text.
                return text.replace(u'\ufeff', u'')
        tried = ', '.join(repr(candidate) for candidate in candidates)
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            '%s.\n(%s)' % (tried, ErrorString(error)))

    def _candidate_encodings(self, data):
        """Return the encodings `decode()` should try, in order."""
        if self.encoding:
            # An explicitly given encoding is trusted.
            return [self.encoding]
        declared = self.determine_encoding_from_data(data)
        if declared:
            # The data declares its encoding (explicitly or via a BOM).
            return [declared]
        # Heuristics only as a last resort.  UTF-8 goes first because it
        # only matches data that *is* UTF-8.
        candidates = ['utf-8', 'latin-1']
        if locale_encoding:
            candidates.insert(1, locale_encoding)
        return candidates

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.

        A byte order mark (BOM) is checked first, then an encoding
        declaration in the first two lines.  Return `None` if neither
        is found.
        """
        for bom, bom_encoding in self.byte_order_marks:
            if data.startswith(bom):
                return bom_encoding
        for line in data.splitlines()[:2]:
            declaration = self.coding_slug.search(line)
            if declaration:
                return declaration.group(1).decode('ascii')
        return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Subclasses provide `write()`; `encode()` converts Unicode output to
    the configured encoding.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.destination = destination
        """The destination for output data."""

        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination (class default if none
        is given)."""

        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding`.

        With the pseudo-encoding "unicode", `data` must be a Unicode string
        and is returned unchanged.  Non-Unicode data (e.g. bytes) is always
        passed through as-is.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True,
                 mode='r' if sys.version_info >= (3, 4) else 'rU', **kwargs):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'r' on Python >= 3.4 and 'rU'
              (universal newline support for text files) on older versions.

        The only extra keyword argument accepted is the deprecated
        `handle_io_errors`, which is ignored; any other raises `TypeError`.

        Raises `InputError` if `source_path` cannot be opened and, on
        Python 3, `UnicodeError` if an already opened `source` uses an
        encoding different from `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()
        # deprecation warning
        for key in kwargs:
            if key == 'handle_io_errors':
                sys.stderr.write('deprecation warning: '
                    'io.FileInput() argument `handle_io_errors` '
                    'is ignored since Docutils 0.10 (2012-12-16) '
                    'and will soon be removed.')
            else:
                raise TypeError('__init__() got an unexpected keyword '
                                "argument '%s'" % key)

        if source is None:
            if source_path:
                # Specify encoding in Python 3 -- but only in text mode:
                # open() rejects `encoding`/`errors` for binary modes
                # (same guard as in `FileOutput.open`).
                if sys.version_info >= (3, 0) and 'b' not in mode:
                    open_kwargs = {'encoding': self.encoding,
                                   'errors': self.error_handler}
                else:
                    open_kwargs = {}

                try:
                    self.source = open(source_path, mode, **open_kwargs)
                except IOError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif (sys.version_info >= (3, 0) and
              check_encoding(self.source, self.encoding) is False):
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # Fall back to the name of the file-like object, if it has one.
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        If reading fails with a decoding error and no encoding was given,
        the file at `self.source_path` is re-read in binary mode and decoded
        heuristically by `decode()`.  The source is closed afterwards if
        `self.autoclose` is true.
        """
        try:
            if self.source is sys.stdin and sys.version_info >= (3, 0):
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):  # (in Py3k read() decodes)
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics;
                # the `with` block closes the file even if read() fails.
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        return self.decode(data)

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):

    """
    Output for single, simple file-like objects.
    """

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.

        A file given by `destination_path` is not opened here but on the
        first `write()`.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # Opening is deferred until `write()` is called.
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (# destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            # Plain write() instead of the Python-2-only ``print >>``
            # statement; the emitted text (message plus newline) is the same.
            self._stderr.write('Warning: Destination mode "%s" '
                               'differs from specified mode "%s"' %
                               (self.destination.mode, mode) + '\n')
        if not destination_path:
            # Fall back to the name of the file-like object, if it has one.
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """
        Open `self.destination_path` with `self.mode`.

        Raises `OutputError` if the file cannot be opened.
        """
        # Specify encoding in Python 3.
        if sys.version_info >= (3, 0) and 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except IOError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        With Python 3 or binary output mode, `data` is returned unchanged,
        except when specified encoding and output encoding differ.

        Raises `UnicodeError` if encoding fails while writing, `ValueError`
        if bytes cannot be written because the destination's encoding
        differs from the specified one, and `TypeError` if the destination
        rejects `data` for any other reason.
        """
        if not self.opened:
            self.open()
        if ('b' not in self.mode and sys.version_info < (3, 0)
                or check_encoding(self.destination, self.encoding) is False):
            data = self.encode(data)
            if sys.version_info >= (3, 0) and os.linesep != '\n':
                # fix endings
                data = data.replace(b'\n', bytes(os.linesep, 'ascii'))

        try:
            self.destination.write(data)
        except TypeError as e:
            if sys.version_info >= (3, 0) and isinstance(data, bytes):
                # A text stream rejected bytes: try its binary buffer.
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    if check_encoding(self.destination,
                                      self.encoding) is False:
                        raise ValueError('Encoding of %s (%s) differs \n'
                            ' from specified encoding (%s)' %
                            (self.destination_path or 'destination',
                            self.destination.encoding, self.encoding))
                    else:
                        raise e
            else:
                # Nothing was written and there is no fallback: report the
                # failure instead of silently dropping the data.
                raise
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                '%s.\n(%s)' % (self.encoding, ErrorString(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout`/`sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    `FileOutput` variant that opens its destination in binary mode.
    """
    # Needed by core.publish_cmdline_to_binary(), which rst2odt
    # (the OpenOffice writer) relies on.
    mode = 'wb'
class StringInput(Input):

    """
    Input taken directly from a string held in `self.source`.
    """

    default_source_path = '<string>'

    def read(self):
        """Return the source string, decoded to Unicode."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Output kept as a string in `self.destination`.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return self.destination
class NullInput(Input):

    """
    Degenerate input: there is nothing to read.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string."""
        return u''
class NullOutput(Output):

    """
    Degenerate output: nothing is written.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data` without writing it anywhere; return `None`."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree is expected in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree unchanged (no decoding is done)."""
        doctree = self.source
        return doctree