Make tools/ compatible with both Python 2 and 3 without 2to3-conversion.
[docutils.git] / docutils / io.py
blobb992e6d710305272e61743e6bcf766804f996ed1
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import re
15 import codecs
16 from docutils import TransformSpec
17 from docutils._compat import b
18 from docutils.error_reporting import locale_encoding, ErrorString, ErrorOutput
class InputError(IOError):
    """I/O error raised by input classes (e.g. a source cannot be opened)."""
class OutputError(IOError):
    """I/O error raised by output classes (e.g. a destination cannot be opened)."""
def check_encoding(stream, encoding):
    """Tell whether the encoding of `stream` matches `encoding`.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if `stream.encoding` resolves to the same codec as `encoding`,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # Unknown codec name, no `encoding` attribute, or a non-string value.
        return None
    return stream_codec == wanted_codec
40 class Input(TransformSpec):
42 """
43 Abstract base class for input wrappers.
44 """
46 component_type = 'input'
48 default_source_path = None
50 def __init__(self, source=None, source_path=None, encoding=None,
51 error_handler='strict'):
52 self.encoding = encoding
53 """Text encoding for the input source."""
55 self.error_handler = error_handler
56 """Text decoding error handler."""
58 self.source = source
59 """The source of input data."""
61 self.source_path = source_path
62 """A text reference to the source."""
64 if not source_path:
65 self.source_path = self.default_source_path
67 self.successful_encoding = None
68 """The encoding that successfully decoded the source data."""
70 def __repr__(self):
71 return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
72 self.source_path)
74 def read(self):
75 raise NotImplementedError
77 def decode(self, data):
78 """
79 Decode a string, `data`, heuristically.
80 Raise UnicodeError if unsuccessful.
82 The client application should call ``locale.setlocale`` at the
83 beginning of processing::
85 locale.setlocale(locale.LC_ALL, '')
86 """
87 if self.encoding and self.encoding.lower() == 'unicode':
88 assert isinstance(data, unicode), (
89 'input encoding is "unicode" '
90 'but input is not a unicode object')
91 if isinstance(data, unicode):
92 # Accept unicode even if self.encoding != 'unicode'.
93 return data
94 if self.encoding:
95 # We believe the user/application when the encoding is
96 # explicitly given.
97 encodings = [self.encoding]
98 else:
99 data_encoding = self.determine_encoding_from_data(data)
100 if data_encoding:
101 # If the data declares its encoding (explicitly or via a BOM),
102 # we believe it.
103 encodings = [data_encoding]
104 else:
105 # Apply heuristics only if no encoding is explicitly given and
106 # no BOM found. Start with UTF-8, because that only matches
107 # data that *IS* UTF-8:
108 encodings = ['utf-8', 'latin-1']
109 if locale_encoding:
110 encodings.insert(1, locale_encoding)
111 for enc in encodings:
112 try:
113 decoded = unicode(data, enc, self.error_handler)
114 self.successful_encoding = enc
115 # Return decoded, removing BOMs.
116 return decoded.replace(u'\ufeff', u'')
117 except (UnicodeError, LookupError), err:
118 error = err # in Python 3, the <exception instance> is
119 # local to the except clause
120 raise UnicodeError(
121 'Unable to decode input data. Tried the following encodings: '
122 '%s.\n(%s)' % (', '.join([repr(enc) for enc in encodings]),
123 ErrorString(error)))
125 coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
126 """Encoding declaration pattern."""
128 byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
129 (codecs.BOM_UTF16_BE, 'utf-16-be'),
130 (codecs.BOM_UTF16_LE, 'utf-16-le'),)
131 """Sequence of (start_bytes, encoding) tuples for encoding detection.
132 The first bytes of input data are checked against the start_bytes strings.
133 A match indicates the given encoding."""
135 def determine_encoding_from_data(self, data):
137 Try to determine the encoding of `data` by looking *in* `data`.
138 Check for a byte order mark (BOM) or an encoding declaration.
140 # check for a byte order mark:
141 for start_bytes, encoding in self.byte_order_marks:
142 if data.startswith(start_bytes):
143 return encoding
144 # check for an encoding declaration pattern in first 2 lines of file:
145 for line in data.splitlines()[:2]:
146 match = self.coding_slug.search(line)
147 if match:
148 return match.group(1).decode('ascii')
149 return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # Fall back to the class-level default when no path is given.
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding` and `self.error_handler`.

        `data` is returned unchanged if the encoding is "unicode" or if
        `data` is not a Unicode string.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. binary) output.
        return data
200 class FileInput(Input):
203 Input for single, simple file-like objects.
205 def __init__(self, source=None, source_path=None,
206 encoding=None, error_handler='strict',
207 autoclose=True, handle_io_errors=True, mode='rU'):
209 :Parameters:
210 - `source`: either a file-like object (which is read directly), or
211 `None` (which implies `sys.stdin` if no `source_path` given).
212 - `source_path`: a path to a file, which is opened and then read.
213 - `encoding`: the expected text encoding of the input file.
214 - `error_handler`: the encoding error handler to use.
215 - `autoclose`: close automatically after read (except when
216 `sys.stdin` is the source).
217 - `handle_io_errors`: summarize I/O errors here, and exit?
218 - `mode`: how the file is to be opened (see standard function
219 `open`). The default 'rU' provides universal newline support
220 for text files.
222 Input.__init__(self, source, source_path, encoding, error_handler)
223 self.autoclose = autoclose
224 self.handle_io_errors = handle_io_errors
225 self._stderr = ErrorOutput()
227 if source is None:
228 if source_path:
229 # Specify encoding in Python 3
230 if sys.version_info >= (3,0):
231 kwargs = {'encoding': self.encoding,
232 'errors': self.error_handler}
233 else:
234 kwargs = {}
236 try:
237 self.source = open(source_path, mode, **kwargs)
238 except IOError, error:
239 if handle_io_errors:
240 print >>self._stderr, ErrorString(error)
241 print >>self._stderr, (
242 u'Unable to open source file for reading ("%s").'
243 u'Exiting.' % source_path)
244 sys.exit(1)
245 raise InputError(error.errno, error.strerror, source_path)
246 else:
247 self.source = sys.stdin
248 elif (sys.version_info >= (3,0) and
249 check_encoding(self.source, self.encoding) is False):
250 # TODO: re-open, warn or raise error?
251 raise UnicodeError('Encoding clash: encoding given is "%s" '
252 'but source is opened with encoding "%s".' %
253 (self.encoding, self.source.encoding))
254 if not source_path:
255 try:
256 self.source_path = self.source.name
257 except AttributeError:
258 pass
260 def read(self):
262 Read and decode a single file and return the data (Unicode string).
264 try: # In Python < 2.5, try...except has to be nested in try...finally.
265 try:
266 if self.source is sys.stdin and sys.version_info >= (3,0):
267 # read as binary data to circumvent auto-decoding
268 data = self.source.buffer.read()
269 # normalize newlines
270 data = b('\n').join(data.splitlines()) + b('\n')
271 else:
272 data = self.source.read()
273 except (UnicodeError, LookupError), err: # (in Py3k read() decodes)
274 if not self.encoding and self.source_path:
275 # re-read in binary mode and decode with heuristics
276 b_source = open(self.source_path, 'rb')
277 data = b_source.read()
278 b_source.close()
279 # normalize newlines
280 data = b('\n').join(data.splitlines()) + b('\n')
281 else:
282 raise
283 finally:
284 if self.autoclose:
285 self.close()
286 return self.decode(data)
288 def readlines(self):
290 Return lines of a single file as list of Unicode strings.
292 return self.read().splitlines(True)
294 def close(self):
295 if self.source is not sys.stdin:
296 self.source.close()
299 class FileOutput(Output):
302 Output for single, simple file-like objects.
305 mode = 'w'
306 """The mode argument for `open()`."""
307 # 'wb' for binary (e.g. OpenOffice) files.
308 # (Do not use binary mode ('wb') for text files, as this prevents the
309 # conversion of newlines to the system specific default.)
311 def __init__(self, destination=None, destination_path=None,
312 encoding=None, error_handler='strict', autoclose=True,
313 handle_io_errors=True, mode=None):
315 :Parameters:
316 - `destination`: either a file-like object (which is written
317 directly) or `None` (which implies `sys.stdout` if no
318 `destination_path` given).
319 - `destination_path`: a path to a file, which is opened and then
320 written.
321 - `encoding`: the text encoding of the output file.
322 - `error_handler`: the encoding error handler to use.
323 - `autoclose`: close automatically after write (except when
324 `sys.stdout` or `sys.stderr` is the destination).
325 - `handle_io_errors`: summarize I/O errors here, and exit?
326 - `mode`: how the file is to be opened (see standard function
327 `open`). The default is 'w', providing universal newline
328 support for text files.
330 Output.__init__(self, destination, destination_path,
331 encoding, error_handler)
332 self.opened = True
333 self.autoclose = autoclose
334 self.handle_io_errors = handle_io_errors
335 if mode is not None:
336 self.mode = mode
337 self._stderr = ErrorOutput()
338 if destination is None:
339 if destination_path:
340 self.opened = False
341 else:
342 self.destination = sys.stdout
343 elif (# destination is file-type object -> check mode:
344 mode and hasattr(self.destination, 'mode')
345 and mode != self.destination.mode):
346 print >>self._stderr, ('Destination mode "%s" '
347 'differs from specified mode "%s"' %
348 (self.destination.mode, mode))
349 if not destination_path:
350 try:
351 self.destination_path = self.destination.name
352 except AttributeError:
353 pass
354 # Special cases under Python 3: different encoding or binary output
355 if sys.version_info >= (3,0):
356 if ('b' in self.mode
357 and self.destination in (sys.stdout, sys.stderr)
359 self.destination = self.destination.buffer
360 if check_encoding(self.destination, self.encoding) is False:
361 if self.destination in (sys.stdout, sys.stderr):
362 self.destination = self.destination.buffer
363 else: # TODO: try the `write to .buffer` scheme instead?
364 raise ValueError('Encoding of %s (%s) differs \n'
365 ' from specified encoding (%s)' %
366 (self.destination_path or 'destination',
367 destination.encoding, encoding))
370 def open(self):
371 # Specify encoding in Python 3.
372 if sys.version_info >= (3,0):
373 kwargs = {'encoding': self.encoding,
374 'errors': self.error_handler}
375 else:
376 kwargs = {}
377 try:
378 self.destination = open(self.destination_path, self.mode, **kwargs)
379 except IOError, error:
380 if self.handle_io_errors:
381 print >>self._stderr, ErrorString(error)
382 print >>self._stderr, (u'Unable to open destination file'
383 u" for writing ('%s'). Exiting." % self.destination_path)
384 sys.exit(1)
385 raise OutputError(error.errno, error.strerror,
386 self.destination_path)
387 self.opened = True
389 def write(self, data):
390 """Encode `data`, write it to a single file, and return it.
392 With Python 3 or binary output mode, `data` is returned unchanged,
393 except when specified encoding and output encoding differ.
395 if not self.opened:
396 self.open()
397 try: # In Python < 2.5, try...except has to be nested in try...finally.
398 try:
399 if 'b' not in self.mode and (sys.version_info < (3,0) or
400 check_encoding(self.destination, self.encoding) is False):
401 data = self.encode(data)
402 if sys.version_info >= (3,0) and os.linesep != '\n':
403 # writing as binary data -> fix endings
404 data = data.replace('\n', os.linesep)
406 self.destination.write(data)
408 except (UnicodeError, LookupError), err:
409 raise UnicodeError(
410 'Unable to encode output data. output-encoding is: '
411 '%s.\n(%s)' % (self.encoding, ErrorString(err)))
412 finally:
413 if self.autoclose:
414 self.close()
415 return data
417 def close(self):
418 if self.destination not in (sys.stdout, sys.stderr):
419 self.destination.close()
420 self.opened = False
class BinaryFileOutput(FileOutput):

    """
    A `docutils.io.FileOutput` variant that opens its file in binary mode.
    """

    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # rst2odt (OpenOffice writer)
    mode = 'wb'
class StringInput(Input):

    """
    Direct string input.
    """

    default_source_path = '<string>'

    def read(self):
        """Return the source string, decoded by `self.decode`."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Direct string output.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
class NullInput(Input):

    """
    Degenerate input: read nothing.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string."""
        nothing = u''
        return nothing
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data`: nothing is written or stored; return None."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree (i.e. `self.source`, unchanged)."""
        doctree = self.source
        return doctree