Backwards-compatible fix for system-exit on IOError.
[docutils.git] / docutils / io.py
blob842269c95a01bdda86a5c7340b16167dad2681a2
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import re
15 import codecs
16 from docutils import TransformSpec
17 from docutils._compat import b
18 from docutils.error_reporting import locale_encoding, ErrorString, ErrorOutput
class InputError(IOError):
    """I/O error raised for problems with an input source."""
class OutputError(IOError):
    """I/O error raised for problems with an output destination."""
class Input(TransformSpec):

    """
    Abstract base class for input wrappers.

    Subclasses implement `read()`; this class stores the source settings
    and provides `decode()` for turning raw input into a Unicode string.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        """
        Store the input settings.

        :Parameters:
            - `source`: the object that input data is taken from.
            - `source_path`: a text reference to the source; falls back to
              `default_source_path` when empty.
            - `encoding`: the text encoding of the input, if known.
            - `error_handler`: the decoding error handler.
        """
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path or self.default_source_path
        """A text reference to the source."""

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        details = (self.__class__, self.source, self.source_path)
        return '%s: source=%r, source_path=%r' % details

    def read(self):
        """Return the input data; must be implemented by subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode a string, `data`, heuristically.
        Raise UnicodeError if unsuccessful.

        Unicode input is returned unchanged.  Otherwise the candidate
        encodings are, in order of precedence: the explicitly given
        `self.encoding`; an encoding declared in the data (BOM or coding
        slug); and finally the guesses 'utf-8', the locale encoding (if
        any) and 'latin-1'.  The first candidate that decodes the data is
        recorded in `self.successful_encoding`.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'input encoding is "unicode" '
                'but input is not a unicode object')
        if isinstance(data, unicode):
            # Already decoded: accepted whatever `self.encoding` says.
            return data

        if self.encoding:
            # An explicitly given encoding is trusted.
            candidates = [self.encoding]
        else:
            declared = self.determine_encoding_from_data(data)
            if declared:
                # So is an encoding announced by the data itself.
                candidates = [declared]
            elif locale_encoding:
                # Guessing: UTF-8 goes first, because it only matches
                # data that really is UTF-8.
                candidates = ['utf-8', locale_encoding, 'latin-1']
            else:
                candidates = ['utf-8', 'latin-1']

        failure = None
        for codec_name in candidates:
            try:
                text = unicode(data, codec_name, self.error_handler)
            except (UnicodeError, LookupError):
                # sys.exc_info() keeps the exception available after the
                # handler on Python 2 and Python 3 alike.
                failure = sys.exc_info()[1]
                continue
            self.successful_encoding = codec_name
            # Return the decoded text, removing BOMs.
            return text.replace(u'\ufeff', u'')
        tried = ', '.join(map(repr, candidates))
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            '%s.\n(%s)' % (tried, ErrorString(failure)))

    coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name, or None if neither is found.
        """
        # A byte order mark at the very start of the data wins.
        for bom, bom_encoding in self.byte_order_marks:
            if data.startswith(bom):
                return bom_encoding
        # Otherwise look for a declaration in the first two lines.
        for header_line in data.splitlines()[:2]:
            found = self.coding_slug.search(header_line)
            if found:
                return found.group(1).decode('ascii')
        return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Subclasses implement `write()`; this class stores the destination
    settings and provides `encode()` for encoding Unicode output.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        """
        Store the output settings.

        :Parameters:
            - `destination`: the object that output data is sent to.
            - `destination_path`: a text reference to the destination;
              falls back to `default_destination_path` when empty.
            - `encoding`: the text encoding of the output.
            - `error_handler`: the encoding error handler; an empty value
              is replaced by 'strict'.
        """
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding`.

        With the pseudo-encoding "unicode", `data` must be a Unicode string
        and is returned as is.  Non-Unicode (e.g. binary) data is always
        passed through unchanged.
        """
        keep_unicode = self.encoding and self.encoding.lower() == 'unicode'
        if keep_unicode:
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. binary) output.
        return data
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """

    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, handle_io_errors=True, mode='rU'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `handle_io_errors`: summarize I/O errors here, and exit?
              If true, a failure to open `source_path` is reported on the
              error output and the process exits with status 1; if false,
              `InputError` is raised instead.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default 'rU' provides universal newline support
              for text files.

        Under Python 3, `UnicodeError` is raised if `source` is already
        open with an encoding different from `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self.handle_io_errors = handle_io_errors
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                # Specify encoding in Python 3
                if sys.version_info >= (3,0):
                    kwargs = {'encoding': self.encoding,
                              'errors': self.error_handler}
                else:
                    kwargs = {}

                try:
                    self.source = open(source_path, mode, **kwargs)
                except IOError:
                    # sys.exc_info() instead of "except IOError, error"
                    # keeps this parseable by Python 2 and Python 3.
                    error = sys.exc_info()[1]
                    if handle_io_errors:
                        print >>self._stderr, ErrorString(error)
                        # Trailing space after the first sentence: the two
                        # literals are concatenated into one message.
                        print >>self._stderr, (
                            u'Unable to open source file for reading ("%s"). '
                            u'Exiting.' % source_path)
                        sys.exit(1)
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif (sys.version_info >= (3,0) and
              self.encoding and
              # Streams without a (non-empty) `encoding`, e.g. in-memory
              # text buffers, cannot clash; skipping them also avoids
              # calling codecs.lookup(None), which raises TypeError.
              getattr(self.source, 'encoding', None) and
              self.encoding != self.source.encoding and
              codecs.lookup(self.encoding) !=
              codecs.lookup(self.source.encoding)):
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            try:
                self.source_path = self.source.name
            except AttributeError:
                # Not every file-like object has a name; keep the default.
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        If reading fails with a decoding error, no encoding was given and
        a source path is known, the file is re-read in binary mode and
        decoded heuristically by `decode()`.  The source is closed
        afterwards if `autoclose` is set.
        """
        try: # In Python < 2.5, try...except has to be nested in try...finally.
            try:
                if self.source is sys.stdin and sys.version_info >= (3,0):
                    # read as binary data to circumvent auto-decoding
                    data = self.source.buffer.read()
                    # normalize newlines
                    data = b('\n').join(data.splitlines()) + b('\n')
                else:
                    data = self.source.read()
            except (UnicodeError, LookupError): # (in Py3k read() decodes)
                if not self.encoding and self.source_path:
                    # re-read in binary mode and decode with heuristics
                    b_source = open(self.source_path, 'rb')
                    try:
                        data = b_source.read()
                    finally:
                        # Close the handle even if reading fails.
                        b_source.close()
                    # normalize newlines
                    data = b('\n').join(data.splitlines()) + b('\n')
                else:
                    raise
        finally:
            if self.autoclose:
                self.close()
        return self.decode(data)

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.

        Line endings are kept.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):

    """
    Output for single, simple file-like objects.
    """

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files.
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=True, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: summarize I/O errors here, and exit?
              If true, a failure to open the destination is reported on the
              error output and the process exits with status 1; if false,
              `OutputError` is raised instead.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.

        A file given by `destination_path` is not opened here but on the
        first call to `write()`.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        self.handle_io_errors = handle_io_errors
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # Opening is deferred until `write()` is called.
                self.opened = False
            else:
                self.destination = sys.stdout
        if not destination_path:
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                # Not every file-like object has a name; keep the default.
                pass

    def open(self):
        """
        Open `self.destination_path` with `self.mode`.

        On failure, either report the error and exit (if
        `handle_io_errors` is set) or raise `OutputError`.
        """
        # Specify encoding in Python 3.
        if sys.version_info >= (3,0):
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except IOError:
            # sys.exc_info() instead of "except IOError, error" keeps this
            # parseable by Python 2 and Python 3.
            error = sys.exc_info()[1]
            if self.handle_io_errors:
                print >>self._stderr, ErrorString(error)
                print >>self._stderr, (u'Unable to open destination file'
                    u" for writing ('%s'). Exiting." % self.destination_path)
                sys.exit(1)
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        In Python 3, `data` is returned unchanged.

        Raise `UnicodeError` if the data cannot be encoded.  The
        destination is closed afterwards if `autoclose` is set.
        """
        if sys.version_info < (3,0):
            data = self.encode(data)
        if not self.opened:
            self.open()
        try: # In Python < 2.5, try...except has to be nested in try...finally.
            try:
                if (sys.version_info >= (3,0) and self.encoding and
                    # Streams without a (non-empty) `encoding` cannot
                    # clash; skipping them also avoids calling
                    # codecs.lookup(None), which raises TypeError.
                    getattr(self.destination, 'encoding', None) and
                    self.encoding != self.destination.encoding and
                    codecs.lookup(self.encoding) !=
                    codecs.lookup(self.destination.encoding)):
                    # The stream would encode with the wrong encoding:
                    # encode here and write bytes to its binary buffer.
                    bdata = self.encode(data)
                    if os.linesep != '\n':
                        # `bdata` is bytes, so both arguments to
                        # replace() must be bytes as well.
                        bdata = bdata.replace(b('\n'), b(os.linesep))
                    # Write to the buffer of the actual destination, which
                    # is not necessarily sys.stdout.
                    self.destination.buffer.write(bdata)
                else:
                    self.destination.write(data)
            except (UnicodeError, LookupError): # can only happen in py3k
                err = sys.exc_info()[1]
                raise UnicodeError(
                    'Unable to encode output data. output-encoding is: '
                    '%s.\n(%s)' % (self.encoding, ErrorString(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):

    """
    A version of docutils.io.FileOutput which writes to a binary file.

    The only difference to the base class is the default open mode 'wb'.
    """

    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # rst2odt (OpenOffice writer)
    mode = 'wb'
class StringInput(Input):

    """
    Direct string input.

    The string to read is passed as `source`.
    """

    default_source_path = '<string>'

    def read(self):
        """Return the source string, decoded by `self.decode`."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Direct string output.

    The encoded result of the last `write()` is kept in `self.destination`.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, store it in `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
class NullInput(Input):

    """
    Degenerate input: read nothing.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string; the source is never consulted."""
        return u''
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data`: nothing is written and None is returned."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree, i.e. `self.source`, unchanged."""
        doctree = self.source
        return doctree