Add test case for issue 2926161 (commented out)
[docutils.git] / docutils / io.py
blob40630af558affb4f852abde621af844b151c55ca
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 import os
14 import re
15 import codecs
16 from docutils import TransformSpec
17 from docutils._compat import b
18 from docutils.error_reporting import locale_encoding, ErrorString, ErrorOutput
20 class Input(TransformSpec):
22 """
23 Abstract base class for input wrappers.
24 """
26 component_type = 'input'
28 default_source_path = None
30 def __init__(self, source=None, source_path=None, encoding=None,
31 error_handler='strict'):
32 self.encoding = encoding
33 """Text encoding for the input source."""
35 self.error_handler = error_handler
36 """Text decoding error handler."""
38 self.source = source
39 """The source of input data."""
41 self.source_path = source_path
42 """A text reference to the source."""
44 if not source_path:
45 self.source_path = self.default_source_path
47 self.successful_encoding = None
48 """The encoding that successfully decoded the source data."""
50 def __repr__(self):
51 return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
52 self.source_path)
54 def read(self):
55 raise NotImplementedError
57 def decode(self, data):
58 """
59 Decode a string, `data`, heuristically.
60 Raise UnicodeError if unsuccessful.
62 The client application should call ``locale.setlocale`` at the
63 beginning of processing::
65 locale.setlocale(locale.LC_ALL, '')
66 """
67 if self.encoding and self.encoding.lower() == 'unicode':
68 assert isinstance(data, unicode), (
69 'input encoding is "unicode" '
70 'but input is not a unicode object')
71 if isinstance(data, unicode):
72 # Accept unicode even if self.encoding != 'unicode'.
73 return data
74 if self.encoding:
75 # We believe the user/application when the encoding is
76 # explicitly given.
77 encodings = [self.encoding]
78 else:
79 data_encoding = self.determine_encoding_from_data(data)
80 if data_encoding:
81 # If the data declares its encoding (explicitly or via a BOM),
82 # we believe it.
83 encodings = [data_encoding]
84 else:
85 # Apply heuristics only if no encoding is explicitly given and
86 # no BOM found. Start with UTF-8, because that only matches
87 # data that *IS* UTF-8:
88 encodings = ['utf-8', 'latin-1']
89 if locale_encoding:
90 encodings.insert(1, locale_encoding)
91 for enc in encodings:
92 try:
93 decoded = unicode(data, enc, self.error_handler)
94 self.successful_encoding = enc
95 # Return decoded, removing BOMs.
96 return decoded.replace(u'\ufeff', u'')
97 except (UnicodeError, LookupError), err:
98 error = err # in Python 3, the <exception instance> is
99 # local to the except clause
100 raise UnicodeError(
101 'Unable to decode input data. Tried the following encodings: '
102 '%s.\n(%s)' % (', '.join([repr(enc) for enc in encodings]),
103 ErrorString(error)))
105 coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
106 """Encoding declaration pattern."""
108 byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
109 (codecs.BOM_UTF16_BE, 'utf-16-be'),
110 (codecs.BOM_UTF16_LE, 'utf-16-le'),)
111 """Sequence of (start_bytes, encoding) tuples for encoding detection.
112 The first bytes of input data are checked against the start_bytes strings.
113 A match indicates the given encoding."""
115 def determine_encoding_from_data(self, data):
117 Try to determine the encoding of `data` by looking *in* `data`.
118 Check for a byte order mark (BOM) or an encoding declaration.
120 # check for a byte order mark:
121 for start_bytes, encoding in self.byte_order_marks:
122 if data.startswith(start_bytes):
123 return encoding
124 # check for an encoding declaration pattern in first 2 lines of file:
125 for line in data.splitlines()[:2]:
126 match = self.coding_slug.search(line)
127 if match:
128 return match.group(1).decode('ascii')
129 return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Concrete subclasses supply `write()`; `encode()` converts Unicode
    output into the configured encoding.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        # A false value (e.g. None) is replaced by 'strict'.
        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # An empty/missing path falls back to the class-level default.
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding`.

        With the pseudo-encoding "unicode", `data` must be a Unicode string
        and is returned as is.  Non-Unicode data is also passed through
        unchanged.
        """
        keep_unicode = self.encoding and self.encoding.lower() == 'unicode'
        if keep_unicode:
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. binary) output.
        return data
180 class FileInput(Input):
183 Input for single, simple file-like objects.
185 def __init__(self, source=None, source_path=None,
186 encoding=None, error_handler='strict',
187 autoclose=True, handle_io_errors=True, mode='rU'):
189 :Parameters:
190 - `source`: either a file-like object (which is read directly), or
191 `None` (which implies `sys.stdin` if no `source_path` given).
192 - `source_path`: a path to a file, which is opened and then read.
193 - `encoding`: the expected text encoding of the input file.
194 - `error_handler`: the encoding error handler to use.
195 - `autoclose`: close automatically after read (except when
196 `sys.stdin` is the source).
197 - `handle_io_errors`: summarize I/O errors here, and exit?
198 - `mode`: how the file is to be opened (see standard function
199 `open`). The default 'rU' provides universal newline support
200 for text files.
202 Input.__init__(self, source, source_path, encoding, error_handler)
203 self.autoclose = autoclose
204 self.handle_io_errors = handle_io_errors
205 self._stderr = ErrorOutput()
207 if source is None:
208 if source_path:
209 # Specify encoding in Python 3
210 if sys.version_info >= (3,0):
211 kwargs = {'encoding': self.encoding,
212 'errors': self.error_handler}
213 else:
214 kwargs = {}
216 try:
217 self.source = open(source_path, mode, **kwargs)
218 except IOError, error:
219 if not handle_io_errors:
220 raise
221 print >>self._stderr, ErrorString(error)
222 print >>self._stderr, (u'Unable to open source'
223 u" file for reading ('%s'). Exiting." % source_path)
224 sys.exit(1)
225 else:
226 self.source = sys.stdin
227 elif (sys.version_info >= (3,0) and
228 self.encoding and hasattr(self.source, 'encoding') and
229 self.encoding != self.source.encoding and
230 codecs.lookup(self.encoding) !=
231 codecs.lookup(self.source.encoding)):
232 # TODO: re-open, warn or raise error?
233 raise UnicodeError('Encoding clash: encoding given is "%s" '
234 'but source is opened with encoding "%s".' %
235 (self.encoding, self.source.encoding))
236 if not source_path:
237 try:
238 self.source_path = self.source.name
239 except AttributeError:
240 pass
242 def read(self):
244 Read and decode a single file and return the data (Unicode string).
246 try: # In Python < 2.5, try...except has to be nested in try...finally.
247 try:
248 if self.source is sys.stdin and sys.version_info >= (3,0):
249 # read as binary data to circumvent auto-decoding
250 data = self.source.buffer.read()
251 # normalize newlines
252 data = b('\n').join(data.splitlines()) + b('\n')
253 else:
254 data = self.source.read()
255 except (UnicodeError, LookupError), err: # (in Py3k read() decodes)
256 if not self.encoding and self.source_path:
257 # re-read in binary mode and decode with heuristics
258 b_source = open(self.source_path, 'rb')
259 data = b_source.read()
260 b_source.close()
261 # normalize newlines
262 data = b('\n').join(data.splitlines()) + b('\n')
263 else:
264 raise
265 finally:
266 if self.autoclose:
267 self.close()
268 return self.decode(data)
270 def readlines(self):
272 Return lines of a single file as list of Unicode strings.
274 return self.read().splitlines(True)
276 def close(self):
277 if self.source is not sys.stdin:
278 self.source.close()
281 class FileOutput(Output):
284 Output for single, simple file-like objects.
287 def __init__(self, destination=None, destination_path=None,
288 encoding=None, error_handler='strict', autoclose=True,
289 handle_io_errors=True):
291 :Parameters:
292 - `destination`: either a file-like object (which is written
293 directly) or `None` (which implies `sys.stdout` if no
294 `destination_path` given).
295 - `destination_path`: a path to a file, which is opened and then
296 written.
297 - `autoclose`: close automatically after write (except when
298 `sys.stdout` or `sys.stderr` is the destination).
300 Output.__init__(self, destination, destination_path,
301 encoding, error_handler)
302 self.opened = True
303 self.autoclose = autoclose
304 self.handle_io_errors = handle_io_errors
305 self._stderr = ErrorOutput()
306 if destination is None:
307 if destination_path:
308 self.opened = False
309 else:
310 self.destination = sys.stdout
311 if not destination_path:
312 try:
313 self.destination_path = self.destination.name
314 except AttributeError:
315 pass
317 def open(self):
318 # Specify encoding in Python 3.
319 # (Do not use binary mode ('wb') as this prevents the
320 # conversion of newlines to the system specific default.)
321 if sys.version_info >= (3,0):
322 kwargs = {'encoding': self.encoding,
323 'errors': self.error_handler}
324 else:
325 kwargs = {}
326 try:
327 self.destination = open(self.destination_path, 'w', **kwargs)
328 except IOError, error:
329 if not self.handle_io_errors:
330 raise
331 print >>self._stderr, ErrorString(error)
332 print >>self._stderr, (u'Unable to open destination file'
333 u" for writing ('%s'). Exiting." % self.destination_path)
334 sys.exit(1)
335 self.opened = True
337 def write(self, data):
338 """Encode `data`, write it to a single file, and return it.
340 In Python 3, `data` is returned unchanged.
342 if sys.version_info < (3,0):
343 data = self.encode(data)
344 if not self.opened:
345 self.open()
346 try: # In Python < 2.5, try...except has to be nested in try...finally.
347 try:
348 if (sys.version_info >= (3,0) and self.encoding and
349 hasattr(self.destination,'encoding') and
350 self.encoding != self.destination.encoding and
351 codecs.lookup(self.encoding) !=
352 codecs.lookup(self.destination.encoding)):
353 # encode self, write bytes
354 bdata = self.encode(data)
355 if os.linesep != '\n':
356 bdata = bdata.replace('\n', os.linesep)
357 sys.stdout.buffer.write(bdata)
358 else:
359 self.destination.write(data)
360 except (UnicodeError, LookupError), err: # can only happen in py3k
361 raise UnicodeError(
362 'Unable to encode output data. output-encoding is: '
363 '%s.\n(%s)' % (self.encoding, ErrorString(err)))
364 finally:
365 if self.autoclose:
366 self.close()
367 return data
369 def close(self):
370 if self.destination not in (sys.stdout, sys.stderr):
371 self.destination.close()
372 self.opened = False
375 class BinaryFileOutput(FileOutput):
377 A version of docutils.io.FileOutput which writes to a binary file.
379 def open(self):
380 try:
381 self.destination = open(self.destination_path, 'wb')
382 except IOError, error:
383 if not self.handle_io_errors:
384 raise
385 print >>self._stderr, ErrorString(error)
386 print >>self._stderr, (u'Unable to open destination file'
387 u" for writing ('%s'). Exiting." % self.destination_path)
388 sys.exit(1)
389 self.opened = True
class StringInput(Input):

    """
    Direct string input.
    """

    default_source_path = '<string>'

    def read(self):
        """Return `self.source`, decoded via `self.decode`."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Direct string output.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, store it in `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
class NullInput(Input):

    """
    Degenerate input: read nothing.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string."""
        nothing = u''
        return nothing
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data`; nothing is written and None is returned."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree (`self.source`) as is, undecoded."""
        doctree = self.source
        return doctree