2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
16 from docutils
import TransformSpec
17 from docutils
.utils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
class InputError(IOError):
    """I/O error raised when an input source cannot be opened."""
class OutputError(IOError):
    """I/O error raised when an output destination cannot be opened."""
def check_encoding(stream, encoding):
    """Report whether `stream` uses the codec named by `encoding`.

    Both names are resolved with ``codecs.lookup`` and the resulting codec
    records are compared, so different spellings of one codec match.

    Returns

    :True:  `stream.encoding` and `encoding` resolve to the same codec.
    :False: both resolve, but to different codecs.
    :None:  no comparison is possible: `encoding` or `stream.encoding` is
            not a valid encoding argument (e.g. ``None``), or `stream` has
            no ``encoding`` attribute.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        requested_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        return None
    return stream_codec == requested_codec
class Input(TransformSpec):

    """
    Abstract base class for input wrappers.

    Subclasses must implement `read()`.  `decode()` converts raw input to a
    Unicode string and records the encoding that worked in
    `successful_encoding`.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return the input data; must be overridden by subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode a string, `data`, heuristically.
        Raise UnicodeError if unsuccessful.

        Candidate encodings, in order of precedence:

        1. `self.encoding`, if given (no other encoding is tried);
        2. an encoding declared in the data itself (BOM or coding slug);
        3. 'utf-8', then the locale encoding (if known), then 'latin-1'.

        `data` that already is a unicode object is returned unchanged.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'input encoding is "unicode" '
                'but input is not a unicode object')
        if isinstance(data, unicode):
            # Accept unicode even if self.encoding != 'unicode'.
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encodings = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encodings = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found. Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encodings = ['utf-8', 'latin-1']
                if locale_encoding:
                    encodings.insert(1, locale_encoding)
        for enc in encodings:
            try:
                decoded = unicode(data, enc, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # Keep the last failure for the final error message; in
                # Python 3 the exception name is local to the except clause.
                error = err
                continue
            self.successful_encoding = enc
            # Return decoded, removing BOMs.
            return decoded.replace(u'\ufeff', u'')
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            '%s.\n(%s)' % (', '.join(repr(enc) for enc in encodings),
                           ErrorString(error)))

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name, or None if neither is found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Subclasses must implement `write()`; `encode()` is shared.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # Fall back to the class-level default when no path is given.
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        template = '%s: destination=%r, destination_path=%r'
        fields = (self.__class__, self.destination, self.destination_path)
        return template % fields

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding` and `self.error_handler`.

        `data` is returned unchanged if the encoding is "unicode" (it must
        then be a unicode object) or if `data` is not a unicode object.
        """
        target = self.encoding
        if target and target.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(target, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """

    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True,
                 mode='r' if sys.version_info >= (3, 4) else 'rU', **kwargs):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default 'rU' provides universal newline support
              for text files on Python < 3.4.

        Raises `InputError` if `source_path` cannot be opened, `TypeError`
        for unknown keyword arguments, and `UnicodeError` if (on Python 3)
        an already open `source` uses a different encoding than `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()
        # deprecation warning
        for key in kwargs:
            if key == 'handle_io_errors':
                sys.stderr.write('deprecation warning: '
                                 'io.FileInput() argument `handle_io_errors` '
                                 'is ignored since Docutils 0.10 (2012-12-16) '
                                 'and will soon be removed.')
            else:
                raise TypeError('__init__() got an unexpected keyword '
                                "argument '%s'" % key)

        if source is None:
            if source_path:
                # Specify encoding in Python 3
                if sys.version_info >= (3,0):
                    open_kwargs = {'encoding': self.encoding,
                                   'errors': self.error_handler}
                else:
                    open_kwargs = {}

                try:
                    self.source = open(source_path, mode, **open_kwargs)
                except IOError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif (sys.version_info >= (3,0) and
              check_encoding(self.source, self.encoding) is False):
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # Use the file object's name as path reference, if it has one.
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        If reading fails with a decoding error, no encoding was specified
        and a source path is known, the file is re-read in binary mode and
        decoded heuristically.  The source is closed afterwards if
        `autoclose` is set.
        """
        try:
            if self.source is sys.stdin and sys.version_info >= (3,0):
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                data = self.source.read()
        except (UnicodeError, LookupError): # (in Py3k read() decodes)
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics;
                # the `with` block closes the file even if read() fails
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        return self.decode(data)

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):

    """
    Output for single, simple file-like objects.
    """

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # The file is opened lazily on the first `write()`.
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (# destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            # Written via write() instead of the Python-2-only
            # ``print >>stream`` statement; the output text is the same.
            self._stderr.write('Warning: Destination mode "%s" '
                               'differs from specified mode "%s"' %
                               (self.destination.mode, mode) + '\n')
        if not destination_path:
            # Use the file object's name as path reference, if it has one.
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open `self.destination_path` with `self.mode`.

        Raises `OutputError` if the file cannot be opened.
        """
        # Specify encoding in Python 3.
        if sys.version_info >= (3,0) and 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except IOError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        With Python 3 or binary output mode, `data` is returned unchanged,
        except when specified encoding and output encoding differ.

        Raises `UnicodeError` if the data cannot be encoded and `ValueError`
        if bytes cannot be written because the destination's encoding
        differs from the specified one.  The destination is closed
        afterwards if `autoclose` is set.
        """
        if not self.opened:
            self.open()
        if (('b' not in self.mode and sys.version_info < (3,0))
            or check_encoding(self.destination, self.encoding) is False):
            data = self.encode(data)
            if sys.version_info >= (3,0) and os.linesep != '\n':
                data = data.replace(b'\n', bytes(os.linesep, 'ascii')) # fix endings

        try:
            self.destination.write(data)
        except TypeError as e:
            # NOTE(review): a TypeError is silently dropped when the
            # condition below is false -- confirm this is intended.
            if sys.version_info >= (3,0) and isinstance(data, bytes):
                # Text streams reject bytes: try the underlying buffer.
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    if check_encoding(self.destination,
                                      self.encoding) is False:
                        raise ValueError('Encoding of %s (%s) differs \n'
                            ' from specified encoding (%s)' %
                            (self.destination_path or 'destination',
                             self.destination.encoding, self.encoding))
                    else:
                        raise e
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                '%s.\n(%s)' % (self.encoding, ErrorString(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout`/`sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    A version of docutils.io.FileOutput which writes to a binary file.

    Only the `open()` mode differs from the base class.
    """
    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # rst2odt (OpenOffice writer)
    mode = 'wb'
class StringInput(Input):

    """
    Direct string input: `self.source` holds the data itself.
    """

    default_source_path = '<string>'

    def read(self):
        """Decode and return the source string."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Direct string output: the encoded data is kept in `self.destination`.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, store it in `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return self.destination
class NullInput(Input):

    """
    Degenerate input: read nothing.
    """

    default_source_path = 'null input'

    def read(self):
        """Return a null string."""
        empty = u''
        return empty
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing ([don't even] send data to the bit bucket)."""
        return None
476 class DocTreeInput(Input
):
479 Adapter for document tree input.
481 The document tree must be passed in the ``source`` parameter.
484 default_source_path
= 'doctree input'
487 """Return the document tree."""