2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
16 from docutils
import TransformSpec
17 from docutils
._compat
import b
18 from docutils
.utils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
class InputError(IOError):
    """Raised by input wrappers when an input source cannot be opened."""
class OutputError(IOError):
    """Raised by output wrappers when a destination cannot be opened."""
def check_encoding(stream, encoding):
    """Test whether the encoding of `stream` matches `encoding`.

    Both `stream.encoding` and `encoding` are resolved with
    `codecs.lookup` and the resulting codec entries are compared.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if `stream.encoding` resolves to the same value as `encoding`.
    :False: if the encodings differ.
    """
    try:
        return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # LookupError: unknown codec name; AttributeError: `stream` has no
        # `encoding` attribute; TypeError: an encoding value that is not a
        # string (e.g. None).  In all cases no comparison is possible.
        return None
40 class Input(TransformSpec
):
43 Abstract base class for input wrappers.
46 component_type
= 'input'
48 default_source_path
= None
50 def __init__(self
, source
=None, source_path
=None, encoding
=None,
51 error_handler
='strict'):
52 self
.encoding
= encoding
53 """Text encoding for the input source."""
55 self
.error_handler
= error_handler
56 """Text decoding error handler."""
59 """The source of input data."""
61 self
.source_path
= source_path
62 """A text reference to the source."""
65 self
.source_path
= self
.default_source_path
67 self
.successful_encoding
= None
68 """The encoding that successfully decoded the source data."""
71 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
75 raise NotImplementedError
77 def decode(self
, data
):
79 Decode a string, `data`, heuristically.
80 Raise UnicodeError if unsuccessful.
82 The client application should call ``locale.setlocale`` at the
83 beginning of processing::
85 locale.setlocale(locale.LC_ALL, '')
87 if self
.encoding
and self
.encoding
.lower() == 'unicode':
88 assert isinstance(data
, unicode), (
89 'input encoding is "unicode" '
90 'but input is not a unicode object')
91 if isinstance(data
, unicode):
92 # Accept unicode even if self.encoding != 'unicode'.
95 # We believe the user/application when the encoding is
97 encodings
= [self
.encoding
]
99 data_encoding
= self
.determine_encoding_from_data(data
)
101 # If the data declares its encoding (explicitly or via a BOM),
103 encodings
= [data_encoding
]
105 # Apply heuristics only if no encoding is explicitly given and
106 # no BOM found. Start with UTF-8, because that only matches
107 # data that *IS* UTF-8:
108 encodings
= ['utf-8', 'latin-1']
110 encodings
.insert(1, locale_encoding
)
111 for enc
in encodings
:
113 decoded
= unicode(data
, enc
, self
.error_handler
)
114 self
.successful_encoding
= enc
115 # Return decoded, removing BOMs.
116 return decoded
.replace(u
'\ufeff', u
'')
117 except (UnicodeError, LookupError), err
:
118 error
= err
# in Python 3, the <exception instance> is
119 # local to the except clause
121 'Unable to decode input data. Tried the following encodings: '
122 '%s.\n(%s)' % (', '.join([repr(enc
) for enc
in encodings
]),
# Raw string: `\s` and `\w` are regular-expression escapes, not string
# escapes, so the pattern must not be processed as an ordinary literal.
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern."""

byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
                    (codecs.BOM_UTF16_BE, 'utf-16-be'),
                    (codecs.BOM_UTF16_LE, 'utf-16-le'),)
"""Sequence of (start_bytes, encoding) tuples for encoding detection.
The first bytes of input data are checked against the start_bytes strings.
A match indicates the given encoding."""
135 def determine_encoding_from_data(self
, data
):
137 Try to determine the encoding of `data` by looking *in* `data`.
138 Check for a byte order mark (BOM) or an encoding declaration.
140 # check for a byte order mark:
141 for start_bytes
, encoding
in self
.byte_order_marks
:
142 if data
.startswith(start_bytes
):
144 # check for an encoding declaration pattern in first 2 lines of file:
145 for line
in data
.splitlines()[:2]:
146 match
= self
.coding_slug
.search(line
)
148 return match
.group(1).decode('ascii')
152 class Output(TransformSpec
):
155 Abstract base class for output wrappers.
158 component_type
= 'output'
160 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """Record the destination and the encoding settings.

    An empty `error_handler` falls back to 'strict'; an empty
    `destination_path` falls back to `self.default_destination_path`.
    """
    self.encoding = encoding
    """Text encoding for the output destination."""

    self.error_handler = error_handler or 'strict'
    """Text encoding error handler."""

    self.destination = destination
    """The destination for output data."""

    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A text reference to the destination."""
180 return ('%s: destination=%r, destination_path=%r'
181 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """Write `data`, a Unicode string to be encoded by `self.encode`.

    This base implementation only raises `NotImplementedError`.
    """
    raise NotImplementedError
187 def encode(self
, data
):
188 if self
.encoding
and self
.encoding
.lower() == 'unicode':
189 assert isinstance(data
, unicode), (
190 'the encoding given is "unicode" but the output is not '
193 if not isinstance(data
, unicode):
194 # Non-unicode (e.g. bytes) output.
197 return data
.encode(self
.encoding
, self
.error_handler
)
200 class FileInput(Input
):
203 Input for single, simple file-like objects.
205 def __init__(self
, source
=None, source_path
=None,
206 encoding
=None, error_handler
='strict',
207 autoclose
=True, handle_io_errors
=None, mode
='rU'):
210 - `source`: either a file-like object (which is read directly), or
211 `None` (which implies `sys.stdin` if no `source_path` given).
212 - `source_path`: a path to a file, which is opened and then read.
213 - `encoding`: the expected text encoding of the input file.
214 - `error_handler`: the encoding error handler to use.
215 - `autoclose`: close automatically after read (except when
216 `sys.stdin` is the source).
217 - `handle_io_errors`: ignored, deprecated, will be removed.
218 - `mode`: how the file is to be opened (see standard function
219 `open`). The default 'rU' provides universal newline support
222 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
223 self
.autoclose
= autoclose
224 self
._stderr
= ErrorOutput()
228 # Specify encoding in Python 3
229 if sys
.version_info
>= (3,0):
230 kwargs
= {'encoding': self
.encoding
,
231 'errors': self
.error_handler
}
236 self
.source
= open(source_path
, mode
, **kwargs
)
237 except IOError, error
:
238 raise InputError(error
.errno
, error
.strerror
, source_path
)
240 self
.source
= sys
.stdin
241 elif (sys
.version_info
>= (3,0) and
242 check_encoding(self
.source
, self
.encoding
) is False):
243 # TODO: re-open, warn or raise error?
244 raise UnicodeError('Encoding clash: encoding given is "%s" '
245 'but source is opened with encoding "%s".' %
246 (self
.encoding
, self
.source
.encoding
))
249 self
.source_path
= self
.source
.name
250 except AttributeError:
255 Read and decode a single file and return the data (Unicode string).
257 try: # In Python < 2.5, try...except has to be nested in try...finally.
259 if self
.source
is sys
.stdin
and sys
.version_info
>= (3,0):
260 # read as binary data to circumvent auto-decoding
261 data
= self
.source
.buffer.read()
263 data
= b('\n').join(data
.splitlines()) + b('\n')
265 data
= self
.source
.read()
266 except (UnicodeError, LookupError), err
: # (in Py3k read() decodes)
267 if not self
.encoding
and self
.source_path
:
268 # re-read in binary mode and decode with heuristics
269 b_source
= open(self
.source_path
, 'rb')
270 data
= b_source
.read()
273 data
= b('\n').join(data
.splitlines()) + b('\n')
279 return self
.decode(data
)
283 Return lines of a single file as list of Unicode strings.
285 return self
.read().splitlines(True)
288 if self
.source
is not sys
.stdin
:
292 class FileOutput(Output
):
295 Output for single, simple file-like objects.
299 """The mode argument for `open()`."""
300 # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
301 # (Do not use binary mode ('wb') for text files, as this prevents the
302 # conversion of newlines to the system specific default.)
304 def __init__(self
, destination
=None, destination_path
=None,
305 encoding
=None, error_handler
='strict', autoclose
=True,
306 handle_io_errors
=None, mode
=None):
309 - `destination`: either a file-like object (which is written
310 directly) or `None` (which implies `sys.stdout` if no
311 `destination_path` given).
312 - `destination_path`: a path to a file, which is opened and then
314 - `encoding`: the text encoding of the output file.
315 - `error_handler`: the encoding error handler to use.
316 - `autoclose`: close automatically after write (except when
317 `sys.stdout` or `sys.stderr` is the destination).
318 - `handle_io_errors`: ignored, deprecated, will be removed.
319 - `mode`: how the file is to be opened (see standard function
320 `open`). The default is 'w', providing universal newline
321 support for text files.
323 Output
.__init
__(self
, destination
, destination_path
,
324 encoding
, error_handler
)
326 self
.autoclose
= autoclose
329 self
._stderr
= ErrorOutput()
330 if destination
is None:
334 self
.destination
= sys
.stdout
335 elif (# destination is file-type object -> check mode:
336 mode
and hasattr(self
.destination
, 'mode')
337 and mode
!= self
.destination
.mode
):
338 print >>self
._stderr
, ('Warning: Destination mode "%s" '
339 'differs from specified mode "%s"' %
340 (self
.destination
.mode
, mode
))
341 if not destination_path
:
343 self
.destination_path
= self
.destination
.name
344 except AttributeError:
348 # Specify encoding in Python 3.
349 if sys
.version_info
>= (3,0) and 'b' not in self
.mode
:
350 kwargs
= {'encoding': self
.encoding
,
351 'errors': self
.error_handler
}
355 self
.destination
= open(self
.destination_path
, self
.mode
, **kwargs
)
356 except IOError, error
:
357 raise OutputError(error
.errno
, error
.strerror
,
358 self
.destination_path
)
361 def write(self
, data
):
362 """Encode `data`, write it to a single file, and return it.
364 With Python 3 or binary output mode, `data` is returned unchanged,
365 except when specified encoding and output encoding differ.
369 if ('b' not in self
.mode
and sys
.version_info
< (3,0)
370 or check_encoding(self
.destination
, self
.encoding
) is False
372 data
= self
.encode(data
)
373 if sys
.version_info
>= (3,0) and os
.linesep
!= '\n':
374 data
= data
.replace(b('\n'), b(os
.linesep
)) # fix endings
376 try: # In Python < 2.5, try...except has to be nested in try...finally.
378 self
.destination
.write(data
)
380 if sys
.version_info
>= (3,0) and isinstance(data
, bytes
):
382 self
.destination
.buffer.write(data
)
383 except AttributeError:
384 if check_encoding(self
.destination
,
385 self
.encoding
) is False:
386 raise ValueError('Encoding of %s (%s) differs \n'
387 ' from specified encoding (%s)' %
388 (self
.destination_path
or 'destination',
389 self
.destination
.encoding
, self
.encoding
))
392 except (UnicodeError, LookupError), err
:
394 'Unable to encode output data. output-encoding is: '
395 '%s.\n(%s)' % (self
.encoding
, ErrorString(err
)))
402 if self
.destination
not in (sys
.stdout
, sys
.stderr
):
403 self
.destination
.close()
407 class BinaryFileOutput(FileOutput
):
409 A version of docutils.io.FileOutput which writes to a binary file.
411 # Used by core.publish_cmdline_to_binary() which in turn is used by
412 # rst2odt (OpenOffice writer)
416 class StringInput(Input
):
422 default_source_path
= '<string>'
425 """Decode and return the source string."""
426 return self
.decode(self
.source
)
class StringOutput(Output):

    """
    Direct string output: the encoded result is kept on the instance.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
443 class NullInput(Input
):
446 Degenerate input: read nothing.
449 default_source_path
= 'null input'
452 """Return a null string."""
class NullOutput(Output):

    """
    Degenerate output: nothing is written.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data` without writing it anywhere; return None."""
        return None
469 class DocTreeInput(Input
):
472 Adapter for document tree input.
474 The document tree must be passed in the ``source`` parameter.
477 default_source_path
= 'doctree input'
480 """Return the document tree."""