2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
16 from docutils
import TransformSpec
17 from docutils
._compat
import b
18 from docutils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
class InputError(IOError):
    """I/O error raised when an input source cannot be opened for reading."""
class OutputError(IOError):
    """I/O error raised when an output destination cannot be opened for writing."""
def check_encoding(stream, encoding):
    """Report whether `stream.encoding` and `encoding` name the same codec.

    Both names are resolved with ``codecs.lookup`` and the results compared.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if both names resolve to the same codec,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # Unknown codec name, no `encoding` attribute on the stream, or a
        # non-string argument such as None: the question cannot be answered.
        return None
    return stream_codec == wanted_codec
40 class Input(TransformSpec
):
43 Abstract base class for input wrappers.
46 component_type
= 'input'
48 default_source_path
= None
50 def __init__(self
, source
=None, source_path
=None, encoding
=None,
51 error_handler
='strict'):
52 self
.encoding
= encoding
53 """Text encoding for the input source."""
55 self
.error_handler
= error_handler
56 """Text decoding error handler."""
59 """The source of input data."""
61 self
.source_path
= source_path
62 """A text reference to the source."""
65 self
.source_path
= self
.default_source_path
67 self
.successful_encoding
= None
68 """The encoding that successfully decoded the source data."""
71 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
75 raise NotImplementedError
77 def decode(self
, data
):
79 Decode a string, `data`, heuristically.
80 Raise UnicodeError if unsuccessful.
82 The client application should call ``locale.setlocale`` at the
83 beginning of processing::
85 locale.setlocale(locale.LC_ALL, '')
87 if self
.encoding
and self
.encoding
.lower() == 'unicode':
88 assert isinstance(data
, unicode), (
89 'input encoding is "unicode" '
90 'but input is not a unicode object')
91 if isinstance(data
, unicode):
92 # Accept unicode even if self.encoding != 'unicode'.
95 # We believe the user/application when the encoding is
97 encodings
= [self
.encoding
]
99 data_encoding
= self
.determine_encoding_from_data(data
)
101 # If the data declares its encoding (explicitly or via a BOM),
103 encodings
= [data_encoding
]
105 # Apply heuristics only if no encoding is explicitly given and
106 # no BOM found. Start with UTF-8, because that only matches
107 # data that *IS* UTF-8:
108 encodings
= ['utf-8', 'latin-1']
110 encodings
.insert(1, locale_encoding
)
111 for enc
in encodings
:
113 decoded
= unicode(data
, enc
, self
.error_handler
)
114 self
.successful_encoding
= enc
115 # Return decoded, removing BOMs.
116 return decoded
.replace(u
'\ufeff', u
'')
117 except (UnicodeError, LookupError), err
:
118 error
= err
# in Python 3, the <exception instance> is
119 # local to the except clause
121 'Unable to decode input data. Tried the following encodings: '
122 '%s.\n(%s)' % (', '.join([repr(enc
) for enc
in encodings
]),
# Raw string literal: ``\s`` and ``\w`` are regular-expression escapes, not
# string escapes.  In a non-raw literal they are invalid escape sequences,
# which newer Python versions warn about.  The pattern value is unchanged.
# Group 1 captures the declared encoding name.  The pattern is built from
# ``b(...)`` because it is searched against undecoded input lines (the
# matched group is decoded from ASCII by the caller).
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern."""
# Sequence of (start_bytes, encoding) pairs used for encoding detection:
# the first bytes of the input data are compared against each `start_bytes`
# value, and a match indicates the paired encoding.
byte_order_marks = (
    (codecs.BOM_UTF8, 'utf-8'),  # 'utf-8-sig' new in v2.5
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
)
def determine_encoding_from_data(self, data):
    """
    Look *inside* `data` for a hint about its encoding.

    A byte order mark (BOM) from `self.byte_order_marks` at the start of
    `data` is checked first; otherwise the first two lines are searched
    for an encoding declaration matching `self.coding_slug`.

    Return the encoding name, or None if neither hint is present.
    """
    # A byte order mark at the very start of the data:
    for bom, bom_encoding in self.byte_order_marks:
        if data.startswith(bom):
            return bom_encoding
    # An encoding declaration pattern in the first 2 lines of the file:
    for candidate in data.splitlines()[:2]:
        found = self.coding_slug.search(candidate)
        if found is not None:
            # The declared name is matched on undecoded data; hand back text.
            return found.group(1).decode('ascii')
    return None
152 class Output(TransformSpec
):
155 Abstract base class for output wrappers.
158 component_type
= 'output'
160 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """
    Record where the output goes and how text is to be encoded.

    :Parameters:
        - `destination`: the destination for output data.
        - `destination_path`: a text reference to the destination; when
          it is empty, `self.default_destination_path` is used instead.
        - `encoding`: the text encoding for the output destination.
        - `error_handler`: the text encoding error handler; a false
          value is replaced by 'strict'.
    """
    # Text encoding for the output destination.
    self.encoding = encoding

    # Text encoding error handler.
    self.error_handler = error_handler or 'strict'

    # The destination for output data.
    self.destination = destination

    # A text reference to the destination.
    self.destination_path = (destination_path
                             or self.default_destination_path)
180 return ('%s: destination=%r, destination_path=%r'
181 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """
    Write `data`; this base implementation always raises.

    `data` is a Unicode string, to be encoded by `self.encode`.
    Raise NotImplementedError.
    """
    raise NotImplementedError
187 def encode(self
, data
):
188 if self
.encoding
and self
.encoding
.lower() == 'unicode':
189 assert isinstance(data
, unicode), (
190 'the encoding given is "unicode" but the output is not '
193 if not isinstance(data
, unicode):
194 # Non-unicode (e.g. binary) output.
197 return data
.encode(self
.encoding
, self
.error_handler
)
200 class FileInput(Input
):
203 Input for single, simple file-like objects.
205 def __init__(self
, source
=None, source_path
=None,
206 encoding
=None, error_handler
='strict',
207 autoclose
=True, handle_io_errors
=True, mode
='rU'):
210 - `source`: either a file-like object (which is read directly), or
211 `None` (which implies `sys.stdin` if no `source_path` given).
212 - `source_path`: a path to a file, which is opened and then read.
213 - `encoding`: the expected text encoding of the input file.
214 - `error_handler`: the encoding error handler to use.
215 - `autoclose`: close automatically after read (except when
216 `sys.stdin` is the source).
217 - `handle_io_errors`: summarize I/O errors here, and exit?
218 - `mode`: how the file is to be opened (see standard function
219 `open`). The default 'rU' provides universal newline support
222 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
223 self
.autoclose
= autoclose
224 self
.handle_io_errors
= handle_io_errors
225 self
._stderr
= ErrorOutput()
229 # Specify encoding in Python 3
230 if sys
.version_info
>= (3,0):
231 kwargs
= {'encoding': self
.encoding
,
232 'errors': self
.error_handler
}
237 self
.source
= open(source_path
, mode
, **kwargs
)
238 except IOError, error
:
240 print >>self
._stderr
, ErrorString(error
)
241 print >>self
._stderr
, (
242 u
'Unable to open source file for reading ("%s").'
243 u
'Exiting.' % source_path
)
245 raise InputError(error
.errno
, error
.strerror
, source_path
)
247 self
.source
= sys
.stdin
248 elif (sys
.version_info
>= (3,0) and
249 check_encoding(self
.source
, self
.encoding
) is False):
250 # TODO: re-open, warn or raise error?
251 raise UnicodeError('Encoding clash: encoding given is "%s" '
252 'but source is opened with encoding "%s".' %
253 (self
.encoding
, self
.source
.encoding
))
256 self
.source_path
= self
.source
.name
257 except AttributeError:
262 Read and decode a single file and return the data (Unicode string).
264 try: # In Python < 2.5, try...except has to be nested in try...finally.
266 if self
.source
is sys
.stdin
and sys
.version_info
>= (3,0):
267 # read as binary data to circumvent auto-decoding
268 data
= self
.source
.buffer.read()
270 data
= b('\n').join(data
.splitlines()) + b('\n')
272 data
= self
.source
.read()
273 except (UnicodeError, LookupError), err
: # (in Py3k read() decodes)
274 if not self
.encoding
and self
.source_path
:
275 # re-read in binary mode and decode with heuristics
276 b_source
= open(self
.source_path
, 'rb')
277 data
= b_source
.read()
280 data
= b('\n').join(data
.splitlines()) + b('\n')
286 return self
.decode(data
)
290 Return lines of a single file as list of Unicode strings.
292 return self
.read().splitlines(True)
295 if self
.source
is not sys
.stdin
:
299 class FileOutput(Output
):
302 Output for single, simple file-like objects.
306 """The mode argument for `open()`."""
307 # 'wb' for binary (e.g. OpenOffice) files.
308 # (Do not use binary mode ('wb') for text files, as this prevents the
309 # conversion of newlines to the system specific default.)
311 def __init__(self
, destination
=None, destination_path
=None,
312 encoding
=None, error_handler
='strict', autoclose
=True,
313 handle_io_errors
=True, mode
=None):
316 - `destination`: either a file-like object (which is written
317 directly) or `None` (which implies `sys.stdout` if no
318 `destination_path` given).
319 - `destination_path`: a path to a file, which is opened and then
321 - `encoding`: the text encoding of the output file.
322 - `error_handler`: the encoding error handler to use.
323 - `autoclose`: close automatically after write (except when
324 `sys.stdout` or `sys.stderr` is the destination).
325 - `handle_io_errors`: summarize I/O errors here, and exit?
326 - `mode`: how the file is to be opened (see standard function
327 `open`). The default is 'w', providing universal newline
328 support for text files.
330 Output
.__init
__(self
, destination
, destination_path
,
331 encoding
, error_handler
)
333 self
.autoclose
= autoclose
334 self
.handle_io_errors
= handle_io_errors
337 self
._stderr
= ErrorOutput()
338 if destination
is None:
342 self
.destination
= sys
.stdout
343 elif (# destination is file-type object -> check mode:
344 mode
and hasattr(self
.destination
, 'mode')
345 and mode
!= self
.destination
.mode
):
346 print >>self
._stderr
, ('Destination mode "%s" '
347 'differs from specified mode "%s"' %
348 (self
.destination
.mode
, mode
))
349 if not destination_path
:
351 self
.destination_path
= self
.destination
.name
352 except AttributeError:
354 # Special cases under Python 3: different encoding or binary output
355 if sys
.version_info
>= (3,0):
357 and self
.destination
in (sys
.stdout
, sys
.stderr
)
359 self
.destination
= self
.destination
.buffer
360 if check_encoding(self
.destination
, self
.encoding
) is False:
361 if self
.destination
in (sys
.stdout
, sys
.stderr
):
362 self
.destination
= self
.destination
.buffer
363 else: # TODO: try the `write to .buffer` scheme instead?
364 raise ValueError('Encoding of %s (%s) differs \n'
365 ' from specified encoding (%s)' %
366 (self
.destination_path
or 'destination',
367 destination
.encoding
, encoding
))
371 # Specify encoding in Python 3.
372 if sys
.version_info
>= (3,0):
373 kwargs
= {'encoding': self
.encoding
,
374 'errors': self
.error_handler
}
378 self
.destination
= open(self
.destination_path
, self
.mode
, **kwargs
)
379 except IOError, error
:
380 if self
.handle_io_errors
:
381 print >>self
._stderr
, ErrorString(error
)
382 print >>self
._stderr
, (u
'Unable to open destination file'
383 u
" for writing ('%s'). Exiting." % self
.destination_path
)
385 raise OutputError(error
.errno
, error
.strerror
,
386 self
.destination_path
)
389 def write(self
, data
):
390 """Encode `data`, write it to a single file, and return it.
392 With Python 3 or binary output mode, `data` is returned unchanged,
393 except when specified encoding and output encoding differ.
397 try: # In Python < 2.5, try...except has to be nested in try...finally.
399 if 'b' not in self
.mode
and (sys
.version_info
< (3,0) or
400 check_encoding(self
.destination
, self
.encoding
) is False):
401 data
= self
.encode(data
)
402 if sys
.version_info
>= (3,0) and os
.linesep
!= '\n':
403 # writing as binary data -> fix endings
404 data
= data
.replace('\n', os
.linesep
)
406 self
.destination
.write(data
)
408 except (UnicodeError, LookupError), err
:
410 'Unable to encode output data. output-encoding is: '
411 '%s.\n(%s)' % (self
.encoding
, ErrorString(err
)))
418 if self
.destination
not in (sys
.stdout
, sys
.stderr
):
419 self
.destination
.close()
423 class BinaryFileOutput(FileOutput
):
425 A version of docutils.io.FileOutput which writes to a binary file.
427 # Used by core.publish_cmdline_to_binary() which in turn is used by
428 # rst2odt (OpenOffice writer)
432 class StringInput(Input
):
438 default_source_path
= '<string>'
441 """Decode and return the source string."""
442 return self
.decode(self
.source
)
class StringOutput(Output):

    """
    Direct string output.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
459 class NullInput(Input
):
462 Degenerate input: read nothing.
465 default_source_path
= 'null input'
468 """Return a null string."""
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data` without writing it anywhere; return None."""
        return None
485 class DocTreeInput(Input
):
488 Adapter for document tree input.
490 The document tree must be passed in the ``source`` parameter.
493 default_source_path
= 'doctree input'
496 """Return the document tree."""