2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
16 from docutils
import TransformSpec
17 from docutils
._compat
import b
18 from docutils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
class InputError(IOError):
    """I/O error raised when an input source cannot be opened for reading."""
class OutputError(IOError):
    """I/O error raised when an output destination cannot be opened for writing."""
25 class Input(TransformSpec
):
28 Abstract base class for input wrappers.
31 component_type
= 'input'
33 default_source_path
= None
def __init__(self, source=None, source_path=None, encoding=None,
             error_handler='strict'):
    """
    Record the input source and the parameters used to decode it.

    `source` is stored as given.  `source_path` is a text reference to
    the source; a false value is replaced by the class attribute
    `default_source_path`.  `encoding` and `error_handler` are stored
    for later use when decoding.
    """
    self.source = source
    """The source of input data."""

    # An empty or missing path falls back to the class-level default.
    self.source_path = source_path or self.default_source_path
    """A text reference to the source."""

    self.encoding = encoding
    """Text encoding for the input source."""

    self.error_handler = error_handler
    """Text decoding error handler."""

    self.successful_encoding = None
    """The encoding that successfully decoded the source data."""
56 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
60 raise NotImplementedError
62 def decode(self
, data
):
64 Decode a string, `data`, heuristically.
65 Raise UnicodeError if unsuccessful.
67 The client application should call ``locale.setlocale`` at the
68 beginning of processing::
70 locale.setlocale(locale.LC_ALL, '')
72 if self
.encoding
and self
.encoding
.lower() == 'unicode':
73 assert isinstance(data
, unicode), (
74 'input encoding is "unicode" '
75 'but input is not a unicode object')
76 if isinstance(data
, unicode):
77 # Accept unicode even if self.encoding != 'unicode'.
80 # We believe the user/application when the encoding is
82 encodings
= [self
.encoding
]
84 data_encoding
= self
.determine_encoding_from_data(data
)
86 # If the data declares its encoding (explicitly or via a BOM),
88 encodings
= [data_encoding
]
90 # Apply heuristics only if no encoding is explicitly given and
91 # no BOM found. Start with UTF-8, because that only matches
92 # data that *IS* UTF-8:
93 encodings
= ['utf-8', 'latin-1']
95 encodings
.insert(1, locale_encoding
)
98 decoded
= unicode(data
, enc
, self
.error_handler
)
99 self
.successful_encoding
= enc
100 # Return decoded, removing BOMs.
101 return decoded
.replace(u
'\ufeff', u
'')
102 except (UnicodeError, LookupError), err
:
103 error
= err
# in Python 3, the <exception instance> is
104 # local to the except clause
106 'Unable to decode input data. Tried the following encodings: '
107 '%s.\n(%s)' % (', '.join([repr(enc
) for enc
in encodings
]),
# Raw string literal: `\s` and `\w` are regular-expression escapes, not
# string escapes.  The compiled pattern is unchanged, but a raw literal
# avoids the "invalid escape sequence" warning of newer Python versions.
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern."""
byte_order_marks = (
    (codecs.BOM_UTF8, 'utf-8'),  # 'utf-8-sig' new in v2.5
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
)
"""Sequence of (start_bytes, encoding) tuples for encoding detection.
The first bytes of input data are checked against the start_bytes strings.
A match indicates the given encoding."""
def determine_encoding_from_data(self, data):
    """
    Try to determine the encoding of `data` by looking *in* `data`.

    A byte order mark (BOM) at the start of `data` takes precedence;
    failing that, the first two lines are searched for an encoding
    declaration.  Return the encoding name, or None if neither is found.
    """
    # A byte order mark at the very start of the data is decisive.
    for bom, bom_encoding in self.byte_order_marks:
        if data.startswith(bom):
            return bom_encoding
    # Otherwise look for a declaration in the first 2 lines of the data.
    find_declaration = self.coding_slug.search
    for head_line in data.splitlines()[:2]:
        declaration = find_declaration(head_line)
        if declaration:
            return declaration.group(1).decode('ascii')
    return None
137 class Output(TransformSpec
):
140 Abstract base class for output wrappers.
143 component_type
= 'output'
145 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """
    Record the output destination and the parameters used to encode.

    `destination` is stored as given.  `destination_path` is a text
    reference to the destination; a false value is replaced by the
    class attribute `default_destination_path`.  A false
    `error_handler` is replaced by 'strict'.
    """
    self.destination = destination
    """The destination for output data."""

    # An empty or missing path falls back to the class-level default.
    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A text reference to the destination."""

    self.encoding = encoding
    """Text encoding for the output destination."""

    self.error_handler = error_handler or 'strict'
    """Text encoding error handler."""
165 return ('%s: destination=%r, destination_path=%r'
166 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """
    `data` is a Unicode string, to be encoded by `self.encode`.

    Not implemented here: this version always raises
    NotImplementedError.
    """
    raise NotImplementedError
172 def encode(self
, data
):
173 if self
.encoding
and self
.encoding
.lower() == 'unicode':
174 assert isinstance(data
, unicode), (
175 'the encoding given is "unicode" but the output is not '
178 if not isinstance(data
, unicode):
179 # Non-unicode (e.g. binary) output.
182 return data
.encode(self
.encoding
, self
.error_handler
)
185 class FileInput(Input
):
188 Input for single, simple file-like objects.
190 def __init__(self
, source
=None, source_path
=None,
191 encoding
=None, error_handler
='strict',
192 autoclose
=True, handle_io_errors
=True, mode
='rU'):
195 - `source`: either a file-like object (which is read directly), or
196 `None` (which implies `sys.stdin` if no `source_path` given).
197 - `source_path`: a path to a file, which is opened and then read.
198 - `encoding`: the expected text encoding of the input file.
199 - `error_handler`: the encoding error handler to use.
200 - `autoclose`: close automatically after read (except when
201 `sys.stdin` is the source).
202 - `handle_io_errors`: summarize I/O errors here, and exit?
203 - `mode`: how the file is to be opened (see standard function
204 `open`). The default 'rU' provides universal newline support
207 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
208 self
.autoclose
= autoclose
209 self
.handle_io_errors
= handle_io_errors
210 self
._stderr
= ErrorOutput()
214 # Specify encoding in Python 3
215 if sys
.version_info
>= (3,0):
216 kwargs
= {'encoding': self
.encoding
,
217 'errors': self
.error_handler
}
222 self
.source
= open(source_path
, mode
, **kwargs
)
223 except IOError, error
:
225 print >>self
._stderr
, ErrorString(error
)
226 print >>self
._stderr
, (
227 u
'Unable to open source file for reading ("%s").'
228 u
'Exiting.' % source_path
)
230 raise InputError(error
.errno
, error
.strerror
, source_path
)
232 self
.source
= sys
.stdin
233 elif (sys
.version_info
>= (3,0) and
234 self
.encoding
and hasattr(self
.source
, 'encoding') and
235 self
.encoding
!= self
.source
.encoding
and
236 codecs
.lookup(self
.encoding
) !=
237 codecs
.lookup(self
.source
.encoding
)):
238 # TODO: re-open, warn or raise error?
239 raise UnicodeError('Encoding clash: encoding given is "%s" '
240 'but source is opened with encoding "%s".' %
241 (self
.encoding
, self
.source
.encoding
))
244 self
.source_path
= self
.source
.name
245 except AttributeError:
250 Read and decode a single file and return the data (Unicode string).
252 try: # In Python < 2.5, try...except has to be nested in try...finally.
254 if self
.source
is sys
.stdin
and sys
.version_info
>= (3,0):
255 # read as binary data to circumvent auto-decoding
256 data
= self
.source
.buffer.read()
258 data
= b('\n').join(data
.splitlines()) + b('\n')
260 data
= self
.source
.read()
261 except (UnicodeError, LookupError), err
: # (in Py3k read() decodes)
262 if not self
.encoding
and self
.source_path
:
263 # re-read in binary mode and decode with heuristics
264 b_source
= open(self
.source_path
, 'rb')
265 data
= b_source
.read()
268 data
= b('\n').join(data
.splitlines()) + b('\n')
274 return self
.decode(data
)
278 Return lines of a single file as list of Unicode strings.
280 return self
.read().splitlines(True)
283 if self
.source
is not sys
.stdin
:
287 class FileOutput(Output
):
290 Output for single, simple file-like objects.
294 """The mode argument for `open()`."""
295 # 'wb' for binary (e.g. OpenOffice) files.
296 # (Do not use binary mode ('wb') for text files, as this prevents the
297 # conversion of newlines to the system specific default.)
299 def __init__(self
, destination
=None, destination_path
=None,
300 encoding
=None, error_handler
='strict', autoclose
=True,
301 handle_io_errors
=True, mode
=None):
304 - `destination`: either a file-like object (which is written
305 directly) or `None` (which implies `sys.stdout` if no
306 `destination_path` given).
307 - `destination_path`: a path to a file, which is opened and then
309 - `encoding`: the text encoding of the output file.
310 - `error_handler`: the encoding error handler to use.
311 - `autoclose`: close automatically after write (except when
312 `sys.stdout` or `sys.stderr` is the destination).
313 - `handle_io_errors`: summarize I/O errors here, and exit?
314 - `mode`: how the file is to be opened (see standard function
315 `open`). The default is 'w', providing universal newline
316 support for text files.
318 Output
.__init
__(self
, destination
, destination_path
,
319 encoding
, error_handler
)
321 self
.autoclose
= autoclose
322 self
.handle_io_errors
= handle_io_errors
325 self
._stderr
= ErrorOutput()
326 if destination
is None:
330 self
.destination
= sys
.stdout
331 if not destination_path
:
333 self
.destination_path
= self
.destination
.name
334 except AttributeError:
338 # Specify encoding in Python 3.
339 if sys
.version_info
>= (3,0):
340 kwargs
= {'encoding': self
.encoding
,
341 'errors': self
.error_handler
}
345 self
.destination
= open(self
.destination_path
, self
.mode
, **kwargs
)
346 except IOError, error
:
347 if self
.handle_io_errors
:
348 print >>self
._stderr
, ErrorString(error
)
349 print >>self
._stderr
, (u
'Unable to open destination file'
350 u
" for writing ('%s'). Exiting." % self
.destination_path
)
352 raise OutputError(error
.errno
, error
.strerror
,
353 self
.destination_path
)
356 def write(self
, data
):
357 """Encode `data`, write it to a single file, and return it.
359 In Python 3, `data` is returned unchanged.
361 if sys
.version_info
< (3,0):
362 data
= self
.encode(data
)
365 try: # In Python < 2.5, try...except has to be nested in try...finally.
367 if (sys
.version_info
>= (3,0) and self
.encoding
and
368 hasattr(self
.destination
,'encoding') and
369 self
.encoding
!= self
.destination
.encoding
and
370 codecs
.lookup(self
.encoding
) !=
371 codecs
.lookup(self
.destination
.encoding
)):
372 # encode self, write bytes
373 bdata
= self
.encode(data
)
374 if os
.linesep
!= '\n':
375 bdata
= bdata
.replace('\n', os
.linesep
)
376 sys
.stdout
.buffer.write(bdata
)
378 self
.destination
.write(data
)
379 except (UnicodeError, LookupError), err
: # can only happen in py3k
381 'Unable to encode output data. output-encoding is: '
382 '%s.\n(%s)' % (self
.encoding
, ErrorString(err
)))
389 if self
.destination
not in (sys
.stdout
, sys
.stderr
):
390 self
.destination
.close()
394 class BinaryFileOutput(FileOutput
):
396 A version of docutils.io.FileOutput which writes to a binary file.
398 # Used by core.publish_cmdline_to_binary() which in turn is used by
399 # rst2odt (OpenOffice writer)
403 class StringInput(Input
):
409 default_source_path
= '<string>'
412 """Decode and return the source string."""
413 return self
.decode(self
.source
)
class StringOutput(Output):

    """
    Direct string output.

    The encoded result of the most recent `write` is kept in
    `self.destination`.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, store it in `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
430 class NullInput(Input
):
433 Degenerate input: read nothing.
436 default_source_path
= 'null input'
439 """Return a null string."""
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing ([don't even] send data to the bit bucket)."""
        # `data` is deliberately discarded.
        return None
456 class DocTreeInput(Input
):
459 Adapter for document tree input.
461 The document tree must be passed in the ``source`` parameter.
464 default_source_path
= 'doctree input'
467 """Return the document tree."""