2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
16 from docutils
import TransformSpec
17 from docutils
._compat
import b
18 from docutils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
20 class Input(TransformSpec
):
23 Abstract base class for input wrappers.
26 component_type
= 'input'
28 default_source_path
= None
30 def __init__(self
, source
=None, source_path
=None, encoding
=None,
31 error_handler
='strict'):
32 self
.encoding
= encoding
33 """Text encoding for the input source."""
35 self
.error_handler
= error_handler
36 """Text decoding error handler."""
39 """The source of input data."""
41 self
.source_path
= source_path
42 """A text reference to the source."""
45 self
.source_path
= self
.default_source_path
47 self
.successful_encoding
= None
48 """The encoding that successfully decoded the source data."""
51 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
55 raise NotImplementedError
57 def decode(self
, data
):
59 Decode a string, `data`, heuristically.
60 Raise UnicodeError if unsuccessful.
62 The client application should call ``locale.setlocale`` at the
63 beginning of processing::
65 locale.setlocale(locale.LC_ALL, '')
67 if self
.encoding
and self
.encoding
.lower() == 'unicode':
68 assert isinstance(data
, unicode), (
69 'input encoding is "unicode" '
70 'but input is not a unicode object')
71 if isinstance(data
, unicode):
72 # Accept unicode even if self.encoding != 'unicode'.
75 # We believe the user/application when the encoding is
77 encodings
= [self
.encoding
]
79 data_encoding
= self
.determine_encoding_from_data(data
)
81 # If the data declares its encoding (explicitly or via a BOM),
83 encodings
= [data_encoding
]
85 # Apply heuristics only if no encoding is explicitly given and
86 # no BOM found. Start with UTF-8, because that only matches
87 # data that *IS* UTF-8:
88 encodings
= ['utf-8', 'latin-1']
90 encodings
.insert(1, locale_encoding
)
93 decoded
= unicode(data
, enc
, self
.error_handler
)
94 self
.successful_encoding
= enc
95 # Return decoded, removing BOMs.
96 return decoded
.replace(u
'\ufeff', u
'')
97 except (UnicodeError, LookupError), err
:
98 error
= err
# in Python 3, the <exception instance> is
99 # local to the except clause
101 'Unable to decode input data. Tried the following encodings: '
102 '%s.\n(%s)' % (', '.join([repr(enc
) for enc
in encodings
]),
# Raw string literal: `\s` and `\w` are regex escapes, not string escapes.
# A non-raw literal relies on Python passing unknown escapes through, which
# newer interpreters flag as an invalid escape sequence.  The compiled
# pattern is unchanged.
# NOTE(review): `b` comes from docutils._compat; presumably it converts the
# literal to a byte string so the pattern can search undecoded input --
# confirm against that module.
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern."""
# Entries are tried in order; each pairs a BOM byte sequence with the
# encoding name that the BOM implies.
byte_order_marks = (
    (codecs.BOM_UTF8, 'utf-8'),  # 'utf-8-sig' new in v2.5
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
)
"""Sequence of (start_bytes, encoding) tuples for encoding detection.
The first bytes of input data are checked against the start_bytes strings.
A match indicates the given encoding."""
115 def determine_encoding_from_data(self
, data
):
117 Try to determine the encoding of `data` by looking *in* `data`.
118 Check for a byte order mark (BOM) or an encoding declaration.
120 # check for a byte order mark:
121 for start_bytes
, encoding
in self
.byte_order_marks
:
122 if data
.startswith(start_bytes
):
124 # check for an encoding declaration pattern in first 2 lines of file:
125 for line
in data
.splitlines()[:2]:
126 match
= self
.coding_slug
.search(line
)
128 return match
.group(1).decode('ascii')
132 class Output(TransformSpec
):
135 Abstract base class for output wrappers.
138 component_type
= 'output'
140 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """
    Store the output settings on the instance.

    - `destination`: the object that receives the output data.
    - `destination_path`: a text reference to the destination; a false
      value selects the class attribute `default_destination_path`.
    - `encoding`: text encoding for the output destination.
    - `error_handler`: text encoding error handler; a false value
      selects 'strict'.
    """
    self.encoding = encoding
    """Text encoding for the output destination."""

    self.error_handler = error_handler or 'strict'
    """Text encoding error handler."""

    self.destination = destination
    """The destination for output data."""

    # An empty or missing path falls back to the class-level default.
    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A text reference to the destination."""
160 return ('%s: destination=%r, destination_path=%r'
161 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """
    Abstract method: write `data` to the destination.

    `data` is a Unicode string, to be encoded by `self.encode`.
    This base implementation always raises NotImplementedError.
    """
    raise NotImplementedError
167 def encode(self
, data
):
168 if self
.encoding
and self
.encoding
.lower() == 'unicode':
169 assert isinstance(data
, unicode), (
170 'the encoding given is "unicode" but the output is not '
173 if not isinstance(data
, unicode):
174 # Non-unicode (e.g. binary) output.
177 return data
.encode(self
.encoding
, self
.error_handler
)
180 class FileInput(Input
):
183 Input for single, simple file-like objects.
185 def __init__(self
, source
=None, source_path
=None,
186 encoding
=None, error_handler
='strict',
187 autoclose
=True, handle_io_errors
=True, mode
='rU'):
190 - `source`: either a file-like object (which is read directly), or
191 `None` (which implies `sys.stdin` if no `source_path` given).
192 - `source_path`: a path to a file, which is opened and then read.
193 - `encoding`: the expected text encoding of the input file.
194 - `error_handler`: the encoding error handler to use.
195 - `autoclose`: close automatically after read (except when
196 `sys.stdin` is the source).
197 - `handle_io_errors`: summarize I/O errors here, and exit?
198 - `mode`: how the file is to be opened (see standard function
199 `open`). The default 'rU' provides universal newline support
202 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
203 self
.autoclose
= autoclose
204 self
.handle_io_errors
= handle_io_errors
205 self
._stderr
= ErrorOutput()
209 # Specify encoding in Python 3
210 if sys
.version_info
>= (3,0):
211 kwargs
= {'encoding': self
.encoding
,
212 'errors': self
.error_handler
}
217 self
.source
= open(source_path
, mode
, **kwargs
)
218 except IOError, error
:
219 if not handle_io_errors
:
221 print >>self
._stderr
, ErrorString(error
)
222 print >>self
._stderr
, (u
'Unable to open source'
223 u
" file for reading ('%s'). Exiting." % source_path
)
226 self
.source
= sys
.stdin
227 elif (sys
.version_info
>= (3,0) and
228 self
.encoding
and hasattr(self
.source
, 'encoding') and
229 self
.encoding
!= self
.source
.encoding
and
230 codecs
.lookup(self
.encoding
) !=
231 codecs
.lookup(self
.source
.encoding
)):
232 # TODO: re-open, warn or raise error?
233 raise UnicodeError('Encoding clash: encoding given is "%s" '
234 'but source is opened with encoding "%s".' %
235 (self
.encoding
, self
.source
.encoding
))
238 self
.source_path
= self
.source
.name
239 except AttributeError:
244 Read and decode a single file and return the data (Unicode string).
246 try: # In Python < 2.5, try...except has to be nested in try...finally.
248 if self
.source
is sys
.stdin
and sys
.version_info
>= (3,0):
249 # read as binary data to circumvent auto-decoding
250 data
= self
.source
.buffer.read()
252 data
= b('\n').join(data
.splitlines()) + b('\n')
254 data
= self
.source
.read()
255 except (UnicodeError, LookupError), err
: # (in Py3k read() decodes)
256 if not self
.encoding
and self
.source_path
:
257 # re-read in binary mode and decode with heuristics
258 b_source
= open(self
.source_path
, 'rb')
259 data
= b_source
.read()
262 data
= b('\n').join(data
.splitlines()) + b('\n')
268 return self
.decode(data
)
272 Return lines of a single file as list of Unicode strings.
274 return self
.read().splitlines(True)
277 if self
.source
is not sys
.stdin
:
281 class FileOutput(Output
):
284 Output for single, simple file-like objects.
287 def __init__(self
, destination
=None, destination_path
=None,
288 encoding
=None, error_handler
='strict', autoclose
=True,
289 handle_io_errors
=True):
292 - `destination`: either a file-like object (which is written
293 directly) or `None` (which implies `sys.stdout` if no
294 `destination_path` given).
295 - `destination_path`: a path to a file, which is opened and then
297 - `autoclose`: close automatically after write (except when
298 `sys.stdout` or `sys.stderr` is the destination).
300 Output
.__init
__(self
, destination
, destination_path
,
301 encoding
, error_handler
)
303 self
.autoclose
= autoclose
304 self
.handle_io_errors
= handle_io_errors
305 self
._stderr
= ErrorOutput()
306 if destination
is None:
310 self
.destination
= sys
.stdout
311 if not destination_path
:
313 self
.destination_path
= self
.destination
.name
314 except AttributeError:
318 # Specify encoding in Python 3.
319 # (Do not use binary mode ('wb') as this prevents the
320 # conversion of newlines to the system specific default.)
321 if sys
.version_info
>= (3,0):
322 kwargs
= {'encoding': self
.encoding
,
323 'errors': self
.error_handler
}
327 self
.destination
= open(self
.destination_path
, 'w', **kwargs
)
328 except IOError, error
:
329 if not self
.handle_io_errors
:
331 print >>self
._stderr
, ErrorString(error
)
332 print >>self
._stderr
, (u
'Unable to open destination file'
333 u
" for writing ('%s'). Exiting." % self
.destination_path
)
337 def write(self
, data
):
338 """Encode `data`, write it to a single file, and return it.
340 In Python 3, `data` is returned unchanged.
342 if sys
.version_info
< (3,0):
343 data
= self
.encode(data
)
346 try: # In Python < 2.5, try...except has to be nested in try...finally.
348 if (sys
.version_info
>= (3,0) and self
.encoding
and
349 hasattr(self
.destination
,'encoding') and
350 self
.encoding
!= self
.destination
.encoding
and
351 codecs
.lookup(self
.encoding
) !=
352 codecs
.lookup(self
.destination
.encoding
)):
353 # encode self, write bytes
354 bdata
= self
.encode(data
)
355 if os
.linesep
!= '\n':
356 bdata
= bdata
.replace('\n', os
.linesep
)
357 sys
.stdout
.buffer.write(bdata
)
359 self
.destination
.write(data
)
360 except (UnicodeError, LookupError), err
: # can only happen in py3k
362 'Unable to encode output data. output-encoding is: '
363 '%s.\n(%s)' % (self
.encoding
, ErrorString(err
)))
370 if self
.destination
not in (sys
.stdout
, sys
.stderr
):
371 self
.destination
.close()
375 class BinaryFileOutput(FileOutput
):
377 A version of docutils.io.FileOutput which writes to a binary file.
381 self
.destination
= open(self
.destination_path
, 'wb')
382 except IOError, error
:
383 if not self
.handle_io_errors
:
385 print >>self
._stderr
, ErrorString(error
)
386 print >>self
._stderr
, (u
'Unable to open destination file'
387 u
" for writing ('%s'). Exiting." % self
.destination_path
)
392 class StringInput(Input
):
398 default_source_path
= '<string>'
401 """Decode and return the source string."""
402 return self
.decode(self
.source
)
405 class StringOutput(Output
):
408 Direct string output.
411 default_destination_path
= '<string>'
def write(self, data):
    """Encode `data`, keep the result in `self.destination`, and return it."""
    encoded = self.encode(data)
    self.destination = encoded
    return encoded
419 class NullInput(Input
):
422 Degenerate input: read nothing.
425 default_source_path
= 'null input'
428 """Return a null string."""
432 class NullOutput(Output
):
435 Degenerate output: write nothing.
438 default_destination_path
= 'null output'
def write(self, data):
    """
    Do nothing ([don't even] send data to the bit bucket).

    `data` is accepted and ignored.
    """
445 class DocTreeInput(Input
):
448 Adapter for document tree input.
450 The document tree must be passed in the ``source`` parameter.
453 default_source_path
= 'doctree input'
456 """Return the document tree."""