2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
19 from docutils
import TransformSpec
20 from docutils
._compat
import b
23 class Input(TransformSpec
):
26 Abstract base class for input wrappers.
29 component_type
= 'input'
31 default_source_path
= None
33 def __init__(self
, source
=None, source_path
=None, encoding
=None,
34 error_handler
='strict'):
35 self
.encoding
= encoding
36 """Text encoding for the input source."""
38 self
.error_handler
= error_handler
39 """Text decoding error handler."""
42 """The source of input data."""
44 self
.source_path
= source_path
45 """A text reference to the source."""
48 self
.source_path
= self
.default_source_path
50 self
.successful_encoding
= None
51 """The encoding that successfully decoded the source data."""
54 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
58 raise NotImplementedError
60 def decode(self
, data
):
62 Decode a string, `data`, heuristically.
63 Raise UnicodeError if unsuccessful.
65 The client application should call ``locale.setlocale`` at the
66 beginning of processing::
68 locale.setlocale(locale.LC_ALL, '')
70 if self
.encoding
and self
.encoding
.lower() == 'unicode':
71 assert isinstance(data
, unicode), (
72 'input encoding is "unicode" '
73 'but input is not a unicode object')
74 if isinstance(data
, unicode):
75 # Accept unicode even if self.encoding != 'unicode'.
78 # We believe the user/application when the encoding is
80 encodings
= [self
.encoding
]
82 data_encoding
= self
.determine_encoding_from_data(data
)
84 # If the data declares its encoding (explicitly or via a BOM),
86 encodings
= [data_encoding
]
88 # Apply heuristics only if no encoding is explicitly given and
89 # no BOM found. Start with UTF-8, because that only matches
90 # data that *IS* UTF-8:
93 encodings
.append(locale
.getlocale()[1])
97 encodings
.append(locale
.getdefaultlocale()[1])
101 encodings
.append('latin-1')
104 for enc
in encodings
:
108 decoded
= unicode(data
, enc
, self
.error_handler
)
109 self
.successful_encoding
= enc
110 # Return decoded, removing BOMs.
111 return decoded
.replace(u
'\ufeff', u
'')
112 except (UnicodeError, LookupError), tmperror
:
113 error
= tmperror
# working around Python 3 deleting the
114 # error variable after the except clause
115 if error
is not None:
116 error_details
= '\n(%s: %s)' % (error
.__class
__.__name
__, error
)
118 'Unable to decode input data. Tried the following encodings: '
120 % (', '.join([repr(enc
) for enc
in encodings
if enc
]),
# Raw string literal, so the regex escapes (\s, \w) reach `re` unchanged
# instead of relying on Python passing unknown string escapes through.
# NOTE(review): `b()` comes from docutils._compat and presumably turns the
# pattern into a byte string so it can be matched against undecoded data
# -- confirm against that module.
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern."""

byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig'
                    (codecs.BOM_UTF16_BE, 'utf-16-be'),
                    (codecs.BOM_UTF16_LE, 'utf-16-le'),)
"""Sequence of (start_bytes, encoding) tuples for encoding detection.
The first bytes of input data are checked against the start_bytes strings.
A match indicates the given encoding."""
133 def determine_encoding_from_data(self
, data
):
135 Try to determine the encoding of `data` by looking *in* `data`.
136 Check for a byte order mark (BOM) or an encoding declaration.
138 # check for a byte order mark:
139 for start_bytes
, encoding
in self
.byte_order_marks
:
140 if data
.startswith(start_bytes
):
142 # check for an encoding declaration pattern in first 2 lines of file:
143 for line
in data
.splitlines()[:2]:
144 match
= self
.coding_slug
.search(line
)
146 return match
.group(1).decode('ascii')
150 class Output(TransformSpec
):
153 Abstract base class for output wrappers.
156 component_type
= 'output'
158 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """
    Record the output destination and the encoding settings.

    - `destination`: the destination for output data.
    - `destination_path`: a text reference to the destination; a false
      value selects the class attribute `default_destination_path`.
    - `encoding`: text encoding for the output destination.
    - `error_handler`: text encoding error handler; a false value
      selects 'strict'.
    """
    self.encoding = encoding
    """Text encoding for the output destination."""

    self.error_handler = error_handler or 'strict'
    """Text encoding error handler."""

    self.destination = destination
    """The destination for output data."""

    # Fall back to the class-level default when no path is supplied.
    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A text reference to the destination."""
178 return ('%s: destination=%r, destination_path=%r'
179 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """
    Write `data`, a Unicode string that is to be encoded by `self.encode`.

    This base implementation is a placeholder: it always raises
    NotImplementedError.
    """
    raise NotImplementedError()
185 def encode(self
, data
):
186 if self
.encoding
and self
.encoding
.lower() == 'unicode':
187 assert isinstance(data
, unicode), (
188 'the encoding given is "unicode" but the output is not '
191 if not isinstance(data
, unicode):
192 # Non-unicode (e.g. binary) output.
195 return data
.encode(self
.encoding
, self
.error_handler
)
198 class FileInput(Input
):
201 Input for single, simple file-like objects.
204 def __init__(self
, source
=None, source_path
=None,
205 encoding
=None, error_handler
='strict',
206 autoclose
=1, handle_io_errors
=1, mode
='rU'):
209 - `source`: either a file-like object (which is read directly), or
210 `None` (which implies `sys.stdin` if no `source_path` given).
211 - `source_path`: a path to a file, which is opened and then read.
212 - `encoding`: the expected text encoding of the input file.
213 - `error_handler`: the encoding error handler to use.
214 - `autoclose`: close automatically after read (boolean); always
215 false if `sys.stdin` is the source.
216 - `handle_io_errors`: summarize I/O errors here, and exit?
217 - `mode`: how the file is to be opened (see standard function
218 `open`). The default 'rU' provides universal newline support
221 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
222 self
.autoclose
= autoclose
223 self
.handle_io_errors
= handle_io_errors
226 # Specify encoding in Python 3
227 if sys
.version_info
>= (3,0):
228 kwargs
= {'encoding': self
.encoding
,
229 'errors': self
.error_handler
}
234 self
.source
= open(source_path
, mode
, **kwargs
)
235 except IOError, error
:
236 if not handle_io_errors
:
238 print >>sys
.stderr
, '%s: %s' % (error
.__class
__.__name
__,
240 print >>sys
.stderr
, ('Unable to open source file for '
241 "reading ('%s'). Exiting." %
245 self
.source
= sys
.stdin
246 self
.autoclose
= None
249 self
.source_path
= self
.source
.name
250 except AttributeError:
255 Read and decode a single file and return the data (Unicode string).
258 data
= self
.source
.read()
262 return self
.decode(data
)
266 Return lines of a single file as list of Unicode strings.
269 lines
= self
.source
.readlines()
273 return [self
.decode(line
) for line
in lines
]
279 class FileOutput(Output
):
282 Output for single, simple file-like objects.
285 def __init__(self
, destination
=None, destination_path
=None,
286 encoding
=None, error_handler
='strict', autoclose
=1,
290 - `destination`: either a file-like object (which is written
291 directly) or `None` (which implies `sys.stdout` if no
292 `destination_path` given).
293 - `destination_path`: a path to a file, which is opened and then
295 - `autoclose`: close automatically after write (boolean); always
296 false if `sys.stdout` is the destination.
298 Output
.__init
__(self
, destination
, destination_path
,
299 encoding
, error_handler
)
301 self
.autoclose
= autoclose
302 self
.handle_io_errors
= handle_io_errors
303 if destination
is None:
307 self
.destination
= sys
.stdout
308 self
.autoclose
= None
309 if not destination_path
:
311 self
.destination_path
= self
.destination
.name
312 except AttributeError:
316 # Specify encoding in Python 3.
317 # (Do not use binary mode ('wb') as this prevents the
318 # conversion of newlines to the system specific default.)
319 if sys
.version_info
>= (3,0):
320 kwargs
= {'encoding': self
.encoding
,
321 'errors': self
.error_handler
}
326 self
.destination
= open(self
.destination_path
, 'w', **kwargs
)
327 except IOError, error
:
328 if not self
.handle_io_errors
:
330 print >>sys
.stderr
, '%s: %s' % (error
.__class
__.__name
__,
332 print >>sys
.stderr
, ('Unable to open destination file for writing'
333 " ('%s'). Exiting." % self
.destination_path
)
337 def write(self
, data
):
338 """Encode `data`, write it to a single file, and return it.
340 In Python 3, a (unicode) String is returned.
342 if sys
.version_info
>= (3,0):
343 output
= data
# in py3k, write expects a (Unicode) string
345 output
= self
.encode(data
)
349 self
.destination
.write(output
)
356 self
.destination
.close()
360 class BinaryFileOutput(FileOutput
):
362 A version of docutils.io.FileOutput which writes to a binary file.
366 self
.destination
= open(self
.destination_path
, 'wb')
367 except IOError, error
:
368 if not self
.handle_io_errors
:
370 print >>sys
.stderr
, '%s: %s' % (error
.__class
__.__name
__,
372 print >>sys
.stderr
, ('Unable to open destination file for writing '
373 "('%s'). Exiting." % self
.destination_path
)
378 class StringInput(Input
):
384 default_source_path
= '<string>'
387 """Decode and return the source string."""
388 return self
.decode(self
.source
)
class StringOutput(Output):

    """
    Output wrapper that keeps the encoded result in `self.destination`.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, keep the result in `self.destination`, return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
405 class NullInput(Input
):
408 Degenerate input: read nothing.
411 default_source_path
= 'null input'
414 """Return a null string."""
class NullOutput(Output):

    """
    Degenerate output wrapper: whatever is written is discarded.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Ignore `data`; nothing is written anywhere."""
        return None
431 class DocTreeInput(Input
):
434 Adapter for document tree input.
436 The document tree must be passed in the ``source`` parameter.
439 default_source_path
= 'doctree input'
442 """Return the document tree."""