2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
18 from types
import UnicodeType
19 from docutils
import TransformSpec
22 class Input(TransformSpec
):
25 Abstract base class for input wrappers.
28 component_type
= 'input'
30 default_source_path
= None
32 def __init__(self
, source
=None, source_path
=None, encoding
=None,
33 error_handler
='strict'):
34 self
.encoding
= encoding
35 """Text encoding for the input source."""
37 self
.error_handler
= error_handler
38 """Text decoding error handler."""
41 """The source of input data."""
43 self
.source_path
= source_path
44 """A text reference to the source."""
47 self
.source_path
= self
.default_source_path
49 self
.successful_encoding
= None
50 """The encoding that successfully decoded the source data."""
53 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
57 raise NotImplementedError
59 def decode(self
, data
):
61 Decode a string, `data`, heuristically.
62 Raise UnicodeError if unsuccessful.
64 The client application should call ``locale.setlocale`` at the
65 beginning of processing::
67 locale.setlocale(locale.LC_ALL, '')
69 if self
.encoding
and self
.encoding
.lower() == 'unicode':
70 assert isinstance(data
, UnicodeType
), (
71 'input encoding is "unicode" '
72 'but input is not a unicode object')
73 if isinstance(data
, UnicodeType
):
74 # Accept unicode even if self.encoding != 'unicode'.
77 # We believe the user/application when the encoding is
79 encodings
= [self
.encoding
]
81 data_encoding
= self
.determine_encoding_from_data(data
)
83 # If the data declares its encoding (explicitly or via a BOM),
85 encodings
= [data_encoding
]
87 # Apply heuristics only if no encoding is explicitly given and
88 # no BOM found. Start with UTF-8, because that only matches
89 # data that *IS* UTF-8:
92 # for Python 2.2 compatibility
93 encodings
.append(locale
.nl_langinfo(locale
.CODESET
))
97 encodings
.append(locale
.getlocale()[1])
101 encodings
.append(locale
.getdefaultlocale()[1])
105 encodings
.append('latin-1')
108 for enc
in encodings
:
112 decoded
= unicode(data
, enc
, self
.error_handler
)
113 self
.successful_encoding
= enc
114 # Return decoded, removing BOMs.
115 return decoded
.replace(u
'\ufeff', u
'')
116 except (UnicodeError, LookupError), error
:
118 if error
is not None:
119 error_details
= '\n(%s: %s)' % (error
.__class
__.__name
__, error
)
121 'Unable to decode input data. Tried the following encodings: '
123 % (', '.join([repr(enc
) for enc
in encodings
if enc
]),
# Matches an encoding declaration such as "coding: utf-8" or
# "coding=latin-1"; group 1 captures the encoding name.
# Written as a raw string so that the regex escapes \s and \w reach the
# `re` module untouched instead of being treated as (invalid) string
# escape sequences.
coding_slug = re.compile(r"coding[:=]\s*([-\w.]+)")
"""Encoding declaration pattern."""

byte_order_marks = (('\xef\xbb\xbf', 'utf-8'),
                    ('\xfe\xff', 'utf-16-be'),
                    ('\xff\xfe', 'utf-16-le'),)
"""Sequence of (start_bytes, encoding) tuples for encoding detection.
The first bytes of input data are checked against the start_bytes strings.
A match indicates the given encoding."""
136 def determine_encoding_from_data(self
, data
):
138 Try to determine the encoding of `data` by looking *in* `data`.
139 Check for a byte order mark (BOM) or an encoding declaration.
141 # check for a byte order mark:
142 for start_bytes
, encoding
in self
.byte_order_marks
:
143 if data
.startswith(start_bytes
):
145 # check for an encoding declaration pattern in first 2 lines of file:
146 for line
in data
.splitlines()[:2]:
147 match
= self
.coding_slug
.search(line
)
149 return match
.group(1)
153 class Output(TransformSpec
):
156 Abstract base class for output wrappers.
159 component_type
= 'output'
161 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """
    Store the destination, its textual path, and the encoding settings.

    A false `error_handler` is replaced by 'strict'; a false
    `destination_path` is replaced by the class attribute
    `default_destination_path`.
    """
    self.destination = destination
    """The destination for output data."""

    # Fall back to the class-level default when no path was supplied.
    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A text reference to the destination."""

    self.encoding = encoding
    """Text encoding for the output destination."""

    self.error_handler = error_handler or 'strict'
    """Text encoding error handler."""
181 return ('%s: destination=%r, destination_path=%r'
182 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """
    Write `data`, a Unicode string to be encoded by `self.encode`.

    This base implementation is a placeholder and always raises
    NotImplementedError.
    """
    raise NotImplementedError()
188 def encode(self
, data
):
189 if self
.encoding
and self
.encoding
.lower() == 'unicode':
190 assert isinstance(data
, UnicodeType
), (
191 'the encoding given is "unicode" but the output is not '
194 if not isinstance(data
, UnicodeType
):
195 # Non-unicode (e.g. binary) output.
199 return data
.encode(self
.encoding
, self
.error_handler
)
200 except (LookupError, ValueError):
201 # LookupError is raised if there are unencodable chars
202 # in data and the error_handler isn't found. In old
203 # Python versions, ValueError is raised.
204 if self
.error_handler
== 'xmlcharrefreplace':
205 # We are using xmlcharrefreplace with a Python
206 # version that doesn't support it (2.1, 2.2, or
207 # IronPython 1.0) so we emulate its behavior.
208 return ''.join([self
.xmlcharref_encode(char
)
213 def xmlcharref_encode(self
, char
):
214 """Emulate Python 2.3's 'xmlcharrefreplace' encoding error handler."""
216 return char
.encode(self
.encoding
, 'strict')
218 return '&#%i;' % ord(char
)
221 class FileInput(Input
):
224 Input for single, simple file-like objects.
227 def __init__(self
, source
=None, source_path
=None,
228 encoding
=None, error_handler
='strict',
229 autoclose
=1, handle_io_errors
=1):
232 - `source`: either a file-like object (which is read directly), or
233 `None` (which implies `sys.stdin` if no `source_path` given).
234 - `source_path`: a path to a file, which is opened and then read.
235 - `encoding`: the expected text encoding of the input file.
236 - `error_handler`: the encoding error handler to use.
237 - `autoclose`: close automatically after read (boolean); always
238 false if `sys.stdin` is the source.
239 - `handle_io_errors`: summarize I/O errors here, and exit?
241 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
242 self
.autoclose
= autoclose
243 self
.handle_io_errors
= handle_io_errors
247 self
.source
= open(source_path
)
248 except IOError, error
:
249 if not handle_io_errors
:
251 print >>sys
.stderr
, '%s: %s' % (error
.__class
__.__name
__,
253 print >>sys
.stderr
, (
254 'Unable to open source file for reading (%r). Exiting.'
258 self
.source
= sys
.stdin
259 self
.autoclose
= None
262 self
.source_path
= self
.source
.name
263 except AttributeError:
268 Read and decode a single file and return the data (Unicode string).
271 data
= self
.source
.read()
275 return self
.decode(data
)
281 class FileOutput(Output
):
284 Output for single, simple file-like objects.
287 def __init__(self
, destination
=None, destination_path
=None,
288 encoding
=None, error_handler
='strict', autoclose
=1,
292 - `destination`: either a file-like object (which is written
293 directly) or `None` (which implies `sys.stdout` if no
294 `destination_path` given).
295 - `destination_path`: a path to a file, which is opened and then
297 - `autoclose`: close automatically after write (boolean); always
298 false if `sys.stdout` is the destination.
300 Output
.__init
__(self
, destination
, destination_path
,
301 encoding
, error_handler
)
303 self
.autoclose
= autoclose
304 self
.handle_io_errors
= handle_io_errors
305 if destination
is None:
309 self
.destination
= sys
.stdout
310 self
.autoclose
= None
311 if not destination_path
:
313 self
.destination_path
= self
.destination
.name
314 except AttributeError:
319 self
.destination
= open(self
.destination_path
, 'w')
320 except IOError, error
:
321 if not self
.handle_io_errors
:
323 print >>sys
.stderr
, '%s: %s' % (error
.__class
__.__name
__,
325 print >>sys
.stderr
, ('Unable to open destination file for writing '
326 '(%r). Exiting.' % self
.destination_path
)
330 def write(self
, data
):
331 """Encode `data`, write it to a single file, and return it."""
332 output
= self
.encode(data
)
336 self
.destination
.write(output
)
343 self
.destination
.close()
347 class StringInput(Input
):
353 default_source_path
= '<string>'
356 """Decode and return the source string."""
357 return self
.decode(self
.source
)
360 class StringOutput(Output
):
363 Direct string output.
366 default_destination_path
= '<string>'
def write(self, data):
    """Run `data` through `self.encode`, keep the result in
    `self.destination`, and hand the same result back to the caller."""
    encoded = self.encode(data)
    self.destination = encoded
    return encoded
374 class NullInput(Input
):
377 Degenerate input: read nothing.
380 default_source_path
= 'null input'
383 """Return a null string."""
387 class NullOutput(Output
):
390 Degenerate output: write nothing.
393 default_destination_path
= 'null output'
def write(self, data):
    """Discard `data`: nothing is written, not even to the bit bucket."""
    return None
400 class DocTreeInput(Input
):
403 Adapter for document tree input.
405 The document tree must be passed in the ``source`` parameter.
408 default_source_path
= 'doctree input'
411 """Return the document tree."""