2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
10 __docformat__
= 'reStructuredText'
15 from docutils
import TransformSpec
16 from docutils
._compat
import b
17 from docutils
.error_reporting
import locale_encoding
, ErrorString
, ErrorOutput
19 class Input(TransformSpec
):
22 Abstract base class for input wrappers.
25 component_type
= 'input'
27 default_source_path
= None
29 def __init__(self
, source
=None, source_path
=None, encoding
=None,
30 error_handler
='strict'):
31 self
.encoding
= encoding
32 """Text encoding for the input source."""
34 self
.error_handler
= error_handler
35 """Text decoding error handler."""
38 """The source of input data."""
40 self
.source_path
= source_path
41 """A text reference to the source."""
44 self
.source_path
= self
.default_source_path
46 self
.successful_encoding
= None
47 """The encoding that successfully decoded the source data."""
50 return '%s: source=%r, source_path=%r' % (self
.__class
__, self
.source
,
54 raise NotImplementedError
56 def decode(self
, data
):
58 Decode a string, `data`, heuristically.
59 Raise UnicodeError if unsuccessful.
61 The client application should call ``locale.setlocale`` at the
62 beginning of processing::
64 locale.setlocale(locale.LC_ALL, '')
66 if self
.encoding
and self
.encoding
.lower() == 'unicode':
67 assert isinstance(data
, unicode), (
68 'input encoding is "unicode" '
69 'but input is not a unicode object')
70 if isinstance(data
, unicode):
71 # Accept unicode even if self.encoding != 'unicode'.
74 # We believe the user/application when the encoding is
76 encodings
= [self
.encoding
]
78 data_encoding
= self
.determine_encoding_from_data(data
)
80 # If the data declares its encoding (explicitly or via a BOM),
82 encodings
= [data_encoding
]
84 # Apply heuristics only if no encoding is explicitly given and
85 # no BOM found. Start with UTF-8, because that only matches
86 # data that *IS* UTF-8:
87 encodings
= [enc
for enc
in ('utf-8',
88 locale_encoding
, # can be None
89 'latin-1') # fallback encoding
93 decoded
= unicode(data
, enc
, self
.error_handler
)
94 self
.successful_encoding
= enc
95 # Return decoded, removing BOMs.
96 return decoded
.replace(u
'\ufeff', u
'')
97 except (UnicodeError, LookupError), err
:
98 error
= err
# in Python 3, the <exception instance> is
99 # local to the except clause
101 'Unable to decode input data. Tried the following encodings: '
102 '%s.\n(%s)' % (', '.join([repr(enc
) for enc
in encodings
]),
# Raw string literal: `\s` and `\w` are regex escapes, not string escapes.
# The compiled pattern is unchanged; this only avoids relying on Python's
# handling of unrecognized backslash escapes in ordinary string literals.
coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
"""Encoding declaration pattern.

Matches ``coding`` followed by ``:`` or ``=``, optional whitespace, and
captures the run of word characters, ``-`` and ``.`` that follows as group 1.
"""
byte_order_marks = (
    (codecs.BOM_UTF8, 'utf-8'),  # actually 'utf-8-sig'
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
)
"""Pairs of (start_bytes, encoding) used for encoding detection.

The beginning of the input data is compared against each start_bytes
value; a match indicates the paired encoding."""
115 def determine_encoding_from_data(self
, data
):
117 Try to determine the encoding of `data` by looking *in* `data`.
118 Check for a byte order mark (BOM) or an encoding declaration.
120 # check for a byte order mark:
121 for start_bytes
, encoding
in self
.byte_order_marks
:
122 if data
.startswith(start_bytes
):
124 # check for an encoding declaration pattern in first 2 lines of file:
125 for line
in data
.splitlines()[:2]:
126 match
= self
.coding_slug
.search(line
)
128 return match
.group(1).decode('ascii')
132 class Output(TransformSpec
):
135 Abstract base class for output wrappers.
138 component_type
= 'output'
140 default_destination_path
= None
def __init__(self, destination=None, destination_path=None,
             encoding=None, error_handler='strict'):
    """Store the output settings on the instance.

    `destination` and `encoding` are kept exactly as passed.  A false
    `error_handler` is replaced by ``'strict'``, and a false
    `destination_path` by the class attribute `default_destination_path`.
    """
    self.encoding = encoding
    """Text encoding used for the output destination."""

    self.error_handler = error_handler if error_handler else 'strict'
    """Error handler used when encoding text."""

    self.destination = destination
    """Where the output data goes."""

    # Fall back to the class-level default when no path was supplied.
    self.destination_path = (destination_path
                             or self.default_destination_path)
    """A textual reference to the destination."""
160 return ('%s: destination=%r, destination_path=%r'
161 % (self
.__class
__, self
.destination
, self
.destination_path
))
def write(self, data):
    """Abstract hook for writing `data`; always raises here.

    `data` is a Unicode string, to be encoded by `self.encode`.
    This base implementation does nothing but raise
    NotImplementedError.
    """
    raise NotImplementedError
167 def encode(self
, data
):
168 if self
.encoding
and self
.encoding
.lower() == 'unicode':
169 assert isinstance(data
, unicode), (
170 'the encoding given is "unicode" but the output is not '
173 if not isinstance(data
, unicode):
174 # Non-unicode (e.g. binary) output.
177 return data
.encode(self
.encoding
, self
.error_handler
)
180 class FileInput(Input
):
183 Input for single, simple file-like objects.
185 def __init__(self
, source
=None, source_path
=None,
186 encoding
=None, error_handler
='strict',
187 autoclose
=True, handle_io_errors
=True, mode
='rU'):
190 - `source`: either a file-like object (which is read directly), or
191 `None` (which implies `sys.stdin` if no `source_path` given).
192 - `source_path`: a path to a file, which is opened and then read.
193 - `encoding`: the expected text encoding of the input file.
194 - `error_handler`: the encoding error handler to use.
195 - `autoclose`: close automatically after read (except when
196 `sys.stdin` is the source).
197 - `handle_io_errors`: summarize I/O errors here, and exit?
198 - `mode`: how the file is to be opened (see standard function
199 `open`). The default 'rU' provides universal newline support
202 Input
.__init
__(self
, source
, source_path
, encoding
, error_handler
)
203 self
.autoclose
= autoclose
204 self
.handle_io_errors
= handle_io_errors
205 self
._stderr
= ErrorOutput()
209 # Specify encoding in Python 3
210 if sys
.version_info
>= (3,0):
211 kwargs
= {'encoding': self
.encoding
,
212 'errors': self
.error_handler
}
217 self
.source
= open(source_path
, mode
, **kwargs
)
218 except IOError, error
:
219 if not handle_io_errors
:
221 print >>self
._stderr
, ErrorString(error
)
222 print >>self
._stderr
, (u
'Unable to open source'
223 u
" file for reading ('%s'). Exiting." % source_path
)
226 self
.source
= sys
.stdin
229 self
.source_path
= self
.source
.name
230 except AttributeError:
235 Read and decode a single file and return the data (Unicode string).
238 data
= self
.source
.read()
242 return self
.decode(data
)
246 Return lines of a single file as list of Unicode strings.
249 lines
= self
.source
.readlines()
253 return [self
.decode(line
) for line
in lines
]
256 if self
.source
is not sys
.stdin
:
260 class FileOutput(Output
):
263 Output for single, simple file-like objects.
266 def __init__(self
, destination
=None, destination_path
=None,
267 encoding
=None, error_handler
='strict', autoclose
=True,
268 handle_io_errors
=True):
271 - `destination`: either a file-like object (which is written
272 directly) or `None` (which implies `sys.stdout` if no
273 `destination_path` given).
274 - `destination_path`: a path to a file, which is opened and then
276 - `autoclose`: close automatically after write (except when
277 `sys.stdout` or `sys.stderr` is the destination).
279 Output
.__init
__(self
, destination
, destination_path
,
280 encoding
, error_handler
)
282 self
.autoclose
= autoclose
283 self
.handle_io_errors
= handle_io_errors
284 self
._stderr
= ErrorOutput()
285 if destination
is None:
289 self
.destination
= sys
.stdout
290 if not destination_path
:
292 self
.destination_path
= self
.destination
.name
293 except AttributeError:
297 # Specify encoding in Python 3.
298 # (Do not use binary mode ('wb') as this prevents the
299 # conversion of newlines to the system specific default.)
300 if sys
.version_info
>= (3,0):
301 kwargs
= {'encoding': self
.encoding
,
302 'errors': self
.error_handler
}
307 self
.destination
= open(self
.destination_path
, 'w', **kwargs
)
308 except IOError, error
:
309 if not self
.handle_io_errors
:
311 print >>self
._stderr
, ErrorString(error
)
312 print >>self
._stderr
, (u
'Unable to open destination file'
313 u
" for writing ('%s'). Exiting." % self
.destination_path
)
317 def write(self
, data
):
318 """Encode `data`, write it to a single file, and return it.
320 In Python 3, a (unicode) string is returned.
322 if sys
.version_info
>= (3,0):
323 output
= data
# in py3k, write expects a (Unicode) string
325 output
= self
.encode(data
)
329 self
.destination
.write(output
)
336 if self
.destination
not in (sys
.stdout
, sys
.stderr
):
337 self
.destination
.close()
341 class BinaryFileOutput(FileOutput
):
343 A version of docutils.io.FileOutput which writes to a binary file.
347 self
.destination
= open(self
.destination_path
, 'wb')
348 except IOError, error
:
349 if not self
.handle_io_errors
:
351 print >>self
._stderr
, ErrorString(error
)
352 print >>self
._stderr
, (u
'Unable to open destination file'
353 u
" for writing ('%s'). Exiting." % self
.destination_path
)
358 class StringInput(Input
):
364 default_source_path
= '<string>'
367 """Decode and return the source string."""
368 return self
.decode(self
.source
)
class StringOutput(Output):

    """
    Output wrapper that keeps its result as a string.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Run `data` through `self.encode`, keep the result in
        `self.destination`, and return that same result."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
385 class NullInput(Input
):
388 Degenerate input: read nothing.
391 default_source_path
= 'null input'
394 """Return a null string."""
class NullOutput(Output):

    """
    Degenerate output wrapper: writes nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing with `data` (it is not even sent to the bit bucket)."""
        return None
411 class DocTreeInput(Input
):
414 Adapter for document tree input.
416 The document tree must be passed in the ``source`` parameter.
419 default_source_path
= 'doctree input'
422 """Return the document tree."""