Fix bug 2896512 and add some more test cases.
[docutils.git] / docutils / io.py
blob9523142fd789da77eb61148b2dd4ee71b482dc79
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 try:
14 import locale
15 except:
16 pass
17 import re
18 from docutils import TransformSpec
19 from docutils._compat import b
class Input(TransformSpec):

    """
    Abstract base class for input wrappers.

    Subclasses override `read()`; `decode()` converts non-Unicode data
    into Unicode text.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Read the input.  Must be overridden by subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode a string, `data`, heuristically.
        Raise UnicodeError if unsuccessful.

        Unicode input is returned unchanged.  For other input the
        encodings to try are chosen as follows:

        1. `self.encoding`, if set, is the only candidate.
        2. Otherwise an encoding found by `determine_encoding_from_data()`
           (BOM or coding declaration) is the only candidate.
        3. Otherwise 'utf-8', then the locale's encodings, then 'latin-1'.

        The first encoding that works is stored in
        `self.successful_encoding`; U+FEFF characters are removed from the
        result.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'input encoding is "unicode" '
                'but input is not a unicode object')
        if isinstance(data, unicode):
            # Accept unicode even if self.encoding != 'unicode'.
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encodings = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encodings = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found. Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encodings = ['utf-8']
                # The locale probes are best effort: the `locale` module
                # itself may be missing (its import is optional) or a query
                # may fail.  Catch `Exception` rather than everything, so
                # that KeyboardInterrupt and SystemExit are not swallowed.
                try:
                    # for Python 2.2 compatibility
                    encodings.append(locale.nl_langinfo(locale.CODESET))
                except Exception:
                    pass
                try:
                    encodings.append(locale.getlocale()[1])
                except Exception:
                    pass
                try:
                    encodings.append(locale.getdefaultlocale()[1])
                except Exception:
                    pass
                # fallback encoding:
                encodings.append('latin-1')
        error = None
        error_details = ''
        for enc in encodings:
            if not enc:
                # The locale queries may have contributed `None`.
                continue
            try:
                decoded = unicode(data, enc, self.error_handler)
                self.successful_encoding = enc
                # Return decoded, removing BOMs.
                return decoded.replace(u'\ufeff', u'')
            except (UnicodeError, LookupError):
                # Keep the last failure for the final error message.
                # sys.exc_info() is used so that the clause parses under
                # both Python 2 and Python 3 syntax.
                error = sys.exc_info()[1]
        if error is not None:
            error_details = '\n(%s: %s)' % (error.__class__.__name__, error)
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            '%s.%s'
            % (', '.join([repr(enc) for enc in encodings if enc]),
               error_details))

    coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
    """Encoding declaration pattern."""

    byte_order_marks = ((b('\xef\xbb\xbf'), 'utf-8'),
                        (b('\xfe\xff'), 'utf-16-be'),
                        (b('\xff\xfe'), 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name, or None if neither is found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Subclasses override `write()`; `encode()` converts Unicode text with
    the configured encoding and error handler.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        # A false `error_handler` (e.g. None) falls back to 'strict'.
        if error_handler:
            self.error_handler = error_handler
        else:
            self.error_handler = 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # A false `destination_path` is replaced by the class default.
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded for the destination.

        With the pseudo-encoding "unicode" the data must already be a
        Unicode string and is returned unchanged.  Non-Unicode data is
        passed through untouched; Unicode data is encoded with
        `self.encoding` and `self.error_handler`.
        """
        unicode_requested = (self.encoding
                             and self.encoding.lower() == 'unicode')
        if unicode_requested:
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. binary) output.
        return data
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """

    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=1, handle_io_errors=1, mode='rU'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (boolean); always
              false if `sys.stdin` is the source.
            - `handle_io_errors`: summarize I/O errors here, and exit?
            - `mode`: how the file is to be opened (see standard function
              `open`). The default 'rU' provides universal newline support
              for text files.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self.handle_io_errors = handle_io_errors
        if source is None:
            if source_path:
                try:
                    self.source = open(source_path, mode)
                except IOError:
                    if not handle_io_errors:
                        raise
                    # sys.exc_info() keeps this clause valid under both
                    # Python 2 and Python 3 syntax.
                    error = sys.exc_info()[1]
                    print >>sys.stderr, '%s: %s' % (error.__class__.__name__,
                                                    error)
                    print >>sys.stderr, ('Unable to open source file for '
                                         "reading ('%s'). Exiting." %
                                         source_path)
                    sys.exit(1)
            else:
                self.source = sys.stdin
                # Never close standard input.
                self.autoclose = None
        if not source_path:
            # Fall back to the file object's own name, if it has one.
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        The source is closed afterwards if `self.autoclose` is true, even
        when reading fails.
        """
        try:
            data = self.source.read()
        finally:
            if self.autoclose:
                self.close()
        return self.decode(data)

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.

        The data is decoded as a whole (see `read()`) and split afterwards.
        Decoding line by line would repeat the BOM / coding-declaration
        detection and the heuristic fallback for every line, so lines of one
        file could end up decoded with different encodings, and multi-byte
        encodings such as UTF-16 would be cut in the middle of a character.

        Lines are split at '\\n' and keep their terminator, like the lines
        returned by a file object's ``readlines()``.
        """
        pieces = self.read().split(u'\n')
        # Every piece but the last one was terminated by a newline.
        lines = [piece + u'\n' for piece in pieces[:-1]]
        if pieces[-1]:
            # Trailing text without a final newline.
            lines.append(pieces[-1])
        return lines

    def close(self):
        """Close the underlying source."""
        self.source.close()
class FileOutput(Output):

    """
    Output for single, simple file-like objects.
    """

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=1,
                 handle_io_errors=1):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `autoclose`: close automatically after write (boolean); always
              false if `sys.stdout` is the destination.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = 1
        self.autoclose = autoclose
        self.handle_io_errors = handle_io_errors
        if destination is None:
            if not destination_path:
                # Nothing given at all: write to standard output and
                # never close it.
                self.destination = sys.stdout
                self.autoclose = None
            else:
                # Only a path was given; `write()` opens the file on demand.
                self.opened = None
        if not destination_path:
            # Adopt the file object's own name when it has one; otherwise
            # keep the path set by `Output.__init__`.
            self.destination_path = getattr(self.destination, 'name',
                                            self.destination_path)

    def open(self):
        """
        Open `self.destination_path` for writing (mode 'w').

        On IOError, either re-raise (if `self.handle_io_errors` is false) or
        report the problem on standard error and exit with status 1.
        """
        try:
            self.destination = open(self.destination_path, 'w')
        except IOError:
            if not self.handle_io_errors:
                raise
            failure = sys.exc_info()[1]
            print >>sys.stderr, '%s: %s' % (failure.__class__.__name__,
                                            failure)
            print >>sys.stderr, ('Unable to open destination file for writing'
                                 " ('%s'). Exiting." % self.destination_path)
            sys.exit(1)
        self.opened = 1

    def write(self, data):
        """Encode `data`, write it to a single file, and return it."""
        encoded = self.encode(data)
        if not self.opened:
            self.open()
        try:
            self.destination.write(encoded)
        finally:
            # Close even if writing failed, when auto-closing is requested.
            if self.autoclose:
                self.close()
        return encoded

    def close(self):
        """Close the destination and mark it as not opened."""
        self.destination.close()
        self.opened = None
class BinaryFileOutput(FileOutput):

    """
    A version of docutils.io.FileOutput which writes to a binary file.
    """

    def open(self):
        """
        Open `self.destination_path` in binary write mode ('wb').

        On IOError, either re-raise (if `self.handle_io_errors` is false) or
        report the problem on standard error and exit with status 1.
        """
        try:
            self.destination = open(self.destination_path, 'wb')
        except IOError:
            if not self.handle_io_errors:
                raise
            failure = sys.exc_info()[1]
            print >>sys.stderr, '%s: %s' % (failure.__class__.__name__,
                                            failure)
            print >>sys.stderr, ('Unable to open destination file for writing '
                                 "('%s'). Exiting." % self.destination_path)
            sys.exit(1)
        self.opened = 1
class StringInput(Input):

    """
    Direct string input.
    """

    default_source_path = '<string>'

    def read(self):
        """Return the source string, decoded by `self.decode`."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Direct string output.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Encode `data`, store it in `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return self.destination
class NullInput(Input):

    """
    Degenerate input: read nothing.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string; the source is never consulted."""
        nothing = u''
        return nothing
class NullOutput(Output):

    """
    Degenerate output: write nothing.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data`: nothing is encoded, stored, or written."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree held in `self.source`, unchanged."""
        doctree = self.source
        return doctree