use failUnless instead of assertTrue for 2.3 compatibility
[docutils.git] / docutils / io.py
blob66f22a15ddfc88fad3acbc6962cbf1841be3b88d
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 will exist for a variety of input/output mechanisms.
8 """
10 __docformat__ = 'reStructuredText'
12 import sys
13 try:
14 import locale
15 except:
16 pass
17 import re
18 import codecs
19 from docutils import TransformSpec
20 from docutils._compat import b
23 class Input(TransformSpec):
25 """
26 Abstract base class for input wrappers.
27 """
29 component_type = 'input'
31 default_source_path = None
33 def __init__(self, source=None, source_path=None, encoding=None,
34 error_handler='strict'):
35 self.encoding = encoding
36 """Text encoding for the input source."""
38 self.error_handler = error_handler
39 """Text decoding error handler."""
41 self.source = source
42 """The source of input data."""
44 self.source_path = source_path
45 """A text reference to the source."""
47 if not source_path:
48 self.source_path = self.default_source_path
50 self.successful_encoding = None
51 """The encoding that successfully decoded the source data."""
53 def __repr__(self):
54 return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
55 self.source_path)
57 def read(self):
58 raise NotImplementedError
60 def decode(self, data):
61 """
62 Decode a string, `data`, heuristically.
63 Raise UnicodeError if unsuccessful.
65 The client application should call ``locale.setlocale`` at the
66 beginning of processing::
68 locale.setlocale(locale.LC_ALL, '')
69 """
70 if self.encoding and self.encoding.lower() == 'unicode':
71 assert isinstance(data, unicode), (
72 'input encoding is "unicode" '
73 'but input is not a unicode object')
74 if isinstance(data, unicode):
75 # Accept unicode even if self.encoding != 'unicode'.
76 return data
77 if self.encoding:
78 # We believe the user/application when the encoding is
79 # explicitly given.
80 encodings = [self.encoding]
81 else:
82 data_encoding = self.determine_encoding_from_data(data)
83 if data_encoding:
84 # If the data declares its encoding (explicitly or via a BOM),
85 # we believe it.
86 encodings = [data_encoding]
87 else:
88 # Apply heuristics only if no encoding is explicitly given and
89 # no BOM found. Start with UTF-8, because that only matches
90 # data that *IS* UTF-8:
91 encodings = ['utf-8']
92 try:
93 encodings.append(locale.getlocale()[1])
94 except:
95 pass
96 try:
97 encodings.append(locale.getdefaultlocale()[1])
98 except:
99 pass
100 # fallback encoding:
101 encodings.append('latin-1')
102 error = None
103 error_details = ''
104 for enc in encodings:
105 if not enc:
106 continue
107 try:
108 decoded = unicode(data, enc, self.error_handler)
109 self.successful_encoding = enc
110 # Return decoded, removing BOMs.
111 return decoded.replace(u'\ufeff', u'')
112 except (UnicodeError, LookupError), tmperror:
113 error = tmperror # working around Python 3 deleting the
114 # error variable after the except clause
115 if error is not None:
116 error_details = '\n(%s: %s)' % (error.__class__.__name__, error)
117 raise UnicodeError(
118 'Unable to decode input data. Tried the following encodings: '
119 '%s.%s'
120 % (', '.join([repr(enc) for enc in encodings if enc]),
121 error_details))
123 coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
124 """Encoding declaration pattern."""
126 byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # actually 'utf-8-sig'
127 (codecs.BOM_UTF16_BE, 'utf-16-be'),
128 (codecs.BOM_UTF16_LE, 'utf-16-le'),)
129 """Sequence of (start_bytes, encoding) tuples for encoding detection.
130 The first bytes of input data are checked against the start_bytes strings.
131 A match indicates the given encoding."""
133 def determine_encoding_from_data(self, data):
135 Try to determine the encoding of `data` by looking *in* `data`.
136 Check for a byte order mark (BOM) or an encoding declaration.
138 # check for a byte order mark:
139 for start_bytes, encoding in self.byte_order_marks:
140 if data.startswith(start_bytes):
141 return encoding
142 # check for an encoding declaration pattern in first 2 lines of file:
143 for line in data.splitlines()[:2]:
144 match = self.coding_slug.search(line)
145 if match:
146 return match.group(1).decode('ascii')
147 return None
class Output(TransformSpec):

    """
    Abstract base class for output wrappers.

    Concrete subclasses supply `write()`; `encode()` is the common helper
    that converts Unicode text for the destination.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # A false `destination_path` falls back to the class-level default.
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """
        Write `data`, a Unicode string that is to be passed through
        `self.encode`.  Not implemented here; subclasses override it.
        """
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` prepared for the destination.

        With the pseudo-encoding "unicode", `data` must be a Unicode string
        and is returned as is.  Otherwise Unicode strings are encoded with
        `self.encoding` and `self.error_handler`, and anything else
        (e.g. binary output) is passed through untouched.
        """
        is_text = isinstance(data, unicode)
        if self.encoding and self.encoding.lower() == 'unicode':
            assert is_text, (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if is_text:
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. binary) output.
        return data
198 class FileInput(Input):
201 Input for single, simple file-like objects.
204 def __init__(self, source=None, source_path=None,
205 encoding=None, error_handler='strict',
206 autoclose=1, handle_io_errors=1, mode='rU'):
208 :Parameters:
209 - `source`: either a file-like object (which is read directly), or
210 `None` (which implies `sys.stdin` if no `source_path` given).
211 - `source_path`: a path to a file, which is opened and then read.
212 - `encoding`: the expected text encoding of the input file.
213 - `error_handler`: the encoding error handler to use.
214 - `autoclose`: close automatically after read (boolean); always
215 false if `sys.stdin` is the source.
216 - `handle_io_errors`: summarize I/O errors here, and exit?
217 - `mode`: how the file is to be opened (see standard function
218 `open`). The default 'rU' provides universal newline support
219 for text files.
221 Input.__init__(self, source, source_path, encoding, error_handler)
222 self.autoclose = autoclose
223 self.handle_io_errors = handle_io_errors
224 if source is None:
225 if source_path:
226 # Specify encoding in Python 3
227 if sys.version_info >= (3,0):
228 kwargs = {'encoding': self.encoding,
229 'errors': self.error_handler}
230 else:
231 kwargs = {}
233 try:
234 self.source = open(source_path, mode, **kwargs)
235 except IOError, error:
236 if not handle_io_errors:
237 raise
238 print >>sys.stderr, '%s: %s' % (error.__class__.__name__,
239 error)
240 print >>sys.stderr, ('Unable to open source file for '
241 "reading ('%s'). Exiting." %
242 source_path)
243 sys.exit(1)
244 else:
245 self.source = sys.stdin
246 self.autoclose = None
247 if not source_path:
248 try:
249 self.source_path = self.source.name
250 except AttributeError:
251 pass
253 def read(self):
255 Read and decode a single file and return the data (Unicode string).
257 try:
258 data = self.source.read()
259 finally:
260 if self.autoclose:
261 self.close()
262 return self.decode(data)
264 def readlines(self):
266 Return lines of a single file as list of Unicode strings.
268 try:
269 lines = self.source.readlines()
270 finally:
271 if self.autoclose:
272 self.close()
273 return [self.decode(line) for line in lines]
275 def close(self):
276 self.source.close()
279 class FileOutput(Output):
282 Output for single, simple file-like objects.
285 def __init__(self, destination=None, destination_path=None,
286 encoding=None, error_handler='strict', autoclose=1,
287 handle_io_errors=1):
289 :Parameters:
290 - `destination`: either a file-like object (which is written
291 directly) or `None` (which implies `sys.stdout` if no
292 `destination_path` given).
293 - `destination_path`: a path to a file, which is opened and then
294 written.
295 - `autoclose`: close automatically after write (boolean); always
296 false if `sys.stdout` is the destination.
298 Output.__init__(self, destination, destination_path,
299 encoding, error_handler)
300 self.opened = 1
301 self.autoclose = autoclose
302 self.handle_io_errors = handle_io_errors
303 if destination is None:
304 if destination_path:
305 self.opened = None
306 else:
307 self.destination = sys.stdout
308 self.autoclose = None
309 if not destination_path:
310 try:
311 self.destination_path = self.destination.name
312 except AttributeError:
313 pass
315 def open(self):
316 # Specify encoding in Python 3.
317 # (Do not use binary mode ('wb') as this prevents the
318 # conversion of newlines to the system specific default.)
319 if sys.version_info >= (3,0):
320 kwargs = {'encoding': self.encoding,
321 'errors': self.error_handler}
322 else:
323 kwargs = {}
325 try:
326 self.destination = open(self.destination_path, 'w', **kwargs)
327 except IOError, error:
328 if not self.handle_io_errors:
329 raise
330 print >>sys.stderr, '%s: %s' % (error.__class__.__name__,
331 error)
332 print >>sys.stderr, ('Unable to open destination file for writing'
333 " ('%s'). Exiting." % self.destination_path)
334 sys.exit(1)
335 self.opened = 1
337 def write(self, data):
338 """Encode `data`, write it to a single file, and return it.
340 In Python 3, a (unicode) String is returned.
342 if sys.version_info >= (3,0):
343 output = data # in py3k, write expects a (Unicode) string
344 else:
345 output = self.encode(data)
346 if not self.opened:
347 self.open()
348 try:
349 self.destination.write(output)
350 finally:
351 if self.autoclose:
352 self.close()
353 return output
355 def close(self):
356 self.destination.close()
357 self.opened = None
360 class BinaryFileOutput(FileOutput):
362 A version of docutils.io.FileOutput which writes to a binary file.
364 def open(self):
365 try:
366 self.destination = open(self.destination_path, 'wb')
367 except IOError, error:
368 if not self.handle_io_errors:
369 raise
370 print >>sys.stderr, '%s: %s' % (error.__class__.__name__,
371 error)
372 print >>sys.stderr, ('Unable to open destination file for writing '
373 "('%s'). Exiting." % self.destination_path)
374 sys.exit(1)
375 self.opened = 1
class StringInput(Input):

    """
    Input wrapper whose source is a string held in memory.
    """

    default_source_path = '<string>'

    def read(self):
        """Pass `self.source` through `self.decode` and return the result."""
        raw = self.source
        return self.decode(raw)
class StringOutput(Output):

    """
    Output wrapper that keeps the result as a string in memory.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """
        Pass `data` through `self.encode`, keep the result in
        `self.destination`, and return it.
        """
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
class NullInput(Input):

    """
    Degenerate input wrapper: there is never anything to read.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string; `self.source` is not consulted."""
        empty_text = u''
        return empty_text
class NullOutput(Output):

    """
    Degenerate output wrapper: everything written is discarded.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Ignore `data` entirely; nothing is stored or sent anywhere."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The caller hands over the document tree via the ``source`` parameter;
    no decoding takes place.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree stored in `self.source`, unchanged."""
        doctree = self.source
        return doctree