docutils/io.py

   1 # $Id$
   2 # Author: David Goodger <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 I/O classes provide a uniform API for low-level input and output.  Subclasses
   7 will exist for a variety of input/output mechanisms.
   8 """
   9
  10 __docformat__ = 'reStructuredText'
  11
  12 import sys
  13 import os
  14 import re
  15 import codecs
  16 from docutils import TransformSpec
  17 from docutils._compat import b
  18 from docutils.error_reporting import locale_encoding, ErrorString, ErrorOutput
  19
  20
  21 class InputError(IOError): pass
  22 class OutputError(IOError): pass
  23
  24 def check_encoding(stream, encoding):
  25     """Test, whether the encoding of `stream` matches `encoding`.
  26
  27     Returns
  28
  29     :None:  if `encoding` or `stream.encoding` are not a valid encoding
  30             argument (e.g. ``None``) or `stream.encoding is missing.
  31     :True:  if the encoding argument resolves to the same value as `encoding`,
  32     :False: if the encodings differ.
  33     """
  34     try:
  35         return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
  36     except (LookupError, AttributeError, TypeError):
  37         return None
  38
  39
  40 class Input(TransformSpec):
  41
  42     """
  43     Abstract base class for input wrappers.
  44     """
  45
  46     component_type = 'input'
  47
  48     default_source_path = None
  49
  50     def __init__(self, source=None, source_path=None, encoding=None,
  51                  error_handler='strict'):
  52         self.encoding = encoding
  53         """Text encoding for the input source."""
  54
  55         self.error_handler = error_handler
  56         """Text decoding error handler."""
  57
  58         self.source = source
  59         """The source of input data."""
  60
  61         self.source_path = source_path
  62         """A text reference to the source."""
  63
  64         if not source_path:
  65             self.source_path = self.default_source_path
  66
  67         self.successful_encoding = None
  68         """The encoding that successfully decoded the source data."""
  69
  70     def __repr__(self):
  71         return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
  72                                                   self.source_path)
  73
  74     def read(self):
  75         raise NotImplementedError
  76
  77     def decode(self, data):
  78         """
  79         Decode a string, `data`, heuristically.
  80         Raise UnicodeError if unsuccessful.
  81
  82         The client application should call ``locale.setlocale`` at the
  83         beginning of processing::
  84
  85             locale.setlocale(locale.LC_ALL, '')
  86         """
  87         if self.encoding and self.encoding.lower() == 'unicode':
  88             assert isinstance(data, unicode), (
  89                 'input encoding is "unicode" '
  90                 'but input is not a unicode object')
  91         if isinstance(data, unicode):
  92             # Accept unicode even if self.encoding != 'unicode'.
  93             return data
  94         if self.encoding:
  95             # We believe the user/application when the encoding is
  96             # explicitly given.
  97             encodings = [self.encoding]
  98         else:
  99             data_encoding = self.determine_encoding_from_data(data)
 100             if data_encoding:
 101                 # If the data declares its encoding (explicitly or via a BOM),
 102                 # we believe it.
 103                 encodings = [data_encoding]
 104             else:
 105                 # Apply heuristics only if no encoding is explicitly given and
 106                 # no BOM found.  Start with UTF-8, because that only matches
 107                 # data that *IS* UTF-8:
 108                 encodings = ['utf-8', 'latin-1']
 109                 if locale_encoding:
 110                     encodings.insert(1, locale_encoding)
 111         for enc in encodings:
 112             try:
 113                 decoded = unicode(data, enc, self.error_handler)
 114                 self.successful_encoding = enc
 115                 # Return decoded, removing BOMs.
 116                 return decoded.replace(u'\ufeff', u'')
 117             except (UnicodeError, LookupError), err:
 118                 error = err # in Python 3, the <exception instance> is
 119                             # local to the except clause
 120         raise UnicodeError(
 121             'Unable to decode input data.  Tried the following encodings: '
 122             '%s.\n(%s)' % (', '.join([repr(enc) for enc in encodings]),
 123                          ErrorString(error)))
 124
 125     coding_slug = re.compile(b("coding[:=]\s*([-\w.]+)"))
 126     """Encoding declaration pattern."""
 127
 128     byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
 129                         (codecs.BOM_UTF16_BE, 'utf-16-be'),
 130                         (codecs.BOM_UTF16_LE, 'utf-16-le'),)
 131     """Sequence of (start_bytes, encoding) tuples for encoding detection.
 132     The first bytes of input data are checked against the start_bytes strings.
 133     A match indicates the given encoding."""
 134
 135     def determine_encoding_from_data(self, data):
 136         """
 137         Try to determine the encoding of `data` by looking *in* `data`.
 138         Check for a byte order mark (BOM) or an encoding declaration.
 139         """
 140         # check for a byte order mark:
 141         for start_bytes, encoding in self.byte_order_marks:
 142             if data.startswith(start_bytes):
 143                 return encoding
 144         # check for an encoding declaration pattern in first 2 lines of file:
 145         for line in data.splitlines()[:2]:
 146             match = self.coding_slug.search(line)
 147             if match:
 148                 return match.group(1).decode('ascii')
 149         return None
 150
 151
 152 class Output(TransformSpec):
 153
 154     """
 155     Abstract base class for output wrappers.
 156     """
 157
 158     component_type = 'output'
 159
 160     default_destination_path = None
 161
 162     def __init__(self, destination=None, destination_path=None,
 163                  encoding=None, error_handler='strict'):
 164         self.encoding = encoding
 165         """Text encoding for the output destination."""
 166
 167         self.error_handler = error_handler or 'strict'
 168         """Text encoding error handler."""
 169
 170         self.destination = destination
 171         """The destination for output data."""
 172
 173         self.destination_path = destination_path
 174         """A text reference to the destination."""
 175
 176         if not destination_path:
 177             self.destination_path = self.default_destination_path
 178
 179     def __repr__(self):
 180         return ('%s: destination=%r, destination_path=%r'
 181                 % (self.__class__, self.destination, self.destination_path))
 182
 183     def write(self, data):
 184         """`data` is a Unicode string, to be encoded by `self.encode`."""
 185         raise NotImplementedError
 186
 187     def encode(self, data):
 188         if self.encoding and self.encoding.lower() == 'unicode':
 189             assert isinstance(data, unicode), (
 190                 'the encoding given is "unicode" but the output is not '
 191                 'a Unicode string')
 192             return data
 193         if not isinstance(data, unicode):
 194             # Non-unicode (e.g. binary) output.
 195             return data
 196         else:
 197             return data.encode(self.encoding, self.error_handler)
 198
 199
 200 class FileInput(Input):
 201
 202     """
 203     Input for single, simple file-like objects.
 204     """
 205     def __init__(self, source=None, source_path=None,
 206                  encoding=None, error_handler='strict',
 207                  autoclose=True, handle_io_errors=True, mode='rU'):
 208         """
 209         :Parameters:
 210             - `source`: either a file-like object (which is read directly), or
 211               `None` (which implies `sys.stdin` if no `source_path` given).
 212             - `source_path`: a path to a file, which is opened and then read.
 213             - `encoding`: the expected text encoding of the input file.
 214             - `error_handler`: the encoding error handler to use.
 215             - `autoclose`: close automatically after read (except when
 216               `sys.stdin` is the source).
 217             - `handle_io_errors`: summarize I/O errors here, and exit?
 218             - `mode`: how the file is to be opened (see standard function
 219               `open`). The default 'rU' provides universal newline support
 220               for text files.
 221         """
 222         Input.__init__(self, source, source_path, encoding, error_handler)
 223         self.autoclose = autoclose
 224         self.handle_io_errors = handle_io_errors
 225         self._stderr = ErrorOutput()
 226
 227         if source is None:
 228             if source_path:
 229                 # Specify encoding in Python 3
 230                 if sys.version_info >= (3,0):
 231                     kwargs = {'encoding': self.encoding,
 232                               'errors': self.error_handler}
 233                 else:
 234                     kwargs = {}
 235
 236                 try:
 237                     self.source = open(source_path, mode, **kwargs)
 238                 except IOError, error:
 239                     if handle_io_errors:
 240                         print >>self._stderr, ErrorString(error)
 241                         print >>self._stderr, (
 242                             u'Unable to open source file for reading ("%s").'
 243                             u'Exiting.' % source_path)
 244                         sys.exit(1)
 245                     raise InputError(error.errno, error.strerror, source_path)
 246             else:
 247                 self.source = sys.stdin
 248         elif (sys.version_info >= (3,0) and
 249               check_encoding(self.source, self.encoding) is False):
 250             # TODO: re-open, warn or raise error?
 251             raise UnicodeError('Encoding clash: encoding given is "%s" '
 252                                'but source is opened with encoding "%s".' %
 253                                (self.encoding, self.source.encoding))
 254         if not source_path:
 255             try:
 256                 self.source_path = self.source.name
 257             except AttributeError:
 258                 pass
 259
 260     def read(self):
 261         """
 262         Read and decode a single file and return the data (Unicode string).
 263         """
 264         try: # In Python < 2.5, try...except has to be nested in try...finally.
 265             try:
 266                 if self.source is sys.stdin and sys.version_info >= (3,0):
 267                     # read as binary data to circumvent auto-decoding
 268                     data = self.source.buffer.read()
 269                     # normalize newlines
 270                     data = b('\n').join(data.splitlines()) + b('\n')
 271                 else:
 272                     data = self.source.read()
 273             except (UnicodeError, LookupError), err: # (in Py3k read() decodes)
 274                 if not self.encoding and self.source_path:
 275                     # re-read in binary mode and decode with heuristics
 276                     b_source = open(self.source_path, 'rb')
 277                     data = b_source.read()
 278                     b_source.close()
 279                     # normalize newlines
 280                     data = b('\n').join(data.splitlines()) + b('\n')
 281                 else:
 282                     raise
 283         finally:
 284             if self.autoclose:
 285                 self.close()
 286         return self.decode(data)
 287
 288     def readlines(self):
 289         """
 290         Return lines of a single file as list of Unicode strings.
 291         """
 292         return self.read().splitlines(True)
 293
 294     def close(self):
 295         if self.source is not sys.stdin:
 296             self.source.close()
 297
 298
 299 class FileOutput(Output):
 300
 301     """
 302     Output for single, simple file-like objects.
 303     """
 304
 305     mode = 'w'
 306     """The mode argument for `open()`."""
 307     # 'wb' for binary (e.g. OpenOffice) files.
 308     # (Do not use binary mode ('wb') for text files, as this prevents the
 309     # conversion of newlines to the system specific default.)
 310
 311     def __init__(self, destination=None, destination_path=None,
 312                  encoding=None, error_handler='strict', autoclose=True,
 313                  handle_io_errors=True, mode=None):
 314         """
 315         :Parameters:
 316             - `destination`: either a file-like object (which is written
 317               directly) or `None` (which implies `sys.stdout` if no
 318               `destination_path` given).
 319             - `destination_path`: a path to a file, which is opened and then
 320               written.
 321             - `encoding`: the text encoding of the output file.
 322             - `error_handler`: the encoding error handler to use.
 323             - `autoclose`: close automatically after write (except when
 324               `sys.stdout` or `sys.stderr` is the destination).
 325             - `handle_io_errors`: summarize I/O errors here, and exit?
 326             - `mode`: how the file is to be opened (see standard function
 327               `open`). The default is 'w', providing universal newline
 328               support for text files.
 329         """
 330         Output.__init__(self, destination, destination_path,
 331                         encoding, error_handler)
 332         self.opened = True
 333         self.autoclose = autoclose
 334         self.handle_io_errors = handle_io_errors
 335         if mode is not None:
 336             self.mode = mode
 337         self._stderr = ErrorOutput()
 338         if destination is None:
 339             if destination_path:
 340                 self.opened = False
 341             else:
 342                 self.destination = sys.stdout
 343         elif (# destination is file-type object -> check mode:
 344               mode and hasattr(self.destination, 'mode')
 345               and mode != self.destination.mode):
 346                 print >>self._stderr, ('Destination mode "%s" '
 347                                'differs from specified mode "%s"' %
 348                                (self.destination.mode, mode))
 349         if not destination_path:
 350             try:
 351                 self.destination_path = self.destination.name
 352             except AttributeError:
 353                 pass
 354         # Special cases under Python 3: different encoding or binary output
 355         if sys.version_info >= (3,0):
 356             if ('b' in self.mode
 357                 and self.destination in (sys.stdout, sys.stderr)
 358                ):
 359                 self.destination = self.destination.buffer
 360             if check_encoding(self.destination, self.encoding) is False:
 361                 if self.destination in (sys.stdout, sys.stderr):
 362                     self.destination = self.destination.buffer
 363                 else:  # TODO: try the `write to .buffer` scheme instead?
 364                     raise ValueError('Encoding of %s (%s) differs \n'
 365                                      '  from specified encoding (%s)' %
 366                                      (self.destination_path or 'destination',
 367                                       destination.encoding, encoding))
 368
 369
 370     def open(self):
 371         # Specify encoding in Python 3.
 372         if sys.version_info >= (3,0):
 373             kwargs = {'encoding': self.encoding,
 374                       'errors': self.error_handler}
 375         else:
 376             kwargs = {}
 377         try:
 378             self.destination = open(self.destination_path, self.mode, **kwargs)
 379         except IOError, error:
 380             if self.handle_io_errors:
 381                 print >>self._stderr, ErrorString(error)
 382                 print >>self._stderr, (u'Unable to open destination file'
 383                     u" for writing ('%s').  Exiting." % self.destination_path)
 384                 sys.exit(1)
 385             raise OutputError(error.errno, error.strerror,
 386                               self.destination_path)
 387         self.opened = True
 388
 389     def write(self, data):
 390         """Encode `data`, write it to a single file, and return it.
 391
 392         With Python 3 or binary output mode, `data` is returned unchanged,
 393         except when specified encoding and output encoding differ.
 394         """
 395         if not self.opened:
 396             self.open()
 397         try: # In Python < 2.5, try...except has to be nested in try...finally.
 398             try:
 399                 if 'b' not in self.mode and (sys.version_info < (3,0) or
 400                    check_encoding(self.destination, self.encoding) is False):
 401                     data = self.encode(data)
 402                     if sys.version_info >= (3,0) and os.linesep != '\n':
 403                         # writing as binary data -> fix endings
 404                         data = data.replace('\n', os.linesep)
 405
 406                 self.destination.write(data)
 407
 408             except (UnicodeError, LookupError), err:
 409                 raise UnicodeError(
 410                     'Unable to encode output data. output-encoding is: '
 411                     '%s.\n(%s)' % (self.encoding, ErrorString(err)))
 412         finally:
 413             if self.autoclose:
 414                 self.close()
 415         return data
 416
 417     def close(self):
 418         if self.destination not in (sys.stdout, sys.stderr):
 419             self.destination.close()
 420             self.opened = False
 421
 422
 423 class BinaryFileOutput(FileOutput):
 424     """
 425     A version of docutils.io.FileOutput which writes to a binary file.
 426     """
 427     # Used by core.publish_cmdline_to_binary() which in turn is used by
 428     # rst2odt (OpenOffice writer)
 429     mode = 'wb'
 430
 431
 432 class StringInput(Input):
 433
 434     """
 435     Direct string input.
 436     """
 437
 438     default_source_path = '<string>'
 439
 440     def read(self):
 441         """Decode and return the source string."""
 442         return self.decode(self.source)
 443
 444
 445 class StringOutput(Output):
 446
 447     """
 448     Direct string output.
 449     """
 450
 451     default_destination_path = '<string>'
 452
 453     def write(self, data):
 454         """Encode `data`, store it in `self.destination`, and return it."""
 455         self.destination = self.encode(data)
 456         return self.destination
 457
 458
 459 class NullInput(Input):
 460
 461     """
 462     Degenerate input: read nothing.
 463     """
 464
 465     default_source_path = 'null input'
 466
 467     def read(self):
 468         """Return a null string."""
 469         return u''
 470
 471
 472 class NullOutput(Output):
 473
 474     """
 475     Degenerate output: write nothing.
 476     """
 477
 478     default_destination_path = 'null output'
 479
 480     def write(self, data):
 481         """Do nothing ([don't even] send data to the bit bucket)."""
 482         pass
 483
 484
 485 class DocTreeInput(Input):
 486
 487     """
 488     Adapter for document tree input.
 489
 490     The document tree must be passed in the ``source`` parameter.
 491     """
 492
 493     default_source_path = 'doctree input'
 494
 495     def read(self):
 496         """Return the document tree."""
 497         return self.source