docutils/test/test_io.py

   1 #! /usr/bin/env python3
   2
   3 # $Id$
   4 # Author: Lea Wiemann <LeWiemann@gmail.com>
   5 # Copyright: This module has been placed in the public domain.
   6
   7 """
   8 Test module for `docutils.io`.
   9 """
  10
  11 import codecs
  12 import locale
  13 from io import StringIO, BytesIO
  14 import os.path
  15 from pathlib import Path
  16 import sys
  17 import unittest
  18
if __name__ == '__main__':
    # Only when this file is executed directly as a script:
    # prepend the "docutils root" to the Python library path
    # so we import the local `docutils` package.
    # `parents[1]` of the resolved file path is the parent of the
    # directory that contains this file.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
  23
  24 from docutils import io as du_io
  25
# DATA_ROOT is ./test/data/ from the docutils root
# (absolute path of the "data" directory next to this file).
DATA_ROOT = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')

# normalize the preferred encoding's name:
# `codecs.lookup(...).name` is the codec's canonical name, so aliases of
# the locale encoding compare equal in the tests below.
# `do_setlocale=False` is passed so the query does not ask for the locale
# to be changed.
preferredencoding = codecs.lookup(
    locale.getpreferredencoding(do_setlocale=False)).name
  32
  33
  34 # Stub: Buffer with 'strict' auto-conversion of input to byte string:
  35 class BBuf(BytesIO):
  36     def write(self, data):
  37         if isinstance(data, str):
  38             data.encode('ascii', 'strict')
  39         super().write(data)
  40
  41
  42 # Stub: Buffer expecting unicode string:
  43 class UBuf(StringIO):
  44     def write(self, data):
  45         # emulate Python 3 handling of stdout, stderr
  46         if isinstance(data, bytes):
  47             raise TypeError('must be unicode, not bytes')
  48         super().write(data)
  49
  50
  51 class mock_stdout(UBuf):
  52     encoding = 'utf-8'
  53
  54     def __init__(self):
  55         self.buffer = BBuf()
  56         super().__init__()
  57
  58
  59 class HelperTests(unittest.TestCase):
  60
  61     def test_check_encoding_true(self):
  62         """Return `True` if lookup returns the same codec"""
  63         self.assertEqual(True, du_io.check_encoding(mock_stdout, 'utf-8'))
  64         self.assertEqual(True, du_io.check_encoding(mock_stdout, 'utf_8'))
  65         self.assertEqual(True, du_io.check_encoding(mock_stdout, 'utf8'))
  66         self.assertEqual(True, du_io.check_encoding(mock_stdout, 'UTF-8'))
  67
  68     def test_check_encoding_false(self):
  69         """Return `False` if lookup returns different codecs"""
  70         self.assertEqual(False, du_io.check_encoding(mock_stdout, 'ascii'))
  71         self.assertEqual(False, du_io.check_encoding(mock_stdout, 'latin-1'))
  72
  73     def test_check_encoding_none(self):
  74         """Cases where the comparison fails."""
  75         # stream.encoding is None:
  76         self.assertEqual(None,
  77                          du_io.check_encoding(du_io.FileInput(), 'ascii'))
  78         # stream.encoding does not exist:
  79         self.assertEqual(None, du_io.check_encoding(BBuf, 'ascii'))
  80         # encoding is None or empty string:
  81         self.assertEqual(None, du_io.check_encoding(mock_stdout, None))
  82         self.assertEqual(None, du_io.check_encoding(mock_stdout, ''))
  83         # encoding is invalid
  84         self.assertEqual(None, du_io.check_encoding(mock_stdout, 'UTF-9'))
  85
  86     def test_error_string(self):
  87         us = '\xfc'       # bytes(us) fails
  88         bs = b'\xc3\xbc'  # str(bs) returns repr(bs)
  89
  90         self.assertEqual('Exception: spam',
  91                          du_io.error_string(Exception('spam')))
  92         self.assertEqual('IndexError: ' + str(bs),
  93                          du_io.error_string(IndexError(bs)))
  94         self.assertEqual('ImportError: %s' % us,
  95                          du_io.error_string(ImportError(us)))
  96
  97
  98 class InputTests(unittest.TestCase):
  99
 100     def test_bom_handling(self):
 101         # Provisional:
 102         # default input encoding will change to UTF-8 in Docutils 0.22
 103         source = '\ufeffdata\n\ufeff blah\n'
 104         expected = 'data\n\ufeff blah\n'  # only leading ZWNBSP removed
 105         input = du_io.StringInput(source=source.encode('utf-16-be'))
 106         self.assertEqual(expected, input.read())
 107         input = du_io.StringInput(source=source.encode('utf-16-le'))
 108         self.assertEqual(expected, input.read())
 109         input = du_io.StringInput(source=source.encode('utf-8'))
 110         self.assertEqual(expected, input.read())
 111         # With `str` input all ZWNBSPs are still there.
 112         input = du_io.StringInput(source=source)
 113         self.assertEqual(source, input.read())
 114
 115     def test_encoding_declaration(self):
 116         input = du_io.StringInput(source=b"""\
 117 .. -*- coding: ascii -*-
 118 data
 119 blah
 120 """)
 121         data = input.read()  # noqa: F841
 122         self.assertEqual('ascii', input.successful_encoding)
 123         input = du_io.StringInput(source=b"""\
 124 #! python
 125 # -*- coding: ascii -*-
 126 print("hello world")
 127 """)
 128         data = input.read()  # noqa: F841
 129         self.assertEqual('ascii', input.successful_encoding)
 130         input = du_io.StringInput(source=b"""\
 131 #! python
 132 # extraneous comment; prevents coding slug from being read
 133 # -*- coding: ascii -*-
 134 print("hello world")
 135 """)
 136         self.assertNotEqual(input.successful_encoding, 'ascii')
 137
 138     def test_decode_unicode(self):
 139         # With the special value "unicode" or "Unicode":
 140         uniinput = du_io.Input(encoding='unicode')
 141         # keep unicode instances as-is
 142         self.assertEqual('ja', uniinput.decode('ja'))
 143         # raise AssertionError if data is not a `str` instance
 144         with self.assertRaises(AssertionError):
 145             uniinput.decode(b'ja')
 146
 147
 148 class OutputTests(unittest.TestCase):
 149
 150     bdata = b'\xfc'
 151     udata = '\xfc'
 152
 153     def setUp(self):
 154         self.bdrain = BBuf()
 155         """Buffer accepting binary strings (bytes)"""
 156         self.udrain = UBuf()
 157         """Buffer accepting unicode strings"""
 158         self.mock_stdout = mock_stdout()
 159         """Stub of sys.stdout under Python 3"""
 160
 161     def test_write_unicode(self):
 162         fo = du_io.FileOutput(destination=self.udrain, encoding='unicode',
 163                               autoclose=False)
 164         fo.write(self.udata)
 165         self.assertEqual(self.udata, self.udrain.getvalue())
 166
 167     def test_write_utf8(self):
 168         fo = du_io.FileOutput(destination=self.udrain, encoding='utf-8',
 169                               autoclose=False)
 170         fo.write(self.udata)
 171         self.assertEqual(self.udata, self.udrain.getvalue())
 172
 173     def test_FileOutput_hande_io_errors_deprection_warning(self):
 174         with self.assertWarnsRegex(DeprecationWarning,
 175                                    '"handle_io_errors" is ignored'):
 176             du_io.FileOutput(handle_io_errors=True)
 177
 178     # With destination in binary mode, data must be binary string
 179     # and is written as-is:
 180     def test_write_bytes(self):
 181         fo = du_io.FileOutput(destination=self.bdrain, encoding='utf-8',
 182                               mode='wb', autoclose=False)
 183         fo.write(self.bdata)
 184         self.assertEqual(self.bdata, self.bdrain.getvalue())
 185
 186     def test_write_bytes_to_stdout(self):
 187         # try writing data to `destination.buffer`, if data is
 188         # instance of `bytes` and writing to `destination` fails:
 189         fo = du_io.FileOutput(destination=self.mock_stdout)
 190         fo.write(self.bdata)
 191         self.assertEqual(self.bdata,
 192                          self.mock_stdout.buffer.getvalue())
 193
 194     def test_encoding_clash_resolved(self):
 195         fo = du_io.FileOutput(destination=self.mock_stdout,
 196                               encoding='latin1', autoclose=False)
 197         fo.write(self.udata)
 198         self.assertEqual(self.udata.encode('latin1'),
 199                          self.mock_stdout.buffer.getvalue())
 200
 201     def test_encoding_clash_nonresolvable(self):
 202         del self.mock_stdout.buffer
 203         fo = du_io.FileOutput(destination=self.mock_stdout,
 204                               encoding='latin1', autoclose=False)
 205         self.assertRaises(ValueError, fo.write, self.udata)
 206
 207
 208 class ErrorOutputTests(unittest.TestCase):
 209     def test_defaults(self):
 210         e = du_io.ErrorOutput()
 211         self.assertEqual(sys.stderr, e.destination)
 212
 213     def test_bbuf(self):
 214         buf = BBuf()  # buffer storing byte string
 215         e = du_io.ErrorOutput(buf, encoding='ascii')
 216         # write byte-string as-is
 217         e.write(b'b\xfc')
 218         self.assertEqual(b'b\xfc', buf.getvalue())
 219         # encode unicode data with backslashescape fallback replacement:
 220         e.write(' u\xfc')
 221         self.assertEqual(b'b\xfc u\\xfc', buf.getvalue())
 222         # handle Exceptions with Unicode string args
 223         # unicode(Exception('e\xfc')) # fails in Python < 2.6
 224         e.write(AttributeError(' e\xfc'))
 225         self.assertEqual(b'b\xfc u\\xfc e\\xfc', buf.getvalue())
 226         # encode with `encoding` attribute
 227         e.encoding = 'utf-8'
 228         e.write(' u\xfc')
 229         self.assertEqual(b'b\xfc u\\xfc e\\xfc u\xc3\xbc', buf.getvalue())
 230
 231     def test_ubuf(self):
 232         buf = UBuf()  # buffer only accepting unicode string
 233         # decode of binary strings
 234         e = du_io.ErrorOutput(buf, encoding='ascii')
 235         e.write(b'b\xfc')
 236         # use REPLACEMENT CHARACTER
 237         self.assertEqual(buf.getvalue(), 'b\ufffd')
 238         # write Unicode string and Exceptions with Unicode args
 239         e.write(' u\xfc')
 240         self.assertEqual(buf.getvalue(), 'b\ufffd u\xfc')
 241         e.write(AttributeError(' e\xfc'))
 242         self.assertEqual(buf.getvalue(), 'b\ufffd u\xfc e\xfc')
 243         # decode with `encoding` attribute
 244         e.encoding = 'latin1'
 245         e.write(b' b\xfc')
 246         self.assertEqual(buf.getvalue(), 'b\ufffd u\xfc e\xfc b\xfc')
 247
 248
 249 class FileInputTests(unittest.TestCase):
 250
 251     # test input encoding auto-detection:
 252     #
 253     # Up to Docutils 0.18, auto-detection was not used under Python 3
 254     # unless reading a file with Python's default encoding failed
 255
 256     def test_bom_utf_8(self):
 257         """Drop optional BOM from utf-8 encoded files.
 258         """
 259         source = du_io.FileInput(
 260             source_path=os.path.join(DATA_ROOT, 'utf-8-sig.txt'))
 261         self.assertTrue(source.read().startswith('Grüße'))
 262
 263     def test_bom_utf_16(self):
 264         """Drop BOM from utf-16 encoded files, use correct encoding.
 265         """
 266         # Assert correct decoding, BOM is gone.
 267         source = du_io.FileInput(
 268             source_path=os.path.join(DATA_ROOT, 'utf-16-le-sig.txt'))
 269         self.assertTrue(source.read().startswith('Grüße'))
 270
 271     def test_coding_slug(self):
 272         """Use self-declared encoding.
 273         """
 274         source = du_io.FileInput(
 275             source_path=os.path.join(DATA_ROOT, 'latin2.txt'))
 276         self.assertTrue(source.read().endswith('škoda\n'))
 277
 278     def test_fallback_utf8(self):
 279         """Try 'utf-8', if encoding is not specified in the source."""
 280         source = du_io.FileInput(
 281             source_path=os.path.join(DATA_ROOT, 'utf8.txt'))
 282         self.assertEqual('Grüße\n', source.read())
 283
 284     @unittest.skipIf(preferredencoding in (None, 'ascii', 'utf-8'),
 285                      'locale encoding not set or UTF-8')
 286     def test_fallback_no_utf8(self):
 287         # If  no encoding is given and decoding with 'utf-8' fails,
 288         # use the locale's preferred encoding (if not None).
 289         # Provisional: the default will become 'utf-8'
 290         # (without auto-detection and fallback) in Docutils 0.22.
 291         source = du_io.FileInput(
 292             source_path=os.path.join(DATA_ROOT, 'latin1.txt'))
 293         data = source.read()
 294         successful_encoding = codecs.lookup(source.successful_encoding).name
 295         self.assertEqual(preferredencoding, successful_encoding)
 296         if successful_encoding == 'iso8859-1':
 297             self.assertEqual('Grüße\n', data)
 298
 299     def test_readlines(self):
 300         source = du_io.FileInput(
 301             source_path=os.path.join(DATA_ROOT, 'include.txt'))
 302         data = source.readlines()
 303         self.assertEqual(['Some include text.\n'], data)
 304
 305
# Run the tests of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()