lib/cherrypy/test/test_encoding.py

   1
   2 import gzip
   3 import sys
   4
   5 import cherrypy
   6 from cherrypy._cpcompat import BytesIO, IncompleteRead, ntob, ntou
   7
   8 europoundUnicode = ntou('\x80\xa3')
   9 sing = u"\u6bdb\u6cfd\u4e1c: Sing, Little Birdie?"
  10 sing8 = sing.encode('utf-8')
  11 sing16 = sing.encode('utf-16')
  12
  13
  14 from cherrypy.test import helper
  15
  16
  17 class EncodingTests(helper.CPWebCase):
  18
  19     def setup_server():
  20         class Root:
  21             def index(self, param):
  22                 assert param == europoundUnicode, "%r != %r" % (param, europoundUnicode)
  23                 yield europoundUnicode
  24             index.exposed = True
  25
  26             def mao_zedong(self):
  27                 return sing
  28             mao_zedong.exposed = True
  29
  30             def utf8(self):
  31                 return sing8
  32             utf8.exposed = True
  33             utf8._cp_config = {'tools.encode.encoding': 'utf-8'}
  34
  35             def cookies_and_headers(self):
  36                 # if the headers have non-ascii characters and a cookie has
  37                 #  any part which is unicode (even ascii), the response
  38                 #  should not fail.
  39                 cherrypy.response.cookie['candy'] = 'bar'
  40                 cherrypy.response.cookie['candy']['domain'] = 'cherrypy.org'
  41                 cherrypy.response.headers['Some-Header'] = 'My d\xc3\xb6g has fleas'
  42                 return 'Any content'
  43             cookies_and_headers.exposed = True
  44
  45             def reqparams(self, *args, **kwargs):
  46                 return ntob(', ').join([": ".join((k, v)).encode('utf8')
  47                                   for k, v in cherrypy.request.params.items()])
  48             reqparams.exposed = True
  49
  50             def nontext(self, *args, **kwargs):
  51                 cherrypy.response.headers['Content-Type'] = 'application/binary'
  52                 return '\x00\x01\x02\x03'
  53             nontext.exposed = True
  54             nontext._cp_config = {'tools.encode.text_only': False,
  55                                   'tools.encode.add_charset': True,
  56                                   }
  57
  58         class GZIP:
  59             def index(self):
  60                 yield "Hello, world"
  61             index.exposed = True
  62
  63             def noshow(self):
  64                 # Test for ticket #147, where yield showed no exceptions (content-
  65                 # encoding was still gzip even though traceback wasn't zipped).
  66                 raise IndexError()
  67                 yield "Here be dragons"
  68             noshow.exposed = True
  69             # Turn encoding off so the gzip tool is the one doing the collapse.
  70             noshow._cp_config = {'tools.encode.on': False}
  71
  72             def noshow_stream(self):
  73                 # Test for ticket #147, where yield showed no exceptions (content-
  74                 # encoding was still gzip even though traceback wasn't zipped).
  75                 raise IndexError()
  76                 yield "Here be dragons"
  77             noshow_stream.exposed = True
  78             noshow_stream._cp_config = {'response.stream': True}
  79
  80         class Decode:
  81             def extra_charset(self, *args, **kwargs):
  82                 return ', '.join([": ".join((k, v))
  83                                   for k, v in cherrypy.request.params.items()])
  84             extra_charset.exposed = True
  85             extra_charset._cp_config = {
  86                 'tools.decode.on': True,
  87                 'tools.decode.default_encoding': ['utf-16'],
  88                 }
  89
  90             def force_charset(self, *args, **kwargs):
  91                 return ', '.join([": ".join((k, v))
  92                                   for k, v in cherrypy.request.params.items()])
  93             force_charset.exposed = True
  94             force_charset._cp_config = {
  95                 'tools.decode.on': True,
  96                 'tools.decode.encoding': 'utf-16',
  97                 }
  98
  99         root = Root()
 100         root.gzip = GZIP()
 101         root.decode = Decode()
 102         cherrypy.tree.mount(root, config={'/gzip': {'tools.gzip.on': True}})
 103     setup_server = staticmethod(setup_server)
 104
 105     def test_query_string_decoding(self):
 106         europoundUtf8 = europoundUnicode.encode('utf-8')
 107         self.getPage(ntob('/?param=') + europoundUtf8)
 108         self.assertBody(europoundUtf8)
 109
 110         # Encoded utf8 query strings MUST be parsed correctly.
 111         # Here, q is the POUND SIGN U+00A3 encoded in utf8 and then %HEX
 112         self.getPage("/reqparams?q=%C2%A3")
 113         # The return value will be encoded as utf8.
 114         self.assertBody(ntob("q: \xc2\xa3"))
 115
 116         # Query strings that are incorrectly encoded MUST raise 404.
 117         # Here, q is the POUND SIGN U+00A3 encoded in latin1 and then %HEX
 118         self.getPage("/reqparams?q=%A3")
 119         self.assertStatus(404)
 120         self.assertErrorPage(404,
 121             "The given query string could not be processed. Query "
 122             "strings for this resource must be encoded with 'utf8'.")
 123
 124     def test_urlencoded_decoding(self):
 125         # Test the decoding of an application/x-www-form-urlencoded entity.
 126         europoundUtf8 = europoundUnicode.encode('utf-8')
 127         body=ntob("param=") + europoundUtf8
 128         self.getPage('/', method='POST',
 129                      headers=[("Content-Type", "application/x-www-form-urlencoded"),
 130                               ("Content-Length", str(len(body))),
 131                               ],
 132                      body=body),
 133         self.assertBody(europoundUtf8)
 134
 135         # Encoded utf8 entities MUST be parsed and decoded correctly.
 136         # Here, q is the POUND SIGN U+00A3 encoded in utf8
 137         body = ntob("q=\xc2\xa3")
 138         self.getPage('/reqparams', method='POST',
 139                      headers=[("Content-Type", "application/x-www-form-urlencoded"),
 140                               ("Content-Length", str(len(body))),
 141                               ],
 142                      body=body),
 143         self.assertBody(ntob("q: \xc2\xa3"))
 144
 145         # ...and in utf16, which is not in the default attempt_charsets list:
 146         body = ntob("\xff\xfeq\x00=\xff\xfe\xa3\x00")
 147         self.getPage('/reqparams', method='POST',
 148                      headers=[("Content-Type", "application/x-www-form-urlencoded;charset=utf-16"),
 149                               ("Content-Length", str(len(body))),
 150                               ],
 151                      body=body),
 152         self.assertBody(ntob("q: \xc2\xa3"))
 153
 154         # Entities that are incorrectly encoded MUST raise 400.
 155         # Here, q is the POUND SIGN U+00A3 encoded in utf16, but
 156         # the Content-Type incorrectly labels it utf-8.
 157         body = ntob("\xff\xfeq\x00=\xff\xfe\xa3\x00")
 158         self.getPage('/reqparams', method='POST',
 159                      headers=[("Content-Type", "application/x-www-form-urlencoded;charset=utf-8"),
 160                               ("Content-Length", str(len(body))),
 161                               ],
 162                      body=body),
 163         self.assertStatus(400)
 164         self.assertErrorPage(400,
 165             "The request entity could not be decoded. The following charsets "
 166             "were attempted: ['utf-8']")
 167
 168     def test_decode_tool(self):
 169         # An extra charset should be tried first, and succeed if it matches.
 170         # Here, we add utf-16 as a charset and pass a utf-16 body.
 171         body = ntob("\xff\xfeq\x00=\xff\xfe\xa3\x00")
 172         self.getPage('/decode/extra_charset', method='POST',
 173                      headers=[("Content-Type", "application/x-www-form-urlencoded"),
 174                               ("Content-Length", str(len(body))),
 175                               ],
 176                      body=body),
 177         self.assertBody(ntob("q: \xc2\xa3"))
 178
 179         # An extra charset should be tried first, and continue to other default
 180         # charsets if it doesn't match.
 181         # Here, we add utf-16 as a charset but still pass a utf-8 body.
 182         body = ntob("q=\xc2\xa3")
 183         self.getPage('/decode/extra_charset', method='POST',
 184                      headers=[("Content-Type", "application/x-www-form-urlencoded"),
 185                               ("Content-Length", str(len(body))),
 186                               ],
 187                      body=body),
 188         self.assertBody(ntob("q: \xc2\xa3"))
 189
 190         # An extra charset should error if force is True and it doesn't match.
 191         # Here, we force utf-16 as a charset but still pass a utf-8 body.
 192         body = ntob("q=\xc2\xa3")
 193         self.getPage('/decode/force_charset', method='POST',
 194                      headers=[("Content-Type", "application/x-www-form-urlencoded"),
 195                               ("Content-Length", str(len(body))),
 196                               ],
 197                      body=body),
 198         self.assertErrorPage(400,
 199             "The request entity could not be decoded. The following charsets "
 200             "were attempted: ['utf-16']")
 201
 202     def test_multipart_decoding(self):
 203         # Test the decoding of a multipart entity when the charset (utf16) is
 204         # explicitly given.
 205         body=ntob('\r\n'.join(['--X',
 206                                'Content-Type: text/plain;charset=utf-16',
 207                                'Content-Disposition: form-data; name="text"',
 208                                '',
 209                                '\xff\xfea\x00b\x00\x1c c\x00',
 210                                '--X',
 211                                'Content-Type: text/plain;charset=utf-16',
 212                                'Content-Disposition: form-data; name="submit"',
 213                                '',
 214                                '\xff\xfeC\x00r\x00e\x00a\x00t\x00e\x00',
 215                                '--X--']))
 216         self.getPage('/reqparams', method='POST',
 217                      headers=[("Content-Type", "multipart/form-data;boundary=X"),
 218                               ("Content-Length", str(len(body))),
 219                               ],
 220                      body=body),
 221         self.assertBody(ntob("text: ab\xe2\x80\x9cc, submit: Create"))
 222
 223     def test_multipart_decoding_no_charset(self):
 224         # Test the decoding of a multipart entity when the charset (utf8) is
 225         # NOT explicitly given, but is in the list of charsets to attempt.
 226         body=ntob('\r\n'.join(['--X',
 227                                'Content-Disposition: form-data; name="text"',
 228                                '',
 229                                '\xe2\x80\x9c',
 230                                '--X',
 231                                'Content-Disposition: form-data; name="submit"',
 232                                '',
 233                                'Create',
 234                                '--X--']))
 235         self.getPage('/reqparams', method='POST',
 236                      headers=[("Content-Type", "multipart/form-data;boundary=X"),
 237                               ("Content-Length", str(len(body))),
 238                               ],
 239                      body=body),
 240         self.assertBody(ntob("text: \xe2\x80\x9c, submit: Create"))
 241
 242     def test_multipart_decoding_no_successful_charset(self):
 243         # Test the decoding of a multipart entity when the charset (utf16) is
 244         # NOT explicitly given, and is NOT in the list of charsets to attempt.
 245         body=ntob('\r\n'.join(['--X',
 246                                'Content-Disposition: form-data; name="text"',
 247                                '',
 248                                '\xff\xfea\x00b\x00\x1c c\x00',
 249                                '--X',
 250                                'Content-Disposition: form-data; name="submit"',
 251                                '',
 252                                '\xff\xfeC\x00r\x00e\x00a\x00t\x00e\x00',
 253                                '--X--']))
 254         self.getPage('/reqparams', method='POST',
 255                      headers=[("Content-Type", "multipart/form-data;boundary=X"),
 256                               ("Content-Length", str(len(body))),
 257                               ],
 258                      body=body),
 259         self.assertStatus(400)
 260         self.assertErrorPage(400,
 261             "The request entity could not be decoded. The following charsets "
 262             "were attempted: ['us-ascii', 'utf-8']")
 263
 264     def test_nontext(self):
 265         self.getPage('/nontext')
 266         self.assertHeader('Content-Type', 'application/binary;charset=utf-8')
 267         self.assertBody('\x00\x01\x02\x03')
 268
 269     def testEncoding(self):
 270         # Default encoding should be utf-8
 271         self.getPage('/mao_zedong')
 272         self.assertBody(sing8)
 273
 274         # Ask for utf-16.
 275         self.getPage('/mao_zedong', [('Accept-Charset', 'utf-16')])
 276         self.assertHeader('Content-Type', 'text/html;charset=utf-16')
 277         self.assertBody(sing16)
 278
 279         # Ask for multiple encodings. ISO-8859-1 should fail, and utf-16
 280         # should be produced.
 281         self.getPage('/mao_zedong', [('Accept-Charset',
 282                                       'iso-8859-1;q=1, utf-16;q=0.5')])
 283         self.assertBody(sing16)
 284
 285         # The "*" value should default to our default_encoding, utf-8
 286         self.getPage('/mao_zedong', [('Accept-Charset', '*;q=1, utf-7;q=.2')])
 287         self.assertBody(sing8)
 288
 289         # Only allow iso-8859-1, which should fail and raise 406.
 290         self.getPage('/mao_zedong', [('Accept-Charset', 'iso-8859-1, *;q=0')])
 291         self.assertStatus("406 Not Acceptable")
 292         self.assertInBody("Your client sent this Accept-Charset header: "
 293                           "iso-8859-1, *;q=0. We tried these charsets: "
 294                           "iso-8859-1.")
 295
 296         # Ask for x-mac-ce, which should be unknown. See ticket #569.
 297         self.getPage('/mao_zedong', [('Accept-Charset',
 298                                       'us-ascii, ISO-8859-1, x-mac-ce')])
 299         self.assertStatus("406 Not Acceptable")
 300         self.assertInBody("Your client sent this Accept-Charset header: "
 301                           "us-ascii, ISO-8859-1, x-mac-ce. We tried these "
 302                           "charsets: ISO-8859-1, us-ascii, x-mac-ce.")
 303
 304         # Test the 'encoding' arg to encode.
 305         self.getPage('/utf8')
 306         self.assertBody(sing8)
 307         self.getPage('/utf8', [('Accept-Charset', 'us-ascii, ISO-8859-1')])
 308         self.assertStatus("406 Not Acceptable")
 309
 310     def testGzip(self):
 311         zbuf = BytesIO()
 312         zfile = gzip.GzipFile(mode='wb', fileobj=zbuf, compresslevel=9)
 313         zfile.write(ntob("Hello, world"))
 314         zfile.close()
 315
 316         self.getPage('/gzip/', headers=[("Accept-Encoding", "gzip")])
 317         self.assertInBody(zbuf.getvalue()[:3])
 318         self.assertHeader("Vary", "Accept-Encoding")
 319         self.assertHeader("Content-Encoding", "gzip")
 320
 321         # Test when gzip is denied.
 322         self.getPage('/gzip/', headers=[("Accept-Encoding", "identity")])
 323         self.assertHeader("Vary", "Accept-Encoding")
 324         self.assertNoHeader("Content-Encoding")
 325         self.assertBody("Hello, world")
 326
 327         self.getPage('/gzip/', headers=[("Accept-Encoding", "gzip;q=0")])
 328         self.assertHeader("Vary", "Accept-Encoding")
 329         self.assertNoHeader("Content-Encoding")
 330         self.assertBody("Hello, world")
 331
 332         self.getPage('/gzip/', headers=[("Accept-Encoding", "*;q=0")])
 333         self.assertStatus(406)
 334         self.assertNoHeader("Content-Encoding")
 335         self.assertErrorPage(406, "identity, gzip")
 336
 337         # Test for ticket #147
 338         self.getPage('/gzip/noshow', headers=[("Accept-Encoding", "gzip")])
 339         self.assertNoHeader('Content-Encoding')
 340         self.assertStatus(500)
 341         self.assertErrorPage(500, pattern="IndexError\n")
 342
 343         # In this case, there's nothing we can do to deliver a
 344         # readable page, since 1) the gzip header is already set,
 345         # and 2) we may have already written some of the body.
 346         # The fix is to never stream yields when using gzip.
 347         if (cherrypy.server.protocol_version == "HTTP/1.0" or
 348             getattr(cherrypy.server, "using_apache", False)):
 349             self.getPage('/gzip/noshow_stream',
 350                          headers=[("Accept-Encoding", "gzip")])
 351             self.assertHeader('Content-Encoding', 'gzip')
 352             self.assertInBody('\x1f\x8b\x08\x00')
 353         else:
 354             # The wsgiserver will simply stop sending data, and the HTTP client
 355             # will error due to an incomplete chunk-encoded stream.
 356             self.assertRaises((ValueError, IncompleteRead), self.getPage,
 357                               '/gzip/noshow_stream',
 358                               headers=[("Accept-Encoding", "gzip")])
 359
 360     def test_UnicodeHeaders(self):
 361         self.getPage('/cookies_and_headers')
 362         self.assertBody('Any content')
 363