lib/cherrypy/lib/encoding.py

   1 import struct
   2 import time
   3
   4 import cherrypy
   5 from cherrypy._cpcompat import basestring, BytesIO, ntob, set, unicodestr
   6 from cherrypy.lib import file_generator
   7 from cherrypy.lib import set_vary_header
   8
   9
  10 def decode(encoding=None, default_encoding='utf-8'):
  11     """Replace or extend the list of charsets used to decode a request entity.
  12
  13     Either argument may be a single string or a list of strings.
  14
  15     encoding
  16         If not None, restricts the set of charsets attempted while decoding
  17         a request entity to the given set (even if a different charset is given in
  18         the Content-Type request header).
  19
  20     default_encoding
  21         Only in effect if the 'encoding' argument is not given.
  22         If given, the set of charsets attempted while decoding a request entity is
  23         *extended* with the given value(s).
  24
  25     """
  26     body = cherrypy.request.body
  27     if encoding is not None:
  28         if not isinstance(encoding, list):
  29             encoding = [encoding]
  30         body.attempt_charsets = encoding
  31     elif default_encoding:
  32         if not isinstance(default_encoding, list):
  33             default_encoding = [default_encoding]
  34         body.attempt_charsets = body.attempt_charsets + default_encoding
  35
  36
  37 class ResponseEncoder:
  38
  39     default_encoding = 'utf-8'
  40     failmsg = "Response body could not be encoded with %r."
  41     encoding = None
  42     errors = 'strict'
  43     text_only = True
  44     add_charset = True
  45     debug = False
  46
  47     def __init__(self, **kwargs):
  48         for k, v in kwargs.items():
  49             setattr(self, k, v)
  50
  51         self.attempted_charsets = set()
  52         request = cherrypy.serving.request
  53         if request.handler is not None:
  54             # Replace request.handler with self
  55             if self.debug:
  56                 cherrypy.log('Replacing request.handler', 'TOOLS.ENCODE')
  57             self.oldhandler = request.handler
  58             request.handler = self
  59
  60     def encode_stream(self, encoding):
  61         """Encode a streaming response body.
  62
  63         Use a generator wrapper, and just pray it works as the stream is
  64         being written out.
  65         """
  66         if encoding in self.attempted_charsets:
  67             return False
  68         self.attempted_charsets.add(encoding)
  69
  70         def encoder(body):
  71             for chunk in body:
  72                 if isinstance(chunk, unicodestr):
  73                     chunk = chunk.encode(encoding, self.errors)
  74                 yield chunk
  75         self.body = encoder(self.body)
  76         return True
  77
  78     def encode_string(self, encoding):
  79         """Encode a buffered response body."""
  80         if encoding in self.attempted_charsets:
  81             return False
  82         self.attempted_charsets.add(encoding)
  83
  84         try:
  85             body = []
  86             for chunk in self.body:
  87                 if isinstance(chunk, unicodestr):
  88                     chunk = chunk.encode(encoding, self.errors)
  89                 body.append(chunk)
  90             self.body = body
  91         except (LookupError, UnicodeError):
  92             return False
  93         else:
  94             return True
  95
  96     def find_acceptable_charset(self):
  97         request = cherrypy.serving.request
  98         response = cherrypy.serving.response
  99
 100         if self.debug:
 101             cherrypy.log('response.stream %r' % response.stream, 'TOOLS.ENCODE')
 102         if response.stream:
 103             encoder = self.encode_stream
 104         else:
 105             encoder = self.encode_string
 106             if "Content-Length" in response.headers:
 107                 # Delete Content-Length header so finalize() recalcs it.
 108                 # Encoded strings may be of different lengths from their
 109                 # unicode equivalents, and even from each other. For example:
 110                 # >>> t = u"\u7007\u3040"
 111                 # >>> len(t)
 112                 # 2
 113                 # >>> len(t.encode("UTF-8"))
 114                 # 6
 115                 # >>> len(t.encode("utf7"))
 116                 # 8
 117                 del response.headers["Content-Length"]
 118
 119         # Parse the Accept-Charset request header, and try to provide one
 120         # of the requested charsets (in order of user preference).
 121         encs = request.headers.elements('Accept-Charset')
 122         charsets = [enc.value.lower() for enc in encs]
 123         if self.debug:
 124             cherrypy.log('charsets %s' % repr(charsets), 'TOOLS.ENCODE')
 125
 126         if self.encoding is not None:
 127             # If specified, force this encoding to be used, or fail.
 128             encoding = self.encoding.lower()
 129             if self.debug:
 130                 cherrypy.log('Specified encoding %r' % encoding, 'TOOLS.ENCODE')
 131             if (not charsets) or "*" in charsets or encoding in charsets:
 132                 if self.debug:
 133                     cherrypy.log('Attempting encoding %r' % encoding, 'TOOLS.ENCODE')
 134                 if encoder(encoding):
 135                     return encoding
 136         else:
 137             if not encs:
 138                 if self.debug:
 139                     cherrypy.log('Attempting default encoding %r' %
 140                                  self.default_encoding, 'TOOLS.ENCODE')
 141                 # Any character-set is acceptable.
 142                 if encoder(self.default_encoding):
 143                     return self.default_encoding
 144                 else:
 145                     raise cherrypy.HTTPError(500, self.failmsg % self.default_encoding)
 146             else:
 147                 for element in encs:
 148                     if element.qvalue > 0:
 149                         if element.value == "*":
 150                             # Matches any charset. Try our default.
 151                             if self.debug:
 152                                 cherrypy.log('Attempting default encoding due '
 153                                              'to %r' % element, 'TOOLS.ENCODE')
 154                             if encoder(self.default_encoding):
 155                                 return self.default_encoding
 156                         else:
 157                             encoding = element.value
 158                             if self.debug:
 159                                 cherrypy.log('Attempting encoding %s (qvalue >'
 160                                              '0)' % element, 'TOOLS.ENCODE')
 161                             if encoder(encoding):
 162                                 return encoding
 163
 164                 if "*" not in charsets:
 165                     # If no "*" is present in an Accept-Charset field, then all
 166                     # character sets not explicitly mentioned get a quality
 167                     # value of 0, except for ISO-8859-1, which gets a quality
 168                     # value of 1 if not explicitly mentioned.
 169                     iso = 'iso-8859-1'
 170                     if iso not in charsets:
 171                         if self.debug:
 172                             cherrypy.log('Attempting ISO-8859-1 encoding',
 173                                          'TOOLS.ENCODE')
 174                         if encoder(iso):
 175                             return iso
 176
 177         # No suitable encoding found.
 178         ac = request.headers.get('Accept-Charset')
 179         if ac is None:
 180             msg = "Your client did not send an Accept-Charset header."
 181         else:
 182             msg = "Your client sent this Accept-Charset header: %s." % ac
 183         msg += " We tried these charsets: %s." % ", ".join(self.attempted_charsets)
 184         raise cherrypy.HTTPError(406, msg)
 185
 186     def __call__(self, *args, **kwargs):
 187         response = cherrypy.serving.response
 188         self.body = self.oldhandler(*args, **kwargs)
 189
 190         if isinstance(self.body, basestring):
 191             # strings get wrapped in a list because iterating over a single
 192             # item list is much faster than iterating over every character
 193             # in a long string.
 194             if self.body:
 195                 self.body = [self.body]
 196             else:
 197                 # [''] doesn't evaluate to False, so replace it with [].
 198                 self.body = []
 199         elif hasattr(self.body, 'read'):
 200             self.body = file_generator(self.body)
 201         elif self.body is None:
 202             self.body = []
 203
 204         ct = response.headers.elements("Content-Type")
 205         if self.debug:
 206             cherrypy.log('Content-Type: %r' % [str(h) for h in ct], 'TOOLS.ENCODE')
 207         if ct:
 208             ct = ct[0]
 209             if self.text_only:
 210                 if ct.value.lower().startswith("text/"):
 211                     if self.debug:
 212                         cherrypy.log('Content-Type %s starts with "text/"' % ct,
 213                                      'TOOLS.ENCODE')
 214                     do_find = True
 215                 else:
 216                     if self.debug:
 217                         cherrypy.log('Not finding because Content-Type %s does '
 218                                      'not start with "text/"' % ct,
 219                                      'TOOLS.ENCODE')
 220                     do_find = False
 221             else:
 222                 if self.debug:
 223                     cherrypy.log('Finding because not text_only', 'TOOLS.ENCODE')
 224                 do_find = True
 225
 226             if do_find:
 227                 # Set "charset=..." param on response Content-Type header
 228                 ct.params['charset'] = self.find_acceptable_charset()
 229                 if self.add_charset:
 230                     if self.debug:
 231                         cherrypy.log('Setting Content-Type %s' % ct,
 232                                      'TOOLS.ENCODE')
 233                     response.headers["Content-Type"] = str(ct)
 234
 235         return self.body
 236
 237 # GZIP
 238
 239 def compress(body, compress_level):
 240     """Compress 'body' at the given compress_level."""
 241     import zlib
 242
 243     # See http://www.gzip.org/zlib/rfc-gzip.html
 244     yield ntob('\x1f\x8b')       # ID1 and ID2: gzip marker
 245     yield ntob('\x08')           # CM: compression method
 246     yield ntob('\x00')           # FLG: none set
 247     # MTIME: 4 bytes
 248     yield struct.pack("<L", int(time.time()) & int('FFFFFFFF', 16))
 249     yield ntob('\x02')           # XFL: max compression, slowest algo
 250     yield ntob('\xff')           # OS: unknown
 251
 252     crc = zlib.crc32(ntob(""))
 253     size = 0
 254     zobj = zlib.compressobj(compress_level,
 255                             zlib.DEFLATED, -zlib.MAX_WBITS,
 256                             zlib.DEF_MEM_LEVEL, 0)
 257     for line in body:
 258         size += len(line)
 259         crc = zlib.crc32(line, crc)
 260         yield zobj.compress(line)
 261     yield zobj.flush()
 262
 263     # CRC32: 4 bytes
 264     yield struct.pack("<L", crc & int('FFFFFFFF', 16))
 265     # ISIZE: 4 bytes
 266     yield struct.pack("<L", size & int('FFFFFFFF', 16))
 267
 268 def decompress(body):
 269     import gzip
 270
 271     zbuf = BytesIO()
 272     zbuf.write(body)
 273     zbuf.seek(0)
 274     zfile = gzip.GzipFile(mode='rb', fileobj=zbuf)
 275     data = zfile.read()
 276     zfile.close()
 277     return data
 278
 279
 280 def gzip(compress_level=5, mime_types=['text/html', 'text/plain'], debug=False):
 281     """Try to gzip the response body if Content-Type in mime_types.
 282
 283     cherrypy.response.headers['Content-Type'] must be set to one of the
 284     values in the mime_types arg before calling this function.
 285
 286     The provided list of mime-types must be of one of the following form:
 287         * type/subtype
 288         * type/*
 289         * type/*+subtype
 290
 291     No compression is performed if any of the following hold:
 292         * The client sends no Accept-Encoding request header
 293         * No 'gzip' or 'x-gzip' is present in the Accept-Encoding header
 294         * No 'gzip' or 'x-gzip' with a qvalue > 0 is present
 295         * The 'identity' value is given with a qvalue > 0.
 296
 297     """
 298     request = cherrypy.serving.request
 299     response = cherrypy.serving.response
 300
 301     set_vary_header(response, "Accept-Encoding")
 302
 303     if not response.body:
 304         # Response body is empty (might be a 304 for instance)
 305         if debug:
 306             cherrypy.log('No response body', context='TOOLS.GZIP')
 307         return
 308
 309     # If returning cached content (which should already have been gzipped),
 310     # don't re-zip.
 311     if getattr(request, "cached", False):
 312         if debug:
 313             cherrypy.log('Not gzipping cached response', context='TOOLS.GZIP')
 314         return
 315
 316     acceptable = request.headers.elements('Accept-Encoding')
 317     if not acceptable:
 318         # If no Accept-Encoding field is present in a request,
 319         # the server MAY assume that the client will accept any
 320         # content coding. In this case, if "identity" is one of
 321         # the available content-codings, then the server SHOULD use
 322         # the "identity" content-coding, unless it has additional
 323         # information that a different content-coding is meaningful
 324         # to the client.
 325         if debug:
 326             cherrypy.log('No Accept-Encoding', context='TOOLS.GZIP')
 327         return
 328
 329     ct = response.headers.get('Content-Type', '').split(';')[0]
 330     for coding in acceptable:
 331         if coding.value == 'identity' and coding.qvalue != 0:
 332             if debug:
 333                 cherrypy.log('Non-zero identity qvalue: %s' % coding,
 334                              context='TOOLS.GZIP')
 335             return
 336         if coding.value in ('gzip', 'x-gzip'):
 337             if coding.qvalue == 0:
 338                 if debug:
 339                     cherrypy.log('Zero gzip qvalue: %s' % coding,
 340                                  context='TOOLS.GZIP')
 341                 return
 342
 343             if ct not in mime_types:
 344                 # If the list of provided mime-types contains tokens
 345                 # such as 'text/*' or 'application/*+xml',
 346                 # we go through them and find the most appropriate one
 347                 # based on the given content-type.
 348                 # The pattern matching is only caring about the most
 349                 # common cases, as stated above, and doesn't support
 350                 # for extra parameters.
 351                 found = False
 352                 if '/' in ct:
 353                     ct_media_type, ct_sub_type = ct.split('/')
 354                     for mime_type in mime_types:
 355                         if '/' in mime_type:
 356                             media_type, sub_type = mime_type.split('/')
 357                             if ct_media_type == media_type:
 358                                 if sub_type == '*':
 359                                     found = True
 360                                     break
 361                                 elif '+' in sub_type and '+' in ct_sub_type:
 362                                     ct_left, ct_right = ct_sub_type.split('+')
 363                                     left, right = sub_type.split('+')
 364                                     if left == '*' and ct_right == right:
 365                                         found = True
 366                                         break
 367
 368                 if not found:
 369                     if debug:
 370                         cherrypy.log('Content-Type %s not in mime_types %r' %
 371                                      (ct, mime_types), context='TOOLS.GZIP')
 372                     return
 373
 374             if debug:
 375                 cherrypy.log('Gzipping', context='TOOLS.GZIP')
 376             # Return a generator that compresses the page
 377             response.headers['Content-Encoding'] = 'gzip'
 378             response.body = compress(response.body, compress_level)
 379             if "Content-Length" in response.headers:
 380                 # Delete Content-Length header so finalize() recalcs it.
 381                 del response.headers["Content-Length"]
 382
 383             return
 384
 385     if debug:
 386         cherrypy.log('No acceptable encoding found.', context='GZIP')
 387     cherrypy.HTTPError(406, "identity, gzip").set_response()
 388