# python/lib/django-1.2/django/http/multipartparser.py
1 """
2 Multi-part parsing for file uploads.
4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5 file upload handlers for processing.
6 """
8 import cgi
9 from django.conf import settings
10 from django.core.exceptions import SuspiciousOperation
11 from django.utils.datastructures import MultiValueDict
12 from django.utils.encoding import force_unicode
13 from django.utils.text import unescape_entities
14 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
16 __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
18 class MultiPartParserError(Exception):
19 pass
21 class InputStreamExhausted(Exception):
22 """
23 No more reads are allowed from this device.
24 """
25 pass
27 RAW = "raw"
28 FILE = "file"
29 FIELD = "field"

class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of upload handler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """

        # Content-Type should contain multipart and the boundary information.

        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # For now set it to 0; we'll try again later on down.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31-4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
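
    # A minimal usage sketch (illustrative only, not part of the original
    # module). Within Django this is normally driven for you by
    # HttpRequest._load_post_and_files(); ``request`` below is an assumed
    # HttpRequest with a multipart/form-data body:
    #
    #     parser = MultiPartParser(request.META, request, request.upload_handlers)
    #     post, files = parser.parse()
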
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Cleanup filename from Internet Explorer full paths."""
        return filename and filename[filename.rfind("\\")+1:].strip()
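
# Illustrative behaviour of IE_sanitize (a sketch, not part of the original
# module). Older Internet Explorer versions send the client's full local path
# as the filename, so everything up to the last backslash is dropped; given
# some MultiPartParser instance ``parser`` (assumed here):
#
#     >>> parser.IE_sanitize('C:\\Users\\me\\photo.jpg')
#     'photo.jpg'
#     >>> parser.IE_sanitize('photo.jpg')
#     'photo.jpg'
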
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently returned
        from the iterator. Useful to avoid unnecessary bookkeeping if
        performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, this object will yield chunks of read operations from that
    object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

class LimitBytes(object):
    """ Limit bytes for a file object. """
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.
        If you ask for too much or there isn't anything left,
        this will raise an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
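
# Illustrative LimitBytes behaviour (a sketch, not part of the original file):
#
#     >>> from StringIO import StringIO
#     >>> limited = LimitBytes(StringIO('hello world'), 5)
#     >>> limited.read()      # never returns more than ``length`` bytes in total
#     'hello'
#     >>> limited.read()      # nothing left -> raises InputStreamExhausted
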
class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()

class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to .next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data None is returned instead. Otherwise
        a tuple containing the indices of the following are returned:

         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end-1)] == '\n':
                end -= 1
            if data[max(0, end-1)] == '\r':
                end -= 1
            return end, next
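
# Illustrative _find_boundary behaviour (a sketch, not part of the original
# file), assuming ``it`` is a BoundaryIter built with the separator '--token':
#
#     >>> it._find_boundary('value\r\n--token\r\nnext part')
#     (5, 14)
#
# i.e. the part's payload ends at index 5 (the CRLF before the separator is
# backed over) and the bytes after the separator start at index 14.
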
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    for __ in iterator:
        pass

def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the index of the first of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
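
# How the pieces fit together (an explanatory note, not part of the original
# file): Parser splits the incoming LazyStream on '--<boundary>' via
# InterBoundaryIter/BoundaryIter, and parse_boundary_stream() classifies each
# part as RAW, FIELD or FILE from its Content-Disposition header. A rough
# sketch of iterating it directly, assuming ``fp`` is a file-like body and
# ``boundary`` is the token taken from the Content-Type header:
#
#     stream = LazyStream(ChunkIter(fp))
#     for item_type, headers, part_stream in Parser(stream, boundary):
#         print item_type, headers.get('content-disposition')
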
def parse_header(line):
    """ Parse the header into a key-value. """
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
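
# Illustrative parse_header behaviour (a sketch, not part of the original
# file; dictionary key order in the printed result may vary):
#
#     >>> parse_header('form-data; name="file"; filename="test.txt"')
#     ('form-data', {'name': 'file', 'filename': 'test.txt'})
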
def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist