# python/lib/django-1.2/django/http/multipartparser.py
1 """
2 Multi-part parsing for file uploads.
4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
5 file upload handlers for processing.
6 """
8 import cgi
9 from django.conf import settings
10 from django.core.exceptions import SuspiciousOperation
11 from django.utils.datastructures import MultiValueDict
12 from django.utils.encoding import force_unicode
13 from django.utils.text import unescape_entities
14 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
16 __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
18 class MultiPartParserError(Exception):
19 pass
21 class InputStreamExhausted(Exception):
22 """
23 No more reads are allowed from this device.
24 """
25 pass
27 RAW = "raw"
28 FILE = "file"
29 FIELD = "field"

class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of upload handler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """

        # Content-Type should contain multipart and the boundary information.

        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # For now set it to 0; we'll try again later on down.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31-4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
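
    # A minimal usage sketch (illustrative only, not part of the original
    # module). Within Django this is normally driven for you by
    # HttpRequest._load_post_and_files(); ``request`` below is an assumed
    # HttpRequest with a multipart/form-data body:
    #
    #     parser = MultiPartParser(request.META, request, request.upload_handlers)
    #     post, files = parser.parse()
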
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Cleanup filename from Internet Explorer full paths."""
        return filename and filename[filename.rfind("\\")+1:].strip()
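
# Illustrative behaviour of IE_sanitize (a sketch, not part of the original
# module). Older Internet Explorer versions send the client's full local path
# as the filename, so everything up to the last backslash is dropped; given
# some MultiPartParser instance ``parser`` (assumed here):
#
#     >>> parser.IE_sanitize('C:\\Users\\me\\photo.jpg')
#     'photo.jpg'
#     >>> parser.IE_sanitize('photo.jpg')
#     'photo.jpg'
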
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently returned
        from the iterator. Useful to avoid unnecessary bookkeeping if
        performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, this object will yield chunks of read operations from that
    object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

class LimitBytes(object):
    """ Limit bytes for a file object. """
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.
        If you ask for too much or there isn't anything left,
        this will raise an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
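
# Illustrative LimitBytes behaviour (a sketch, not part of the original file):
#
#     >>> from StringIO import StringIO
#     >>> limited = LimitBytes(StringIO('hello world'), 5)
#     >>> limited.read()      # never returns more than ``length`` bytes in total
#     'hello'
#     >>> limited.read()      # nothing left -> raises InputStreamExhausted
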
class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()

class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to .next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data None is returned instead. Otherwise
        a tuple containing the indices of the following are returned:

         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end-1)] == '\n':
                end -= 1
            if data[max(0, end-1)] == '\r':
                end -= 1
            return end, next
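
# Illustrative _find_boundary behaviour (a sketch, not part of the original
# file), assuming ``it`` is a BoundaryIter built with the separator '--token':
#
#     >>> it._find_boundary('value\r\n--token\r\nnext part')
#     (5, 14)
#
# i.e. the part's payload ends at index 5 (the CRLF before the separator is
# backed over) and the bytes after the separator start at index 14.
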
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    for __ in iterator:
        pass

def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the index of the first of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
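
# How the pieces fit together (an explanatory note, not part of the original
# file): Parser splits the incoming LazyStream on '--<boundary>' via
# InterBoundaryIter/BoundaryIter, and parse_boundary_stream() classifies each
# part as RAW, FIELD or FILE from its Content-Disposition header. A rough
# sketch of iterating it directly, assuming ``fp`` is a file-like body and
# ``boundary`` is the token taken from the Content-Type header:
#
#     stream = LazyStream(ChunkIter(fp))
#     for item_type, headers, part_stream in Parser(stream, boundary):
#         print item_type, headers.get('content-disposition')
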
def parse_header(line):
    """ Parse the header into a key-value. """
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
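
# Illustrative parse_header behaviour (a sketch, not part of the original
# file; dictionary key order in the printed result may vary):
#
#     >>> parse_header('form-data; name="file"; filename="test.txt"')
#     ('form-data', {'name': 'file', 'filename': 'test.txt'})
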
def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist