"""
Multi-part parsing for file uploads.

Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
file upload handlers for processing.
"""
import cgi

from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.utils.datastructures import MultiValueDict
from django.utils.encoding import force_unicode
from django.utils.text import unescape_entities
from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers

__all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
class MultiPartParserError(Exception):
    pass

class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """
    pass

RAW = "raw"
FILE = "file"
FIELD = "field"
class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # For now set it to 0; we'll try again later on down.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue... raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2 ** 31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data, self._meta,
                                              self._content_length,
                                              self._boundary, encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field; we can just set it in the post data.
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding.
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break
                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed.
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Clean up filenames that Internet Explorer sends as full paths."""
        return filename and filename[filename.rfind("\\") + 1:].strip()
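
# Illustrative usage sketch, not part of the original module: one way to drive
# MultiPartParser by hand. It assumes a configured Django settings module and
# uses Django's MemoryFileUploadHandler; the boundary and body below are made
# up for the example.
def _example_multipartparser():
    from StringIO import StringIO
    from django.core.files.uploadhandler import MemoryFileUploadHandler
    body = ('--boundary\r\n'
            'Content-Disposition: form-data; name="title"\r\n'
            '\r\n'
            'hello\r\n'
            '--boundary--\r\n')
    meta = {
        'CONTENT_TYPE': 'multipart/form-data; boundary=boundary',
        'CONTENT_LENGTH': str(len(body)),
    }
    parser = MultiPartParser(meta, StringIO(body), [MemoryFileUploadHandler()])
    post, files = parser.parse()
    # post['title'] == u'hello'; files is empty for this body.
    return post, files
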
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream and stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently available
        from the iterator. Useful to avoid unnecessary bookkeeping if
        performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, this object will yield chunks of read operations from that
    object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

class LimitBytes(object):
    """ Limit bytes for a file object. """
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.
        If you ask for too much or there isn't anything left,
        this will raise an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
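
# Illustrative sketch, not part of the original module: LimitBytes caps reads
# at the declared Content-Length and then raises InputStreamExhausted.
def _example_limitbytes():
    from StringIO import StringIO
    limited = LimitBytes(StringIO('abcdef'), 4)
    assert limited.read() == 'abcd'     # only the first 4 bytes are visible
    try:
        limited.read()
    except InputStreamExhausted:
        pass                            # further reads signal exhaustion
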
class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()

class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to .next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:
                # There's nothing left; just return it all and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data, None is returned. Otherwise a
        tuple is returned containing the indices of:

         * the end of the current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end - 1)] == '\n':
                end -= 1
            if data[max(0, end - 1)] == '\r':
                end -= 1
            return end, next

def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    # Iterate until the iterator is exhausted.
    for __ in iterator:
        pass

def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except ValueError:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except ValueError:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
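
# Illustrative sketch, not part of the original module: splitting a hand-built
# two-part body with Parser. The boundary and payload are made up; note that
# the preamble and the trailing '--' epilogue surface as RAW items, which
# MultiPartParser.parse() simply skips.
def _example_parser():
    from StringIO import StringIO
    body = ('--b\r\n'
            'Content-Disposition: form-data; name="title"\r\n'
            '\r\n'
            'hello\r\n'
            '--b\r\n'
            'Content-Disposition: form-data; name="doc"; filename="a.txt"\r\n'
            'Content-Type: text/plain\r\n'
            '\r\n'
            'file contents\r\n'
            '--b--\r\n')
    stream = LazyStream(ChunkIter(StringIO(body)))
    results = []
    for item_type, meta_data, field_stream in Parser(stream, 'b'):
        # Each part must be consumed before advancing to the next boundary.
        results.append((item_type, field_stream.read()))
    # results == [(RAW, ''), (FIELD, 'hello'), (FILE, 'file contents'),
    #             (RAW, '--\r\n')]
    return results
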
def parse_header(line):
    """ Parse the header into a key-value. """
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i + 1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict

def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist
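
# Illustrative sketch, not part of the original module: parse_header() on a
# typical Content-Disposition value, yielding the key and a params dict with
# quotes stripped.
def _example_parse_header():
    key, pdict = parse_header('form-data; name="doc"; filename="a.txt"')
    assert key == 'form-data'
    assert pdict == {'name': 'doc', 'filename': 'a.txt'}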