1 # Copyright (C) 2001-2007 by the Free Software Foundation, Inc.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
18 """Cleanse a message for archiving."""
20 from __future__
import with_statement
31 from cStringIO
import StringIO
32 from mimetypes
import guess_all_extensions
34 from email
.charset
import Charset
35 from email
.generator
import Generator
36 from email
.parser
import HeaderParser
37 from email
.utils
import make_msgid
, parsedate
39 from Mailman
import Message
40 from Mailman
import Utils
41 from Mailman
.Errors
import DiscardMessage
42 from Mailman
.app
.archiving
import get_base_archive_url
43 from Mailman
.configuration
import config
44 from Mailman
.i18n
import _
45 from Mailman
.lockfile
import LockFile
47 # Path characters for common platforms
48 pre
= re
.compile(r
'[/\\:]')
49 # All other characters to strip out of Content-Disposition: filenames
50 # (essentially anything that isn't an alphanum, dot, dash, or underscore).
51 sre
= re
.compile(r
'[^-\w.]')
52 # Regexp to strip out leading dots
53 dre
= re
.compile(r
'^\.*')
58 log
= logging
.getLogger('mailman.error')
def guess_extension(ctype, ext):
    """Choose a filename extension for the MIME type `ctype`.

    `ext` is the extension the attachment claims for itself.  It is handed
    back unchanged when mimetypes lists it among the extensions for `ctype`;
    otherwise the first extension mimetypes knows for that type is used.
    When mimetypes knows no extension at all for `ctype`, the empty (and
    therefore false) candidate list is returned, so callers should test the
    result for truth rather than assume a string.
    """
    # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
    # and .wiz are all mapped to application/msword.  This sucks for finding
    # the best reverse mapping.  If the extension is one of the given
    # mappings, we'll trust that, otherwise we'll just guess. :/
    candidates = guess_all_extensions(ctype, strict=False)
    if ext in candidates:
        return ext
    if candidates:
        return candidates[0]
    return candidates
74 # We're using a subclass of the standard Generator because we want to suppress
75 # headers in the subparts of multiparts. We use a hack -- the ctor argument
76 # skipheaders to accomplish this. It's set to false for the outer Message
77 # object, but defaults to true for all internal objects. We recognize that
78 # sub-Generators will get created passing only mangle_from_ and maxheaderlen
79 # to the ctors, so they always pick up that default.
81 # This isn't perfect because we still get stuff like the multipart boundaries,
82 # but see below for how we corrupt that to our nefarious goals.
class ScrubberGenerator(Generator):
    """A Generator that can leave a message's headers out of its output.

    Headers are written only when the generator was built with a false
    `skipheaders`; with the default (true) value nothing is emitted for the
    header block and only the body is generated.
    """

    def __init__(self, outfp, mangle_from_=True,
                 maxheaderlen=78, skipheaders=True):
        # mangle_from_ and maxheaderlen are accepted so the signature lines
        # up with what the base Generator passes around, but neither value
        # is forwarded: the base class is always set up with From-mangling
        # off and its own default header length.
        Generator.__init__(self, outfp, mangle_from_=False)
        self.__skipheaders = skipheaders

    def _write_headers(self, msg):
        """Write msg's headers unless this generator is skipping them."""
        if self.__skipheaders:
            return
        Generator._write_headers(self, msg)
def safe_strftime(fmt, t):
    """Format the time value `t` with `fmt`, or return None if it can't be.

    This wraps time.strftime() so that a missing, malformed or out-of-range
    time value (TypeError, ValueError or OverflowError) yields None instead
    of propagating; callers test the result for truth and fall back to
    another date source.
    """
    try:
        formatted = time.strftime(fmt, t)
    except (TypeError, ValueError, OverflowError):
        return None
    return formatted
def calculate_attachments_dir(mlist, msg, msgdata):
    """Return the relative directory that holds this message's attachments.

    The result has the form attachments/YYYYMMDD/<msgid-hash>, where the
    date component comes from the first usable source among the Date:
    header (or, when there is no Date: header, msgdata's 'received_time' or
    the current time), msgdata's 'X-List-Received-Date', and the message's
    unixfrom line.  If none of them yields a date the component is
    '00000000'.

    mlist is accepted for signature compatibility; it is not consulted
    here.  msg is the message being scrubbed; if it has no Message-ID:
    header, one is generated with make_msgid() and stored on msg as a side
    effect.  msgdata is the message metadata dictionary.
    """
    # Calculate the directory that attachments for this message will go
    # under.  To avoid inode limitations, the scheme will be:
    # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
    # Start by calculating the date-based and msgid-hash components.
    fmt = '%Y%m%d'
    datestr = msg.get('Date')
    if datestr:
        # parsedate() returns None for an unparseable header, which
        # safe_strftime() turns into a None datedir below.
        now = parsedate(datestr)
    else:
        now = time.gmtime(msgdata.get('received_time', time.time()))
    datedir = safe_strftime(fmt, now)
    if not datedir:
        datestr = msgdata.get('X-List-Received-Date')
        if datestr:
            # NOTE(review): the value is handed to strftime as-is.  If it is
            # a header string rather than a time tuple, safe_strftime() will
            # return None and this fallback is a no-op -- confirm what type
            # is stored in msgdata.
            datedir = safe_strftime(fmt, datestr)
    if not datedir:
        # What next?  Unixfrom, I guess.  get_unixfrom() returns None when
        # the message carries no envelope line, so substitute an empty
        # string; the empty parts list then lands in the IndexError branch.
        parts = (msg.get_unixfrom() or '').split()
        try:
            month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
                     'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
                     }.get(parts[3], 0)
            day = int(parts[4])
            year = int(parts[6])
        except (IndexError, ValueError):
            # Best we can do I think
            month = day = year = 0
        datedir = '%04d%02d%02d' % (year, month, day)
    # As for the msgid hash, we'll base this part on the Message-ID: so that
    # all attachments for the same message end up in the same directory (we'll
    # uniquify the filenames in that directory as needed).  We use the first 2
    # and last 2 bytes of the SHA1 hash of the message id as the basis of the
    # directory name.  Clashes here don't really matter too much, and that
    # still gives us a 32-bit space to work with.
    msgid = msg['message-id']
    if msgid is None:
        msgid = msg['Message-ID'] = make_msgid()
    # We assume that the message id actually /is/ unique!
    digest = sha.new(msgid).hexdigest()
    return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
def replace_payload_by_text(msg, text, charset):
    """Replace the content of `msg` with the text `text` in `charset`.

    The existing Content-Type: and Content-Transfer-Encoding: headers are
    removed and the payload is set together with a charset, which leaves
    the part as text/plain.  `text` may be a byte string or unicode; unicode
    is encoded to `charset` first.  `charset` may be a charset name or any
    object whose str() is one.

    Encoding errors (e.g. UnicodeError, or LookupError for an unknown
    charset) are not caught here; callers that try several charsets rely on
    them propagating.
    """
    # TK: This is a common function in replacing the attachment and the main
    # message by a text (scrubbing).
    del msg['content-type']
    del msg['content-transfer-encoding']
    # Settle on a plain-string charset name before anything uses it, so the
    # same value serves both the codec lookup and set_payload().
    if not isinstance(charset, str):
        charset = str(charset)
    if isinstance(text, unicode):
        text = text.encode(charset)
    msg.set_payload(text, charset)
158 def process(mlist
, msg
, msgdata
=None):
159 sanitize
= config
.ARCHIVE_HTML_SANITIZER
164 # msgdata is available if it is in GLOBAL_PIPELINE
165 # ie. not in digest or archiver
166 # check if the list owner want to scrub regular delivery
167 if not mlist
.scrub_nondigest
:
169 dir = calculate_attachments_dir(mlist
, msg
, msgdata
)
170 charset
= format
= delsp
= None
171 lcset
= Utils
.GetCharSet(mlist
.preferred_language
)
172 lcset_out
= Charset(lcset
).output_charset
or lcset
173 # Now walk over all subparts of this message and scrub out various types
174 for part
in msg
.walk():
175 ctype
= part
.get_content_type()
176 # If the part is text/plain, we leave it alone
177 if ctype
== 'text/plain':
178 # We need to choose a charset for the scrubbed message, so we'll
179 # arbitrarily pick the charset of the first text/plain part in the
182 # Also get the RFC 3676 stuff from this part. This seems to
183 # work okay for scrub_nondigest. It will also work as far as
184 # scrubbing messages for the archive is concerned, but Pipermail
185 # doesn't pay any attention to the RFC 3676 parameters. The plain
186 # format digest is going to be a disaster in any case as some of
187 # messages will be format="flowed" and some not. ToDigest creates
188 # its own Content-Type: header for the plain digest which won't
189 # have RFC 3676 parameters. If the message Content-Type: headers
190 # are retained for display in the digest, the parameters will be
191 # there for information, but not for the MUA. This is the best we
192 # can do without having get_payload() process the parameters.
194 charset
= part
.get_content_charset(lcset
)
195 format
= part
.get_param('format')
196 delsp
= part
.get_param('delsp')
197 # TK: if part is attached then check charset and scrub if none
198 if part
.get('content-disposition') and \
199 not part
.get_content_charset():
200 url
= save_attachment(mlist
, part
, dir)
201 filename
= part
.get_filename(_('not available'))
202 filename
= Utils
.oneline(filename
, lcset
)
203 replace_payload_by_text(part
, _("""\
204 An embedded and charset-unspecified text was scrubbed...
208 elif ctype
== 'text/html' and isinstance(sanitize
, int):
212 replace_payload_by_text(part
,
213 _('HTML attachment scrubbed and removed'),
214 # Adding charset arg and removing content-type
215 # sets content-type to text/plain
218 # By leaving it alone, Pipermail will automatically escape it
221 # Pull it out as an attachment but leave it unescaped. This
222 # is dangerous, but perhaps useful for heavily moderated
224 url
= save_attachment(mlist
, part
, dir, filter_html
=False)
225 replace_payload_by_text(part
, _("""\
226 An HTML attachment was scrubbed...
230 # HTML-escape it and store it as an attachment, but make it
231 # look a /little/ bit prettier. :(
232 payload
= Utils
.websafe(part
.get_payload(decode
=True))
233 # For whitespace in the margin, change spaces into
234 # non-breaking spaces, and tabs into 8 of those. Then use a
235 # mono-space font. Still looks hideous to me, but then I'd
236 # just as soon discard them.
238 return s
.replace(' ', ' ').replace('\t', ' '*8)
239 lines
= [doreplace(s
) for s
in payload
.split('\n')]
240 payload
= '<tt>\n' + BR
.join(lines
) + '\n</tt>\n'
241 part
.set_payload(payload
)
242 # We're replacing the payload with the decoded payload so this
243 # will just get in the way.
244 del part
['content-transfer-encoding']
245 url
= save_attachment(mlist
, part
, dir, filter_html
=False)
246 replace_payload_by_text(part
, _("""\
247 An HTML attachment was scrubbed...
250 elif ctype
== 'message/rfc822':
251 # This part contains a submessage, so it too needs scrubbing
252 submsg
= part
.get_payload(0)
253 url
= save_attachment(mlist
, part
, dir)
254 subject
= submsg
.get('subject', _('no subject'))
255 date
= submsg
.get('date', _('no date'))
256 who
= submsg
.get('from', _('unknown sender'))
257 size
= len(str(submsg
))
258 replace_payload_by_text(part
, _("""\
259 An embedded message was scrubbed...
266 # If the message isn't a multipart, then we'll strip it out as an
267 # attachment that would have to be separately downloaded. Pipermail
268 # will transform the url into a hyperlink.
269 elif part
and not part
.is_multipart():
270 payload
= part
.get_payload(decode
=True)
271 ctype
= part
.get_content_type()
272 # XXX Under email 2.5, it is possible that payload will be None.
273 # This can happen when you have a Content-Type: multipart/* with
274 # only one part and that part has two blank lines between the
275 # first boundary and the end boundary. In email 3.0 you end up
276 # with a string in the payload. I think in this case it's safe to
281 url
= save_attachment(mlist
, part
, dir)
282 desc
= part
.get('content-description', _('not available'))
283 desc
= Utils
.oneline(desc
, lcset
)
284 filename
= part
.get_filename(_('not available'))
285 filename
= Utils
.oneline(filename
, lcset
)
286 replace_payload_by_text(part
, _("""\
287 A non-text attachment was scrubbed...
295 # We still have to sanitize multipart messages to flat text because
296 # Pipermail can't handle messages with list payloads. This is a kludge;
297 # def (n) clever hack ;).
298 if msg
.is_multipart() and sanitize
<> 2:
299 # By default we take the charset of the first text/plain part in the
300 # message, but if there was none, we'll use the list's preferred
301 # language's charset.
302 if not charset
or charset
== 'us-ascii':
305 # normalize to the output charset if input/output are different
306 charset
= Charset(charset
).output_charset
or charset
307 # We now want to concatenate all the parts which have been scrubbed to
308 # text/plain, into a single text/plain payload. We need to make sure
309 # all the characters in the concatenated string are in the same
310 # encoding, so we'll use the 'replace' key in the coercion call.
311 # BAW: Martin's original patch suggested we might want to try
312 # generalizing to utf-8, and that's probably a good idea (eventually).
315 for part
in msg
.walk():
316 # TK: bug-id 1099138 and multipart
317 if not part
or part
.is_multipart():
319 # All parts should be scrubbed to text/plain by now.
320 partctype
= part
.get_content_type()
321 if partctype
<> 'text/plain':
322 text
.append(_('Skipped content of type %(partctype)s\n'))
325 t
= part
.get_payload(decode
=True) or ''
326 # MAS: TypeError exception can occur if payload is None. This
327 # was observed with a message that contained an attached
328 # message/delivery-status part. Because of the special parsing
329 # of this type, this resulted in a text/plain sub-part with a
330 # null body. See bug 1430236.
331 except (binascii
.Error
, TypeError):
332 t
= part
.get_payload() or ''
333 # Email problem was solved by Mark Sapiro. (TK)
334 partcharset
= part
.get_content_charset('us-ascii')
336 t
= unicode(t
, partcharset
, 'replace')
337 except (UnicodeError, LookupError, ValueError, TypeError,
339 # We can get here if partcharset is bogus in some way.
340 # Replace funny characters. We use errors='replace'.
341 t
= unicode(t
, 'ascii', 'replace')
342 # Separation is useful
343 if isinstance(t
, basestring
):
344 if not t
.endswith('\n'):
347 if partcharset
not in charsets
:
348 charsets
.append(partcharset
)
349 # Now join the text and set the payload
350 sep
= _('-------------- next part --------------\n')
351 assert isinstance(sep
, unicode), (
352 'Expected a unicode separator, got %s' % type(sep
))
353 rept
= sep
.join(text
)
354 # Replace entire message with text and scrubbed notice.
355 # Try with message charsets and utf-8
356 if 'utf-8' not in charsets
:
357 charsets
.append('utf-8')
358 for charset
in charsets
:
360 replace_payload_by_text(msg
, rept
, charset
)
362 # Bogus charset can throw several exceptions
363 except (UnicodeError, LookupError, ValueError, TypeError,
367 msg
.set_param('format', format
)
369 msg
.set_param('delsp', delsp
)
375 # Create all the directories to store this attachment in and try to make
376 # sure that the permissions of the directories are set correctly.
378 os
.makedirs(dir, 02775)
380 if e
.errno
== errno
.EEXIST
:
382 # Some systems such as FreeBSD ignore mkdir's mode, so walk the just
383 # created directories and try to set the mode, ignoring any OSErrors that
385 for dirpath
, dirnames
, filenames
in os
.walk(dir):
387 os
.chmod(dirpath
, 02775)
393 def save_attachment(mlist
, msg
, dir, filter_html
=True):
394 fsdir
= os
.path
.join(config
.PRIVATE_ARCHIVE_FILE_DIR
,
395 mlist
.fqdn_listname
, dir)
397 # Figure out the attachment type and get the decoded data
398 decodedpayload
= msg
.get_payload(decode
=True)
399 # BAW: mimetypes ought to handle non-standard, but commonly found types,
400 # e.g. image/jpg (should be image/jpeg). For now we just store such
401 # things as application/octet-streams since that seems the safest.
402 ctype
= msg
.get_content_type()
403 # i18n file name is encoded
404 lcset
= Utils
.GetCharSet(mlist
.preferred_language
)
405 filename
= Utils
.oneline(msg
.get_filename(''), lcset
)
406 filename
, fnext
= os
.path
.splitext(filename
)
407 # For safety, we should confirm this is valid ext for content-type
408 # but we can use fnext if we introduce fnext filtering
409 if config
.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION
:
410 # HTML message doesn't have filename :-(
411 ext
= fnext
or guess_extension(ctype
, fnext
)
413 ext
= guess_extension(ctype
, fnext
)
414 # Allow only alphanumerics, dash, underscore, and dot
415 ext
= sre
.sub('', ext
)
417 # We don't know what it is, so assume it's just a shapeless
418 # application/octet-stream, unless the Content-Type: is
419 # message/rfc822, in which case we know we'll coerce the type to
421 if ctype
== 'message/rfc822':
426 # We need a lock to calculate the next attachment number
427 with
LockFile(os
.path
.join(fsdir
, 'attachments.lock')):
428 # Now base the filename on what's in the attachment, uniquifying it if
430 if not filename
or config
.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME
:
431 filebase
= 'attachment'
433 # Sanitize the filename given in the message headers
434 parts
= pre
.split(filename
)
436 # Strip off leading dots
437 filename
= dre
.sub('', filename
)
438 # Allow only alphanumerics, dash, underscore, and dot
439 filename
= sre
.sub('', filename
)
440 # If the filename's extension doesn't match the type we guessed,
441 # which one should we go with? For now, let's go with the one we
442 # guessed so attachments can't lie about their type. Also, if the
443 # filename /has/ no extension, then tack on the one we guessed.
444 # The extension was removed from the name above.
446 # Now we're looking for a unique name for this file on the file
447 # system. If msgdir/filebase.ext isn't unique, we'll add a counter
448 # after filebase, e.g. msgdir/filebase-cnt.ext
452 path
= os
.path
.join(fsdir
, filebase
+ extra
+ ext
)
453 # Generally it is not a good idea to test for file existence
454 # before just trying to create it, but the alternatives aren't
455 # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
456 # NFS-safe). Besides, we have an exclusive lock now, so we're
457 # guaranteed that no other process will be racing with us.
458 if os
.path
.exists(path
):
460 extra
= '-%04d' % counter
463 # `path' now contains the unique filename for the attachment. There's
464 # just one more step we need to do. If the part is text/html and
465 # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
466 # here), then send the attachment through the filter program for
468 if filter_html
and ctype
== 'text/html':
469 base
, ext
= os
.path
.splitext(path
)
470 tmppath
= base
+ '-tmp' + ext
471 fp
= open(tmppath
, 'w')
473 fp
.write(decodedpayload
)
475 cmd
= config
.ARCHIVE_HTML_SANITIZER
% {'filename' : tmppath
}
476 progfp
= os
.popen(cmd
, 'r')
477 decodedpayload
= progfp
.read()
478 status
= progfp
.close()
480 log
.error('HTML sanitizer exited with non-zero status: %s',
484 # BAW: Since we've now sanitized the document, it should be plain
485 # text. Blarg, we really want the sanitizer to tell us what the type
486 # of the return data is. :(
489 # Is it a message/rfc822 attachment?
490 elif ctype
== 'message/rfc822':
491 submsg
= msg
.get_payload()
492 # BAW: I'm sure we can eventually do better than this. :(
493 decodedpayload
= Utils
.websafe(str(submsg
))
495 fp
.write(decodedpayload
)
497 # Now calculate the url
498 baseurl
= get_base_archive_url(mlist
)
499 # Private archives will likely have a trailing slash. Normalize.
500 if baseurl
[-1] <> '/':
502 # Trailing space will definitely be a problem with format=flowed.
503 # Bracket the URL instead.
504 url
= '<' + baseurl
+ '%s/%s%s%s>' % (dir, filebase
, extra
, ext
)