Target Python 2.6. Make the test suite pass without deprecations.
[mailman.git] / mailman / pipeline / scrubber.py
bloba513d0689c6bbf0cb37dee30945995151d0506cf
1 # Copyright (C) 2001-2008 by the Free Software Foundation, Inc.
3 # This file is part of GNU Mailman.
5 # GNU Mailman is free software: you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free
7 # Software Foundation, either version 3 of the License, or (at your option)
8 # any later version.
10 # GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 # more details.
15 # You should have received a copy of the GNU General Public License along with
16 # GNU Mailman. If not, see <http://www.gnu.org/licenses/>.
18 """Cleanse a message for archiving."""
20 from __future__ import with_statement
22 __metaclass__ = type
23 __all__ = ['Scrubber']
26 import os
27 import re
28 import time
29 import errno
30 import hashlib
31 import logging
32 import binascii
34 from email.charset import Charset
35 from email.generator import Generator
36 from email.utils import make_msgid, parsedate
37 from locknix.lockfile import Lock
38 from mimetypes import guess_all_extensions
39 from zope.interface import implements
41 from mailman import Utils
42 from mailman.configuration import config
43 from mailman.core.errors import DiscardMessage
44 from mailman.core.plugins import get_plugin
45 from mailman.i18n import _
46 from mailman.interfaces import IHandler
# Path separator characters for common platforms.  A Content-Disposition:
# filename is split on these so that only its last component is kept.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
dre = re.compile(r'^\.*')

# Line separator used when joining the lines of an HTML-escaped payload.
BR = '<br>\n'
SPACE = ' '

# Problems such as a failing HTML sanitizer are reported to this logger.
log = logging.getLogger('mailman.error')
def guess_extension(ctype, ext):
    """Choose a filename extension for the MIME type `ctype`.

    mimetypes maps several extensions to one type (e.g. .doc, .dot and .wiz
    are all application/msword), which makes the reverse mapping ambiguous.
    If `ext` is one of the extensions known for `ctype` it is trusted and
    returned as is; otherwise the first known extension is returned as a
    guess.  When nothing is known about `ctype` the (false) empty list from
    guess_all_extensions() is returned.
    """
    candidates = guess_all_extensions(ctype, strict=False)
    if ext in candidates:
        return ext
    if candidates:
        return candidates[0]
    # Nothing known for this type: hand back the empty list, which callers
    # test for falsity.
    return candidates
# A Generator subclass whose only job is to be able to leave headers out of
# what it writes.  The hack is the extra ctor argument `skipheaders`: header
# output is suppressed whenever it is true.  The other ctor arguments exist
# so the signature stays compatible with Generator, which (as far as we
# know) creates its sub-Generators passing only mangle_from_ and
# maxheaderlen -- those instances therefore get the default skipheaders.
#
# This isn't perfect because things like the multipart boundaries are still
# emitted.
class ScrubberGenerator(Generator):
    """Generator that can omit the headers of the messages it writes."""

    def __init__(self, outfp, mangle_from_=True,
                 maxheaderlen=78, skipheaders=True):
        # mangle_from_ and maxheaderlen are accepted but not forwarded;
        # From_ mangling is always switched off.
        self.__skipheaders = skipheaders
        Generator.__init__(self, outfp, mangle_from_=False)

    def _write_headers(self, msg):
        if self.__skipheaders:
            return
        Generator._write_headers(self, msg)
def safe_strftime(fmt, t):
    """Format the time tuple `t` with `fmt`.

    Returns None instead of raising when `t` is not something
    time.strftime() can format (wrong type, out-of-range fields).
    """
    try:
        return time.strftime(fmt, t)
    except (TypeError, ValueError, OverflowError):
        return None


def calculate_attachments_dir(mlist, msg, msgdata):
    """Return the relative directory for this message's attachments.

    To avoid inode limitations, the scheme is
    attachments/YYYYMMDD/<msgid-hash>, to be placed under
    archives/private/<listname>/ by the caller.

    The date component is taken from, in order of preference: the Date:
    header (or, when there is none, msgdata['received_time'] / the current
    time), msgdata['X-List-Received-Date'], and finally the Unix-From line.
    If none of these yields a date, '00000000' is used.

    As a side effect, a Message-ID: header is added to `msg` when it has
    none.  `mlist` is not used.
    """
    fmt = '%Y%m%d'
    datestr = msg.get('Date')
    if datestr:
        now = parsedate(datestr)
    else:
        now = time.gmtime(msgdata.get('received_time', time.time()))
    datedir = safe_strftime(fmt, now)
    if not datedir:
        received = msgdata.get('X-List-Received-Date')
        if received:
            # Accept an already parsed time tuple as is; a header string
            # must be parsed first because strftime() rejects strings.
            datedir = safe_strftime(fmt, received)
            if not datedir:
                try:
                    datedir = safe_strftime(fmt, parsedate(received))
                except (AttributeError, TypeError):
                    # Neither a time tuple nor a string.
                    datedir = None
    if not datedir:
        # What next?  Unixfrom, I guess.  There may be no Unix-From line at
        # all, in which case we end up with the all-zeros fallback below.
        parts = (msg.get_unixfrom() or '').split()
        months = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
                  'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
                  }
        try:
            month = months.get(parts[3], 0)
            day = int(parts[4])
            year = int(parts[6])
        except (IndexError, ValueError):
            # Best we can do I think
            month = day = year = 0
        datedir = '%04d%02d%02d' % (year, month, day)
    assert datedir
    # As for the msgid hash, we'll base this part on the Message-ID: so that
    # all attachments for the same message end up in the same directory (we'll
    # uniquify the filenames in that directory as needed).  We use the first 2
    # and last 2 bytes of the SHA1 hash of the message id as the basis of the
    # directory name.  Clashes here don't really matter too much, and that
    # still gives us a 32-bit space to work with.
    msgid = msg['message-id']
    if msgid is None:
        msgid = msg['Message-ID'] = make_msgid()
    # We assume that the message id actually /is/ unique!  The hash wants a
    # byte string, so encode a text message id first (a no-op for ASCII).
    if isinstance(msgid, bytes):
        raw_msgid = msgid
    else:
        raw_msgid = msgid.encode('utf-8')
    digest = hashlib.sha1(raw_msgid).hexdigest()
    return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
def replace_payload_by_text(msg, text, charset):
    """Replace the payload of `msg` with `text` in the given `charset`.

    This is the common helper for scrubbing both an attachment and the main
    message.  The existing Content-Type: and Content-Transfer-Encoding:
    headers are removed first; setting the payload together with a charset
    then makes the part text/plain.

    `text` may be a byte string or unicode; unicode is encoded to `charset`
    (strictly, so an unencodable character raises UnicodeError).  `charset`
    may be a string or any object whose str() is the charset name.
    """
    del msg['content-type']
    del msg['content-transfer-encoding']
    # Normalize the charset to a plain string *before* using it as a codec
    # name; encode() does not accept e.g. a Charset instance.
    if not isinstance(charset, str):
        charset = str(charset)
    if isinstance(text, unicode):
        text = text.encode(charset)
    msg.set_payload(text, charset)
def process(mlist, msg, msgdata=None):
    """Scrub `msg` in place so that it is suitable for archiving.

    Every part that is not plain text is saved with save_attachment() and
    replaced by a short text notice containing the URL of the saved file.
    Afterwards a multipart message is flattened into a single text payload
    (unless config.ARCHIVE_HTML_SANITIZER is 2).

    text/html parts are treated according to config.ARCHIVE_HTML_SANITIZER
    when that is an integer:

    0 -- discard the message if the HTML is the outermost part, otherwise
         replace the part by a notice without saving it;
    2 -- leave the part alone;
    3 -- save the HTML unescaped and replace the part by a notice;
    any other integer -- HTML-escape the part, save it, and replace it by a
         notice.

    When the setting is not an integer, HTML parts are handled like any
    other attachment, i.e. save_attachment() filters them.

    Returns `msg`, or None when `msgdata` is non-empty and the list has
    scrub_nondigest switched off (nothing is scrubbed then).  Raises
    DiscardMessage as described above.

    NOTE(review): the _() calls below contain $name placeholders but are
    given no substitution arguments, so presumably _() picks the values up
    from the local variables of this function.  Do not rename locals such as
    filename, url, who, subject, date, size, ctype, desc or partctype
    without checking mailman.i18n.
    """
    sanitize = config.ARCHIVE_HTML_SANITIZER
    # True only while the first walked part (the message itself) is handled.
    outer = True
    if msgdata is None:
        msgdata = {}
    if msgdata:
        # msgdata is available if it is in GLOBAL_PIPELINE
        # ie. not in digest or archiver
        # check if the list owner want to scrub regular delivery
        if not mlist.scrub_nondigest:
            return
    dir = calculate_attachments_dir(mlist, msg, msgdata)
    charset = format = delsp = None
    lcset = Utils.GetCharSet(mlist.preferred_language)
    lcset_out = Charset(lcset).output_charset or lcset
    # Now walk over all subparts of this message and scrub out various types
    for part in msg.walk():
        ctype = part.get_content_type()
        # If the part is text/plain, we leave it alone
        if ctype == 'text/plain':
            # We need to choose a charset for the scrubbed message, so we'll
            # arbitrarily pick the charset of the first text/plain part in the
            # message.
            #
            # Also get the RFC 3676 stuff from this part.  This seems to
            # work okay for scrub_nondigest.  It will also work as far as
            # scrubbing messages for the archive is concerned, but Pipermail
            # doesn't pay any attention to the RFC 3676 parameters.  The plain
            # format digest is going to be a disaster in any case as some of
            # messages will be format="flowed" and some not.  ToDigest creates
            # its own Content-Type: header for the plain digest which won't
            # have RFC 3676 parameters.  If the message Content-Type: headers
            # are retained for display in the digest, the parameters will be
            # there for information, but not for the MUA.  This is the best we
            # can do without having get_payload() process the parameters.
            if charset is None:
                charset = part.get_content_charset(lcset)
                format = part.get_param('format')
                delsp = part.get_param('delsp')
            # TK: if part is attached then check charset and scrub if none
            if part.get('content-disposition') and \
                   not part.get_content_charset():
                url = save_attachment(mlist, part, dir)
                filename = part.get_filename(_('not available'))
                filename = Utils.oneline(filename, lcset)
                replace_payload_by_text(part, _("""\
An embedded and charset-unspecified text was scrubbed...
Name: $filename
URL: $url
"""), lcset)
        elif ctype == 'text/html' and isinstance(sanitize, int):
            if sanitize == 0:
                if outer:
                    raise DiscardMessage
                replace_payload_by_text(part,
                                 _('HTML attachment scrubbed and removed'),
                                 # Adding charset arg and removing content-type
                                 # sets content-type to text/plain
                                 lcset)
            elif sanitize == 2:
                # By leaving it alone, Pipermail will automatically escape it
                pass
            elif sanitize == 3:
                # Pull it out as an attachment but leave it unescaped.  This
                # is dangerous, but perhaps useful for heavily moderated
                # lists.
                url = save_attachment(mlist, part, dir, filter_html=False)
                replace_payload_by_text(part, _("""\
An HTML attachment was scrubbed...
URL: $url
"""), lcset)
            else:
                # HTML-escape it and store it as an attachment, but make it
                # look a /little/ bit prettier. :(
                payload = Utils.websafe(part.get_payload(decode=True))
                # For whitespace in the margin, change spaces into
                # non-breaking spaces, and tabs into 8 of those.  Then use a
                # mono-space font.  Still looks hideous to me, but then I'd
                # just as soon discard them.
                def doreplace(s):
                    # Each entity needs its terminating semicolon, for tabs
                    # just as for spaces.
                    return s.replace(' ', '&nbsp;').replace('\t', '&nbsp;'*8)
                lines = [doreplace(s) for s in payload.split('\n')]
                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
                part.set_payload(payload)
                # We're replacing the payload with the decoded payload so this
                # will just get in the way.
                del part['content-transfer-encoding']
                url = save_attachment(mlist, part, dir, filter_html=False)
                replace_payload_by_text(part, _("""\
An HTML attachment was scrubbed...
URL: $url
"""), lcset)
        elif ctype == 'message/rfc822':
            # This part contains a submessage, so it too needs scrubbing
            submsg = part.get_payload(0)
            url = save_attachment(mlist, part, dir)
            subject = submsg.get('subject', _('no subject'))
            date = submsg.get('date', _('no date'))
            who = submsg.get('from', _('unknown sender'))
            size = len(str(submsg))
            replace_payload_by_text(part, _("""\
An embedded message was scrubbed...
From: $who
Subject: $subject
Date: $date
Size: $size
URL: $url
"""), lcset)
        # If the message isn't a multipart, then we'll strip it out as an
        # attachment that would have to be separately downloaded.  Pipermail
        # will transform the url into a hyperlink.
        elif part._payload and not part.is_multipart():
            payload = part.get_payload(decode=True)
            ctype = part.get_content_type()
            # XXX Under email 2.5, it is possible that payload will be None.
            # This can happen when you have a Content-Type: multipart/* with
            # only one part and that part has two blank lines between the
            # first boundary and the end boundary.  In email 3.0 you end up
            # with a string in the payload.  I think in this case it's safe to
            # ignore the part.
            if payload is None:
                continue
            size = len(payload)
            url = save_attachment(mlist, part, dir)
            desc = part.get('content-description', _('not available'))
            desc = Utils.oneline(desc, lcset)
            filename = part.get_filename(_('not available'))
            filename = Utils.oneline(filename, lcset)
            replace_payload_by_text(part, _("""\
A non-text attachment was scrubbed...
Name: $filename
Type: $ctype
Size: $size bytes
Desc: $desc
URL: $url
"""), lcset)
        outer = False
    # We still have to sanitize multipart messages to flat text because
    # Pipermail can't handle messages with list payloads.  This is a kludge;
    # def (n) clever hack ;).
    if msg.is_multipart() and sanitize != 2:
        # By default we take the charset of the first text/plain part in the
        # message, but if there was none, we'll use the list's preferred
        # language's charset.
        if not charset or charset == 'us-ascii':
            charset = lcset_out
        else:
            # normalize to the output charset if input/output are different
            charset = Charset(charset).output_charset or charset
        # We now want to concatenate all the parts which have been scrubbed to
        # text/plain, into a single text/plain payload.  We need to make sure
        # all the characters in the concatenated string are in the same
        # encoding, so we'll use the 'replace' key in the coercion call.
        # BAW: Martin's original patch suggested we might want to try
        # generalizing to utf-8, and that's probably a good idea (eventually).
        text = []
        charsets = []
        for part in msg.walk():
            # TK: bug-id 1099138 and multipart
            # MAS test payload - if part may fail if there are no headers.
            if not part._payload or part.is_multipart():
                continue
            # All parts should be scrubbed to text/plain by now.
            partctype = part.get_content_type()
            if partctype != 'text/plain':
                text.append(_('Skipped content of type $partctype\n'))
                continue
            try:
                t = part.get_payload(decode=True) or ''
            # MAS: TypeError exception can occur if payload is None.  This
            # was observed with a message that contained an attached
            # message/delivery-status part.  Because of the special parsing
            # of this type, this resulted in a text/plain sub-part with a
            # null body.  See bug 1430236.
            except (binascii.Error, TypeError):
                t = part.get_payload() or ''
            # Email problem was solved by Mark Sapiro. (TK)
            partcharset = part.get_content_charset('us-ascii')
            try:
                t = unicode(t, partcharset, 'replace')
            except (UnicodeError, LookupError, ValueError, TypeError,
                    AssertionError):
                # We can get here if partcharset is bogus in some way.
                # Replace funny characters.  We use errors='replace'.
                t = unicode(t, 'ascii', 'replace')
            # Separation is useful
            if isinstance(t, basestring):
                if not t.endswith('\n'):
                    t += '\n'
            text.append(t)
            if partcharset not in charsets:
                charsets.append(partcharset)
        # Now join the text and set the payload
        sep = _('-------------- next part --------------\n')
        assert isinstance(sep, unicode), (
            'Expected a unicode separator, got %s' % type(sep))
        rept = sep.join(text)
        # Replace entire message with text and scrubbed notice.
        # Try with message charsets and utf-8
        if 'utf-8' not in charsets:
            charsets.append('utf-8')
        for charset in charsets:
            try:
                replace_payload_by_text(msg, rept, charset)
                break
            # Bogus charset can throw several exceptions
            except (UnicodeError, LookupError, ValueError, TypeError,
                    AssertionError):
                pass
        if format:
            msg.set_param('format', format)
        if delsp:
            msg.set_param('delsp', delsp)
    return msg
def makedirs(dir):
    """Create `dir` and any missing parents with mode 02775.

    If `dir` already exists nothing is done (its permissions are left
    alone).  Any other error from creating the directories is propagated to
    the caller instead of being silently dropped, since the attachment could
    not be stored anyway.
    """
    try:
        os.makedirs(dir, 0o2775)
    except OSError as e:
        if e.errno == errno.EEXIST:
            return
        raise
    # Some systems such as FreeBSD ignore mkdir's mode, so walk the just
    # created directories and try to set the mode, ignoring any OSErrors that
    # occur here.
    for dirpath, dirnames, filenames in os.walk(dir):
        try:
            os.chmod(dirpath, 0o2775)
        except OSError:
            pass
def save_attachment(mlist, msg, dir, filter_html=True):
    """Store the payload of the part `msg` as a file in the list's archive.

    The file is written below
    config.PRIVATE_ARCHIVE_FILE_DIR/<fqdn_listname>/<dir>, which is created
    if necessary.  Its name is derived from the part's (sanitized) filename
    or is 'attachment', with a counter appended to make it unique.

    When `filter_html` is true and the part is text/html, the payload is
    first run through the external program configured as
    config.ARCHIVE_HTML_SANITIZER and the result is stored as a .txt file.
    A message/rfc822 part is stored as the HTML-escaped text of the
    contained message.

    Returns the URL of the stored file, enclosed in angle brackets.
    """
    fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR,
                         mlist.fqdn_listname, dir)
    makedirs(fsdir)
    # Figure out the attachment type and get the decoded data
    decodedpayload = msg.get_payload(decode=True)
    # BAW: mimetypes ought to handle non-standard, but commonly found types,
    # e.g. image/jpg (should be image/jpeg).  For now we just store such
    # things as application/octet-streams since that seems the safest.
    ctype = msg.get_content_type()
    # i18n file name is encoded
    lcset = Utils.GetCharSet(mlist.preferred_language)
    filename = Utils.oneline(msg.get_filename(''), lcset)
    filename, fnext = os.path.splitext(filename)
    # For safety, we should confirm this is valid ext for content-type
    # but we can use fnext if we introduce fnext filtering
    if config.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
        # HTML message doesn't have filename :-(
        ext = fnext or guess_extension(ctype, fnext)
    else:
        ext = guess_extension(ctype, fnext)
    if not ext:
        # We don't know what it is, so assume it's just a shapeless
        # application/octet-stream, unless the Content-Type: is
        # message/rfc822, in which case we know we'll coerce the type to
        # text/plain below.
        if ctype == 'message/rfc822':
            ext = '.txt'
        else:
            ext = '.bin'
    # Allow only alphanumerics, dash, underscore, and dot
    ext = sre.sub('', ext)
    path = None
    # We need a lock to calculate the next attachment number
    with Lock(os.path.join(fsdir, 'attachments.lock')):
        # Now base the filename on what's in the attachment, uniquifying it if
        # necessary.
        if not filename or config.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
            filebase = 'attachment'
        else:
            # Sanitize the filename given in the message headers
            parts = pre.split(filename)
            filename = parts[-1]
            # Strip off leading dots
            filename = dre.sub('', filename)
            # Allow only alphanumerics, dash, underscore, and dot
            filename = sre.sub('', filename)
            # If the filename's extension doesn't match the type we guessed,
            # which one should we go with?  For now, let's go with the one we
            # guessed so attachments can't lie about their type.  Also, if the
            # filename /has/ no extension, then tack on the one we guessed.
            # The extension was removed from the name above.
            filebase = filename
        # Now we're looking for a unique name for this file on the file
        # system.  If msgdir/filebase.ext isn't unique, we'll add a counter
        # after filebase, e.g. msgdir/filebase-cnt.ext
        counter = 0
        extra = ''
        while True:
            path = os.path.join(fsdir, filebase + extra + ext)
            # Generally it is not a good idea to test for file existence
            # before just trying to create it, but the alternatives aren't
            # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
            # NFS-safe).  Besides, we have an exclusive lock now, so we're
            # guaranteed that no other process will be racing with us.
            if os.path.exists(path):
                counter += 1
                extra = '-%04d' % counter
            else:
                break
    # `path' now contains the unique filename for the attachment.  There's
    # just one more step we need to do.  If the part is text/html and
    # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
    # here), then send the attachment through the filter program for
    # sanitization
    if filter_html and ctype == 'text/html':
        base, ext = os.path.splitext(path)
        tmppath = base + '-tmp' + ext
        fp = open(tmppath, 'w')
        try:
            # Close the temporary file even if writing fails, so the
            # descriptor isn't leaked.
            with fp:
                fp.write(decodedpayload)
            cmd = config.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
            progfp = os.popen(cmd, 'r')
            try:
                decodedpayload = progfp.read()
            finally:
                status = progfp.close()
            if status:
                log.error('HTML sanitizer exited with non-zero status: %s',
                          status)
        finally:
            os.unlink(tmppath)
        # BAW: Since we've now sanitized the document, it should be plain
        # text.  Blarg, we really want the sanitizer to tell us what the type
        # if the return data is. :(
        # NOTE(review): uniqueness was checked above for the original
        # extension, not for '.txt' -- confirm an existing .txt file cannot
        # be overwritten here.
        ext = '.txt'
        path = base + '.txt'
    # Is it a message/rfc822 attachment?
    elif ctype == 'message/rfc822':
        submsg = msg.get_payload()
        # BAW: I'm sure we can eventually do better than this. :(
        decodedpayload = Utils.websafe(str(submsg))
    with open(path, 'w') as fp:
        fp.write(decodedpayload)
    # Now calculate the url to the list's archive.
    baseurl = get_plugin('mailman.scrubber').list_url(mlist)
    if not baseurl.endswith('/'):
        baseurl += '/'
    # Trailing space will definitely be a problem with format=flowed.
    # Bracket the URL instead.
    url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
    return url
class Scrubber:
    """Pipeline handler that cleanses a message for archiving.

    Declares that it implements `IHandler` and forwards the work to the
    module-level `process()` function.
    """

    implements(IHandler)

    # Handler name and translatable description.
    name = 'scrubber'
    description = _('Cleanse a message for archiving.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`.

        The message is scrubbed in place; the return value of the
        module-level `process()` is not passed on.
        """
        process(mlist, msg, msgdata)