mailman/pipeline/scrubber.py

   1 # Copyright (C) 2001-2008 by the Free Software Foundation, Inc.
   2 #
   3 # This file is part of GNU Mailman.
   4 #
   5 # GNU Mailman is free software: you can redistribute it and/or modify it under
   6 # the terms of the GNU General Public License as published by the Free
   7 # Software Foundation, either version 3 of the License, or (at your option)
   8 # any later version.
   9 #
  10 # GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
  11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13 # more details.
  14 #
  15 # You should have received a copy of the GNU General Public License along with
  16 # GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.
  17
  18 """Cleanse a message for archiving."""
  19
  20 from __future__ import with_statement
  21
  22 __metaclass__ = type
  23 __all__ = ['Scrubber']
  24
  25
  26 import os
  27 import re
  28 import time
  29 import errno
  30 import hashlib
  31 import logging
  32 import binascii
  33
  34 from email.charset import Charset
  35 from email.generator import Generator
  36 from email.utils import make_msgid, parsedate
  37 from locknix.lockfile import Lock
  38 from mimetypes import guess_all_extensions
  39 from zope.interface import implements
  40
  41 from mailman import Utils
  42 from mailman.configuration import config
  43 from mailman.core.errors import DiscardMessage
  44 from mailman.core.plugins import get_plugin
  45 from mailman.i18n import _
  46 from mailman.interfaces import IHandler
  47
  48
  49 # Path characters for common platforms
  50 pre = re.compile(r'[/\\:]')
  51 # All other characters to strip out of Content-Disposition: filenames
  52 # (essentially anything that isn't an alphanum, dot, dash, or underscore).
  53 sre = re.compile(r'[^-\w.]')
  54 # Regexp to strip out leading dots
  55 dre = re.compile(r'^\.*')
  56
  57 BR = '<br>\n'
  58 SPACE = ' '
  59
  60 log = logging.getLogger('mailman.error')
  61
  62
  63 \f
  64 def guess_extension(ctype, ext):
  65     # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
  66     # and .wiz are all mapped to application/msword.  This sucks for finding
  67     # the best reverse mapping.  If the extension is one of the giving
  68     # mappings, we'll trust that, otherwise we'll just guess. :/
  69     all = guess_all_extensions(ctype, strict=False)
  70     if ext in all:
  71         return ext
  72     return all and all[0]
  73
  74
  75 \f
  76 # We're using a subclass of the standard Generator because we want to suppress
  77 # headers in the subparts of multiparts.  We use a hack -- the ctor argument
  78 # skipheaders to accomplish this.  It's set to true for the outer Message
  79 # object, but false for all internal objects.  We recognize that
  80 # sub-Generators will get created passing only mangle_from_ and maxheaderlen
  81 # to the ctors.
  82 #
  83 # This isn't perfect because we still get stuff like the multipart boundaries,
  84 # but see below for how we corrupt that to our nefarious goals.
  85 class ScrubberGenerator(Generator):
  86     def __init__(self, outfp, mangle_from_=True,
  87                  maxheaderlen=78, skipheaders=True):
  88         Generator.__init__(self, outfp, mangle_from_=False)
  89         self.__skipheaders = skipheaders
  90
  91     def _write_headers(self, msg):
  92         if not self.__skipheaders:
  93             Generator._write_headers(self, msg)
  94
  95 \f
  96 def safe_strftime(fmt, t):
  97     try:
  98         return time.strftime(fmt, t)
  99     except (TypeError, ValueError, OverflowError):
 100         return None
 101
 102
 103 def calculate_attachments_dir(mlist, msg, msgdata):
 104     # Calculate the directory that attachments for this message will go
 105     # under.  To avoid inode limitations, the scheme will be:
 106     # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
 107     # Start by calculating the date-based and msgid-hash components.
 108     fmt = '%Y%m%d'
 109     datestr = msg.get('Date')
 110     if datestr:
 111         now = parsedate(datestr)
 112     else:
 113         now = time.gmtime(msgdata.get('received_time', time.time()))
 114     datedir = safe_strftime(fmt, now)
 115     if not datedir:
 116         datestr = msgdata.get('X-List-Received-Date')
 117         if datestr:
 118             datedir = safe_strftime(fmt, datestr)
 119     if not datedir:
 120         # What next?  Unixfrom, I guess.
 121         parts = msg.get_unixfrom().split()
 122         try:
 123             month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
 124                      'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
 125                      }.get(parts[3], 0)
 126             day = int(parts[4])
 127             year = int(parts[6])
 128         except (IndexError, ValueError):
 129             # Best we can do I think
 130             month = day = year = 0
 131         datedir = '%04d%02d%02d' % (year, month, day)
 132     assert datedir
 133     # As for the msgid hash, we'll base this part on the Message-ID: so that
 134     # all attachments for the same message end up in the same directory (we'll
 135     # uniquify the filenames in that directory as needed).  We use the first 2
 136     # and last 2 bytes of the SHA1 hash of the message id as the basis of the
 137     # directory name.  Clashes here don't really matter too much, and that
 138     # still gives us a 32-bit space to work with.
 139     msgid = msg['message-id']
 140     if msgid is None:
 141         msgid = msg['Message-ID'] = make_msgid()
 142     # We assume that the message id actually /is/ unique!
 143     digest = hashlib.sha1(msgid).hexdigest()
 144     return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
 145
 146
 147 def replace_payload_by_text(msg, text, charset):
 148     # TK: This is a common function in replacing the attachment and the main
 149     # message by a text (scrubbing).
 150     del msg['content-type']
 151     del msg['content-transfer-encoding']
 152     if isinstance(text, unicode):
 153         text = text.encode(charset)
 154     if not isinstance(charset, str):
 155         charset = str(charset)
 156     msg.set_payload(text, charset)
 157
 158
 159 \f
 160 def process(mlist, msg, msgdata=None):
 161     sanitize = config.ARCHIVE_HTML_SANITIZER
 162     outer = True
 163     if msgdata is None:
 164         msgdata = {}
 165     if msgdata:
 166         # msgdata is available if it is in GLOBAL_PIPELINE
 167         # ie. not in digest or archiver
 168         # check if the list owner want to scrub regular delivery
 169         if not mlist.scrub_nondigest:
 170             return
 171     dir = calculate_attachments_dir(mlist, msg, msgdata)
 172     charset = format = delsp = None
 173     lcset = Utils.GetCharSet(mlist.preferred_language)
 174     lcset_out = Charset(lcset).output_charset or lcset
 175     # Now walk over all subparts of this message and scrub out various types
 176     for part in msg.walk():
 177         ctype = part.get_content_type()
 178         # If the part is text/plain, we leave it alone
 179         if ctype == 'text/plain':
 180             # We need to choose a charset for the scrubbed message, so we'll
 181             # arbitrarily pick the charset of the first text/plain part in the
 182             # message.
 183             #
 184             # Also get the RFC 3676 stuff from this part. This seems to
 185             # work okay for scrub_nondigest.  It will also work as far as
 186             # scrubbing messages for the archive is concerned, but Pipermail
 187             # doesn't pay any attention to the RFC 3676 parameters.  The plain
 188             # format digest is going to be a disaster in any case as some of
 189             # messages will be format="flowed" and some not.  ToDigest creates
 190             # its own Content-Type: header for the plain digest which won't
 191             # have RFC 3676 parameters. If the message Content-Type: headers
 192             # are retained for display in the digest, the parameters will be
 193             # there for information, but not for the MUA. This is the best we
 194             # can do without having get_payload() process the parameters.
 195             if charset is None:
 196                 charset = part.get_content_charset(lcset)
 197                 format = part.get_param('format')
 198                 delsp = part.get_param('delsp')
 199             # TK: if part is attached then check charset and scrub if none
 200             if part.get('content-disposition') and \
 201                not part.get_content_charset():
 202                 url = save_attachment(mlist, part, dir)
 203                 filename = part.get_filename(_('not available'))
 204                 filename = Utils.oneline(filename, lcset)
 205                 replace_payload_by_text(part, _("""\
 206 An embedded and charset-unspecified text was scrubbed...
 207 Name: $filename
 208 URL: $url
 209 """), lcset)
 210         elif ctype == 'text/html' and isinstance(sanitize, int):
 211             if sanitize == 0:
 212                 if outer:
 213                     raise DiscardMessage
 214                 replace_payload_by_text(part,
 215                                  _('HTML attachment scrubbed and removed'),
 216                                  # Adding charset arg and removing content-type
 217                                  # sets content-type to text/plain
 218                                  lcset)
 219             elif sanitize == 2:
 220                 # By leaving it alone, Pipermail will automatically escape it
 221                 pass
 222             elif sanitize == 3:
 223                 # Pull it out as an attachment but leave it unescaped.  This
 224                 # is dangerous, but perhaps useful for heavily moderated
 225                 # lists.
 226                 url = save_attachment(mlist, part, dir, filter_html=False)
 227                 replace_payload_by_text(part, _("""\
 228 An HTML attachment was scrubbed...
 229 URL: $url
 230 """), lcset)
 231             else:
 232                 # HTML-escape it and store it as an attachment, but make it
 233                 # look a /little/ bit prettier. :(
 234                 payload = Utils.websafe(part.get_payload(decode=True))
 235                 # For whitespace in the margin, change spaces into
 236                 # non-breaking spaces, and tabs into 8 of those.  Then use a
 237                 # mono-space font.  Still looks hideous to me, but then I'd
 238                 # just as soon discard them.
 239                 def doreplace(s):
 240                     return s.replace(' ', '&nbsp;').replace('\t', '&nbsp'*8)
 241                 lines = [doreplace(s) for s in payload.split('\n')]
 242                 payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
 243                 part.set_payload(payload)
 244                 # We're replacing the payload with the decoded payload so this
 245                 # will just get in the way.
 246                 del part['content-transfer-encoding']
 247                 url = save_attachment(mlist, part, dir, filter_html=False)
 248                 replace_payload_by_text(part, _("""\
 249 An HTML attachment was scrubbed...
 250 URL: $url
 251 """), lcset)
 252         elif ctype == 'message/rfc822':
 253             # This part contains a submessage, so it too needs scrubbing
 254             submsg = part.get_payload(0)
 255             url = save_attachment(mlist, part, dir)
 256             subject = submsg.get('subject', _('no subject'))
 257             date = submsg.get('date', _('no date'))
 258             who = submsg.get('from', _('unknown sender'))
 259             size = len(str(submsg))
 260             replace_payload_by_text(part, _("""\
 261 An embedded message was scrubbed...
 262 From: $who
 263 Subject: $subject
 264 Date: $date
 265 Size: $size
 266 URL: $url
 267 """), lcset)
 268         # If the message isn't a multipart, then we'll strip it out as an
 269         # attachment that would have to be separately downloaded.  Pipermail
 270         # will transform the url into a hyperlink.
 271         elif part._payload and not part.is_multipart():
 272             payload = part.get_payload(decode=True)
 273             ctype = part.get_content_type()
 274             # XXX Under email 2.5, it is possible that payload will be None.
 275             # This can happen when you have a Content-Type: multipart/* with
 276             # only one part and that part has two blank lines between the
 277             # first boundary and the end boundary.  In email 3.0 you end up
 278             # with a string in the payload.  I think in this case it's safe to
 279             # ignore the part.
 280             if payload is None:
 281                 continue
 282             size = len(payload)
 283             url = save_attachment(mlist, part, dir)
 284             desc = part.get('content-description', _('not available'))
 285             desc = Utils.oneline(desc, lcset)
 286             filename = part.get_filename(_('not available'))
 287             filename = Utils.oneline(filename, lcset)
 288             replace_payload_by_text(part, _("""\
 289 A non-text attachment was scrubbed...
 290 Name: $filename
 291 Type: $ctype
 292 Size: $size bytes
 293 Desc: $desc
 294 URL: $url
 295 """), lcset)
 296         outer = False
 297     # We still have to sanitize multipart messages to flat text because
 298     # Pipermail can't handle messages with list payloads.  This is a kludge;
 299     # def (n) clever hack ;).
 300     if msg.is_multipart() and sanitize <> 2:
 301         # By default we take the charset of the first text/plain part in the
 302         # message, but if there was none, we'll use the list's preferred
 303         # language's charset.
 304         if not charset or charset == 'us-ascii':
 305             charset = lcset_out
 306         else:
 307             # normalize to the output charset if input/output are different
 308             charset = Charset(charset).output_charset or charset
 309         # We now want to concatenate all the parts which have been scrubbed to
 310         # text/plain, into a single text/plain payload.  We need to make sure
 311         # all the characters in the concatenated string are in the same
 312         # encoding, so we'll use the 'replace' key in the coercion call.
 313         # BAW: Martin's original patch suggested we might want to try
 314         # generalizing to utf-8, and that's probably a good idea (eventually).
 315         text = []
 316         charsets = []
 317         for part in msg.walk():
 318             # TK: bug-id 1099138 and multipart
 319             # MAS test payload - if part may fail if there are no headers.
 320             if not part._payload or part.is_multipart():
 321                 continue
 322             # All parts should be scrubbed to text/plain by now.
 323             partctype = part.get_content_type()
 324             if partctype <> 'text/plain':
 325                 text.append(_('Skipped content of type $partctype\n'))
 326                 continue
 327             try:
 328                 t = part.get_payload(decode=True) or ''
 329             # MAS: TypeError exception can occur if payload is None. This
 330             # was observed with a message that contained an attached
 331             # message/delivery-status part. Because of the special parsing
 332             # of this type, this resulted in a text/plain sub-part with a
 333             # null body. See bug 1430236.
 334             except (binascii.Error, TypeError):
 335                 t = part.get_payload() or ''
 336             # Email problem was solved by Mark Sapiro. (TK)
 337             partcharset = part.get_content_charset('us-ascii')
 338             try:
 339                 t = unicode(t, partcharset, 'replace')
 340             except (UnicodeError, LookupError, ValueError, TypeError,
 341                     AssertionError):
 342                 # We can get here if partcharset is bogus in come way.
 343                 # Replace funny characters.  We use errors='replace'.
 344                 t = unicode(t, 'ascii', 'replace')
 345             # Separation is useful
 346             if isinstance(t, basestring):
 347                 if not t.endswith('\n'):
 348                     t += '\n'
 349                 text.append(t)
 350             if partcharset not in charsets:
 351                 charsets.append(partcharset)
 352         # Now join the text and set the payload
 353         sep = _('-------------- next part --------------\n')
 354         assert isinstance(sep, unicode), (
 355             'Expected a unicode separator, got %s' % type(sep))
 356         rept = sep.join(text)
 357         # Replace entire message with text and scrubbed notice.
 358         # Try with message charsets and utf-8
 359         if 'utf-8' not in charsets:
 360             charsets.append('utf-8')
 361         for charset in charsets:
 362             try:
 363                 replace_payload_by_text(msg, rept, charset)
 364                 break
 365             # Bogus charset can throw several exceptions
 366             except (UnicodeError, LookupError, ValueError, TypeError,
 367                     AssertionError):
 368                 pass
 369         if format:
 370             msg.set_param('format', format)
 371         if delsp:
 372             msg.set_param('delsp', delsp)
 373     return msg
 374
 375
 376 \f
 377 def makedirs(dir):
 378     # Create all the directories to store this attachment in and try to make
 379     # sure that the permissions of the directories are set correctly.
 380     try:
 381         os.makedirs(dir, 02775)
 382     except OSError, e:
 383         if e.errno == errno.EEXIST:
 384             return
 385     # Some systems such as FreeBSD ignore mkdir's mode, so walk the just
 386     # created directories and try to set the mode, ignoring any OSErrors that
 387     # occur here.
 388     for dirpath, dirnames, filenames in os.walk(dir):
 389         try:
 390             os.chmod(dirpath, 02775)
 391         except OSError:
 392             pass
 393
 394
 395 \f
 396 def save_attachment(mlist, msg, dir, filter_html=True):
 397     fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR,
 398                          mlist.fqdn_listname, dir)
 399     makedirs(fsdir)
 400     # Figure out the attachment type and get the decoded data
 401     decodedpayload = msg.get_payload(decode=True)
 402     # BAW: mimetypes ought to handle non-standard, but commonly found types,
 403     # e.g. image/jpg (should be image/jpeg).  For now we just store such
 404     # things as application/octet-streams since that seems the safest.
 405     ctype = msg.get_content_type()
 406     # i18n file name is encoded
 407     lcset = Utils.GetCharSet(mlist.preferred_language)
 408     filename = Utils.oneline(msg.get_filename(''), lcset)
 409     filename, fnext = os.path.splitext(filename)
 410     # For safety, we should confirm this is valid ext for content-type
 411     # but we can use fnext if we introduce fnext filtering
 412     if config.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
 413         # HTML message doesn't have filename :-(
 414         ext = fnext or guess_extension(ctype, fnext)
 415     else:
 416         ext = guess_extension(ctype, fnext)
 417     if not ext:
 418         # We don't know what it is, so assume it's just a shapeless
 419         # application/octet-stream, unless the Content-Type: is
 420         # message/rfc822, in which case we know we'll coerce the type to
 421         # text/plain below.
 422         if ctype == 'message/rfc822':
 423             ext = '.txt'
 424         else:
 425             ext = '.bin'
 426     # Allow only alphanumerics, dash, underscore, and dot
 427     ext = sre.sub('', ext)
 428     path = None
 429     # We need a lock to calculate the next attachment number
 430     with Lock(os.path.join(fsdir, 'attachments.lock')):
 431         # Now base the filename on what's in the attachment, uniquifying it if
 432         # necessary.
 433         if not filename or config.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
 434             filebase = 'attachment'
 435         else:
 436             # Sanitize the filename given in the message headers
 437             parts = pre.split(filename)
 438             filename = parts[-1]
 439             # Strip off leading dots
 440             filename = dre.sub('', filename)
 441             # Allow only alphanumerics, dash, underscore, and dot
 442             filename = sre.sub('', filename)
 443             # If the filename's extension doesn't match the type we guessed,
 444             # which one should we go with?  For now, let's go with the one we
 445             # guessed so attachments can't lie about their type.  Also, if the
 446             # filename /has/ no extension, then tack on the one we guessed.
 447             # The extension was removed from the name above.
 448             filebase = filename
 449         # Now we're looking for a unique name for this file on the file
 450         # system.  If msgdir/filebase.ext isn't unique, we'll add a counter
 451         # after filebase, e.g. msgdir/filebase-cnt.ext
 452         counter = 0
 453         extra = ''
 454         while True:
 455             path = os.path.join(fsdir, filebase + extra + ext)
 456             # Generally it is not a good idea to test for file existance
 457             # before just trying to create it, but the alternatives aren't
 458             # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
 459             # NFS-safe).  Besides, we have an exclusive lock now, so we're
 460             # guaranteed that no other process will be racing with us.
 461             if os.path.exists(path):
 462                 counter += 1
 463                 extra = '-%04d' % counter
 464             else:
 465                 break
 466     # `path' now contains the unique filename for the attachment.  There's
 467     # just one more step we need to do.  If the part is text/html and
 468     # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
 469     # here), then send the attachment through the filter program for
 470     # sanitization
 471     if filter_html and ctype == 'text/html':
 472         base, ext = os.path.splitext(path)
 473         tmppath = base + '-tmp' + ext
 474         fp = open(tmppath, 'w')
 475         try:
 476             fp.write(decodedpayload)
 477             fp.close()
 478             cmd = config.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
 479             progfp = os.popen(cmd, 'r')
 480             decodedpayload = progfp.read()
 481             status = progfp.close()
 482             if status:
 483                 log.error('HTML sanitizer exited with non-zero status: %s',
 484                           status)
 485         finally:
 486             os.unlink(tmppath)
 487         # BAW: Since we've now sanitized the document, it should be plain
 488         # text.  Blarg, we really want the sanitizer to tell us what the type
 489         # if the return data is. :(
 490         ext = '.txt'
 491         path = base + '.txt'
 492     # Is it a message/rfc822 attachment?
 493     elif ctype == 'message/rfc822':
 494         submsg = msg.get_payload()
 495         # BAW: I'm sure we can eventually do better than this. :(
 496         decodedpayload = Utils.websafe(str(submsg))
 497     fp = open(path, 'w')
 498     fp.write(decodedpayload)
 499     fp.close()
 500     # Now calculate the url to the list's archive.
 501     baseurl = get_plugin('mailman.scrubber').list_url(mlist)
 502     if not baseurl.endswith('/'):
 503         baseurl += '/'
 504     # Trailing space will definitely be a problem with format=flowed.
 505     # Bracket the URL instead.
 506     url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
 507     return url
 508
 509
 510 \f
class Scrubber:
    """Cleanse a message for archiving.

    Pipeline handler that declares the IHandler interface and delegates the
    actual work to the module-level process() function.
    """

    implements(IHandler)

    # Handler name and its translatable, human-readable description.
    name = 'scrubber'
    description = _('Cleanse a message for archiving.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`."""
        # Inside the method body the bare name `process` refers to the
        # module-level function, not to this method.  Its return value is
        # discarded.
        process(mlist, msg, msgdata)