Mailman/Handlers/Scrubber.py

   1 # Copyright (C) 2001-2007 by the Free Software Foundation, Inc.
   2 #
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  16 # USA.
  17
  18 """Cleanse a message for archiving."""
  19
  20 from __future__ import with_statement
  21
  22 import os
  23 import re
  24 import sha
  25 import time
  26 import errno
  27 import logging
  28 import binascii
  29 import tempfile
  30
  31 from cStringIO import StringIO
  32 from mimetypes import guess_all_extensions
  33
  34 from email.charset import Charset
  35 from email.generator import Generator
  36 from email.parser import HeaderParser
  37 from email.utils import make_msgid, parsedate
  38
  39 from Mailman import Message
  40 from Mailman import Utils
  41 from Mailman.Errors import DiscardMessage
  42 from Mailman.app.archiving import get_base_archive_url
  43 from Mailman.configuration import config
  44 from Mailman.i18n import _
  45 from Mailman.lockfile import LockFile
  46
# Path separator characters for common platforms (slash, backslash, colon).
# Used to split a supplied filename so that only its last component is kept.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanumeric, underscore, dash, or dot).
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
dre = re.compile(r'^\.*')

# HTML line break used to join the lines of an HTML-escaped attachment.
BR = '<br>\n'
SPACE = ' '

# Problems (e.g. a failing HTML sanitizer program) go to Mailman's error log.
log = logging.getLogger('mailman.error')
  59
  60
  61 \f
  62 def guess_extension(ctype, ext):
  63     # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
  64     # and .wiz are all mapped to application/msword.  This sucks for finding
  65     # the best reverse mapping.  If the extension is one of the giving
  66     # mappings, we'll trust that, otherwise we'll just guess. :/
  67     all = guess_all_extensions(ctype, strict=False)
  68     if ext in all:
  69         return ext
  70     return all and all[0]
  71
  72
  73 \f
  74 # We're using a subclass of the standard Generator because we want to suppress
  75 # headers in the subparts of multiparts.  We use a hack -- the ctor argument
  76 # skipheaders to accomplish this.  It's set to true for the outer Message
  77 # object, but false for all internal objects.  We recognize that
  78 # sub-Generators will get created passing only mangle_from_ and maxheaderlen
  79 # to the ctors.
  80 #
  81 # This isn't perfect because we still get stuff like the multipart boundaries,
  82 # but see below for how we corrupt that to our nefarious goals.
  83 class ScrubberGenerator(Generator):
  84     def __init__(self, outfp, mangle_from_=True,
  85                  maxheaderlen=78, skipheaders=True):
  86         Generator.__init__(self, outfp, mangle_from_=False)
  87         self.__skipheaders = skipheaders
  88
  89     def _write_headers(self, msg):
  90         if not self.__skipheaders:
  91             Generator._write_headers(self, msg)
  92
  93 \f
  94 def safe_strftime(fmt, t):
  95     try:
  96         return time.strftime(fmt, t)
  97     except (TypeError, ValueError, OverflowError):
  98         return None
  99
 100
 101 def calculate_attachments_dir(mlist, msg, msgdata):
 102     # Calculate the directory that attachments for this message will go
 103     # under.  To avoid inode limitations, the scheme will be:
 104     # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
 105     # Start by calculating the date-based and msgid-hash components.
 106     fmt = '%Y%m%d'
 107     datestr = msg.get('Date')
 108     if datestr:
 109         now = parsedate(datestr)
 110     else:
 111         now = time.gmtime(msgdata.get('received_time', time.time()))
 112     datedir = safe_strftime(fmt, now)
 113     if not datedir:
 114         datestr = msgdata.get('X-List-Received-Date')
 115         if datestr:
 116             datedir = safe_strftime(fmt, datestr)
 117     if not datedir:
 118         # What next?  Unixfrom, I guess.
 119         parts = msg.get_unixfrom().split()
 120         try:
 121             month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
 122                      'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
 123                      }.get(parts[3], 0)
 124             day = int(parts[4])
 125             year = int(parts[6])
 126         except (IndexError, ValueError):
 127             # Best we can do I think
 128             month = day = year = 0
 129         datedir = '%04d%02d%02d' % (year, month, day)
 130     assert datedir
 131     # As for the msgid hash, we'll base this part on the Message-ID: so that
 132     # all attachments for the same message end up in the same directory (we'll
 133     # uniquify the filenames in that directory as needed).  We use the first 2
 134     # and last 2 bytes of the SHA1 hash of the message id as the basis of the
 135     # directory name.  Clashes here don't really matter too much, and that
 136     # still gives us a 32-bit space to work with.
 137     msgid = msg['message-id']
 138     if msgid is None:
 139         msgid = msg['Message-ID'] = make_msgid()
 140     # We assume that the message id actually /is/ unique!
 141     digest = sha.new(msgid).hexdigest()
 142     return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
 143
 144
 145 def replace_payload_by_text(msg, text, charset):
 146     # TK: This is a common function in replacing the attachment and the main
 147     # message by a text (scrubbing).
 148     del msg['content-type']
 149     del msg['content-transfer-encoding']
 150     if isinstance(text, unicode):
 151         text = text.encode(charset)
 152     if not isinstance(charset, str):
 153         charset = str(charset)
 154     msg.set_payload(text, charset)
 155
 156
 157 \f
 158 def process(mlist, msg, msgdata=None):
 159     sanitize = config.ARCHIVE_HTML_SANITIZER
 160     outer = True
 161     if msgdata is None:
 162         msgdata = {}
 163     if msgdata:
 164         # msgdata is available if it is in GLOBAL_PIPELINE
 165         # ie. not in digest or archiver
 166         # check if the list owner want to scrub regular delivery
 167         if not mlist.scrub_nondigest:
 168             return
 169     dir = calculate_attachments_dir(mlist, msg, msgdata)
 170     charset = format = delsp = None
 171     lcset = Utils.GetCharSet(mlist.preferred_language)
 172     lcset_out = Charset(lcset).output_charset or lcset
 173     # Now walk over all subparts of this message and scrub out various types
 174     for part in msg.walk():
 175         ctype = part.get_content_type()
 176         # If the part is text/plain, we leave it alone
 177         if ctype == 'text/plain':
 178             # We need to choose a charset for the scrubbed message, so we'll
 179             # arbitrarily pick the charset of the first text/plain part in the
 180             # message.
 181             #
 182             # Also get the RFC 3676 stuff from this part. This seems to
 183             # work okay for scrub_nondigest.  It will also work as far as
 184             # scrubbing messages for the archive is concerned, but Pipermail
 185             # doesn't pay any attention to the RFC 3676 parameters.  The plain
 186             # format digest is going to be a disaster in any case as some of
 187             # messages will be format="flowed" and some not.  ToDigest creates
 188             # its own Content-Type: header for the plain digest which won't
 189             # have RFC 3676 parameters. If the message Content-Type: headers
 190             # are retained for display in the digest, the parameters will be
 191             # there for information, but not for the MUA. This is the best we
 192             # can do without having get_payload() process the parameters.
 193             if charset is None:
 194                 charset = part.get_content_charset(lcset)
 195                 format = part.get_param('format')
 196                 delsp = part.get_param('delsp')
 197             # TK: if part is attached then check charset and scrub if none
 198             if part.get('content-disposition') and \
 199                not part.get_content_charset():
 200                 url = save_attachment(mlist, part, dir)
 201                 filename = part.get_filename(_('not available'))
 202                 filename = Utils.oneline(filename, lcset)
 203                 replace_payload_by_text(part, _("""\
 204 An embedded and charset-unspecified text was scrubbed...
 205 Name: %(filename)s
 206 URL: %(url)s
 207 """), lcset)
 208         elif ctype == 'text/html' and isinstance(sanitize, int):
 209             if sanitize == 0:
 210                 if outer:
 211                     raise DiscardMessage
 212                 replace_payload_by_text(part,
 213                                  _('HTML attachment scrubbed and removed'),
 214                                  # Adding charset arg and removing content-type
 215                                  # sets content-type to text/plain
 216                                  lcset)
 217             elif sanitize == 2:
 218                 # By leaving it alone, Pipermail will automatically escape it
 219                 pass
 220             elif sanitize == 3:
 221                 # Pull it out as an attachment but leave it unescaped.  This
 222                 # is dangerous, but perhaps useful for heavily moderated
 223                 # lists.
 224                 url = save_attachment(mlist, part, dir, filter_html=False)
 225                 replace_payload_by_text(part, _("""\
 226 An HTML attachment was scrubbed...
 227 URL: %(url)s
 228 """), lcset)
 229             else:
 230                 # HTML-escape it and store it as an attachment, but make it
 231                 # look a /little/ bit prettier. :(
 232                 payload = Utils.websafe(part.get_payload(decode=True))
 233                 # For whitespace in the margin, change spaces into
 234                 # non-breaking spaces, and tabs into 8 of those.  Then use a
 235                 # mono-space font.  Still looks hideous to me, but then I'd
 236                 # just as soon discard them.
 237                 def doreplace(s):
 238                     return s.replace(' ', '&nbsp;').replace('\t', '&nbsp'*8)
 239                 lines = [doreplace(s) for s in payload.split('\n')]
 240                 payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
 241                 part.set_payload(payload)
 242                 # We're replacing the payload with the decoded payload so this
 243                 # will just get in the way.
 244                 del part['content-transfer-encoding']
 245                 url = save_attachment(mlist, part, dir, filter_html=False)
 246                 replace_payload_by_text(part, _("""\
 247 An HTML attachment was scrubbed...
 248 URL: %(url)s
 249 """), lcset)
 250         elif ctype == 'message/rfc822':
 251             # This part contains a submessage, so it too needs scrubbing
 252             submsg = part.get_payload(0)
 253             url = save_attachment(mlist, part, dir)
 254             subject = submsg.get('subject', _('no subject'))
 255             date = submsg.get('date', _('no date'))
 256             who = submsg.get('from', _('unknown sender'))
 257             size = len(str(submsg))
 258             replace_payload_by_text(part, _("""\
 259 An embedded message was scrubbed...
 260 From: %(who)s
 261 Subject: %(subject)s
 262 Date: %(date)s
 263 Size: %(size)s
 264 URL: %(url)s
 265 """), lcset)
 266         # If the message isn't a multipart, then we'll strip it out as an
 267         # attachment that would have to be separately downloaded.  Pipermail
 268         # will transform the url into a hyperlink.
 269         elif part and not part.is_multipart():
 270             payload = part.get_payload(decode=True)
 271             ctype = part.get_content_type()
 272             # XXX Under email 2.5, it is possible that payload will be None.
 273             # This can happen when you have a Content-Type: multipart/* with
 274             # only one part and that part has two blank lines between the
 275             # first boundary and the end boundary.  In email 3.0 you end up
 276             # with a string in the payload.  I think in this case it's safe to
 277             # ignore the part.
 278             if payload is None:
 279                 continue
 280             size = len(payload)
 281             url = save_attachment(mlist, part, dir)
 282             desc = part.get('content-description', _('not available'))
 283             desc = Utils.oneline(desc, lcset)
 284             filename = part.get_filename(_('not available'))
 285             filename = Utils.oneline(filename, lcset)
 286             replace_payload_by_text(part, _("""\
 287 A non-text attachment was scrubbed...
 288 Name: %(filename)s
 289 Type: %(ctype)s
 290 Size: %(size)d bytes
 291 Desc: %(desc)s
 292 URL: %(url)s
 293 """), lcset)
 294         outer = False
 295     # We still have to sanitize multipart messages to flat text because
 296     # Pipermail can't handle messages with list payloads.  This is a kludge;
 297     # def (n) clever hack ;).
 298     if msg.is_multipart() and sanitize <> 2:
 299         # By default we take the charset of the first text/plain part in the
 300         # message, but if there was none, we'll use the list's preferred
 301         # language's charset.
 302         if not charset or charset == 'us-ascii':
 303             charset = lcset_out
 304         else:
 305             # normalize to the output charset if input/output are different
 306             charset = Charset(charset).output_charset or charset
 307         # We now want to concatenate all the parts which have been scrubbed to
 308         # text/plain, into a single text/plain payload.  We need to make sure
 309         # all the characters in the concatenated string are in the same
 310         # encoding, so we'll use the 'replace' key in the coercion call.
 311         # BAW: Martin's original patch suggested we might want to try
 312         # generalizing to utf-8, and that's probably a good idea (eventually).
 313         text = []
 314         charsets = []
 315         for part in msg.walk():
 316             # TK: bug-id 1099138 and multipart
 317             if not part or part.is_multipart():
 318                 continue
 319             # All parts should be scrubbed to text/plain by now.
 320             partctype = part.get_content_type()
 321             if partctype <> 'text/plain':
 322                 text.append(_('Skipped content of type %(partctype)s\n'))
 323                 continue
 324             try:
 325                 t = part.get_payload(decode=True) or ''
 326             # MAS: TypeError exception can occur if payload is None. This
 327             # was observed with a message that contained an attached
 328             # message/delivery-status part. Because of the special parsing
 329             # of this type, this resulted in a text/plain sub-part with a
 330             # null body. See bug 1430236.
 331             except (binascii.Error, TypeError):
 332                 t = part.get_payload() or ''
 333             # Email problem was solved by Mark Sapiro. (TK)
 334             partcharset = part.get_content_charset('us-ascii')
 335             try:
 336                 t = unicode(t, partcharset, 'replace')
 337             except (UnicodeError, LookupError, ValueError, TypeError,
 338                     AssertionError):
 339                 # We can get here if partcharset is bogus in come way.
 340                 # Replace funny characters.  We use errors='replace'.
 341                 t = unicode(t, 'ascii', 'replace')
 342             # Separation is useful
 343             if isinstance(t, basestring):
 344                 if not t.endswith('\n'):
 345                     t += '\n'
 346                 text.append(t)
 347             if partcharset not in charsets:
 348                 charsets.append(partcharset)
 349         # Now join the text and set the payload
 350         sep = _('-------------- next part --------------\n')
 351         assert isinstance(sep, unicode), (
 352             'Expected a unicode separator, got %s' % type(sep))
 353         rept = sep.join(text)
 354         # Replace entire message with text and scrubbed notice.
 355         # Try with message charsets and utf-8
 356         if 'utf-8' not in charsets:
 357             charsets.append('utf-8')
 358         for charset in charsets:
 359             try:
 360                 replace_payload_by_text(msg, rept, charset)
 361                 break
 362             # Bogus charset can throw several exceptions
 363             except (UnicodeError, LookupError, ValueError, TypeError,
 364                     AssertionError):
 365                 pass
 366         if format:
 367             msg.set_param('format', format)
 368         if delsp:
 369             msg.set_param('delsp', delsp)
 370     return msg
 371
 372
 373 \f
 374 def makedirs(dir):
 375     # Create all the directories to store this attachment in and try to make
 376     # sure that the permissions of the directories are set correctly.
 377     try:
 378         os.makedirs(dir, 02775)
 379     except OSError, e:
 380         if e.errno == errno.EEXIST:
 381             return
 382     # Some systems such as FreeBSD ignore mkdir's mode, so walk the just
 383     # created directories and try to set the mode, ignoring any OSErrors that
 384     # occur here.
 385     for dirpath, dirnames, filenames in os.walk(dir):
 386         try:
 387             os.chmod(dirpath, 02775)
 388         except OSError:
 389             pass
 390
 391
 392 \f
 393 def save_attachment(mlist, msg, dir, filter_html=True):
 394     fsdir = os.path.join(config.PRIVATE_ARCHIVE_FILE_DIR,
 395                          mlist.fqdn_listname, dir)
 396     makedirs(fsdir)
 397     # Figure out the attachment type and get the decoded data
 398     decodedpayload = msg.get_payload(decode=True)
 399     # BAW: mimetypes ought to handle non-standard, but commonly found types,
 400     # e.g. image/jpg (should be image/jpeg).  For now we just store such
 401     # things as application/octet-streams since that seems the safest.
 402     ctype = msg.get_content_type()
 403     # i18n file name is encoded
 404     lcset = Utils.GetCharSet(mlist.preferred_language)
 405     filename = Utils.oneline(msg.get_filename(''), lcset)
 406     filename, fnext = os.path.splitext(filename)
 407     # For safety, we should confirm this is valid ext for content-type
 408     # but we can use fnext if we introduce fnext filtering
 409     if config.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
 410         # HTML message doesn't have filename :-(
 411         ext = fnext or guess_extension(ctype, fnext)
 412     else:
 413         ext = guess_extension(ctype, fnext)
 414     # Allow only alphanumerics, dash, underscore, and dot
 415     ext = sre.sub('', ext)
 416     if not ext:
 417         # We don't know what it is, so assume it's just a shapeless
 418         # application/octet-stream, unless the Content-Type: is
 419         # message/rfc822, in which case we know we'll coerce the type to
 420         # text/plain below.
 421         if ctype == 'message/rfc822':
 422             ext = '.txt'
 423         else:
 424             ext = '.bin'
 425     path = None
 426     # We need a lock to calculate the next attachment number
 427     with LockFile(os.path.join(fsdir, 'attachments.lock')):
 428         # Now base the filename on what's in the attachment, uniquifying it if
 429         # necessary.
 430         if not filename or config.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
 431             filebase = 'attachment'
 432         else:
 433             # Sanitize the filename given in the message headers
 434             parts = pre.split(filename)
 435             filename = parts[-1]
 436             # Strip off leading dots
 437             filename = dre.sub('', filename)
 438             # Allow only alphanumerics, dash, underscore, and dot
 439             filename = sre.sub('', filename)
 440             # If the filename's extension doesn't match the type we guessed,
 441             # which one should we go with?  For now, let's go with the one we
 442             # guessed so attachments can't lie about their type.  Also, if the
 443             # filename /has/ no extension, then tack on the one we guessed.
 444             # The extension was removed from the name above.
 445             filebase = filename
 446         # Now we're looking for a unique name for this file on the file
 447         # system.  If msgdir/filebase.ext isn't unique, we'll add a counter
 448         # after filebase, e.g. msgdir/filebase-cnt.ext
 449         counter = 0
 450         extra = ''
 451         while True:
 452             path = os.path.join(fsdir, filebase + extra + ext)
 453             # Generally it is not a good idea to test for file existance
 454             # before just trying to create it, but the alternatives aren't
 455             # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
 456             # NFS-safe).  Besides, we have an exclusive lock now, so we're
 457             # guaranteed that no other process will be racing with us.
 458             if os.path.exists(path):
 459                 counter += 1
 460                 extra = '-%04d' % counter
 461             else:
 462                 break
 463     # `path' now contains the unique filename for the attachment.  There's
 464     # just one more step we need to do.  If the part is text/html and
 465     # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
 466     # here), then send the attachment through the filter program for
 467     # sanitization
 468     if filter_html and ctype == 'text/html':
 469         base, ext = os.path.splitext(path)
 470         tmppath = base + '-tmp' + ext
 471         fp = open(tmppath, 'w')
 472         try:
 473             fp.write(decodedpayload)
 474             fp.close()
 475             cmd = config.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
 476             progfp = os.popen(cmd, 'r')
 477             decodedpayload = progfp.read()
 478             status = progfp.close()
 479             if status:
 480                 log.error('HTML sanitizer exited with non-zero status: %s',
 481                           status)
 482         finally:
 483             os.unlink(tmppath)
 484         # BAW: Since we've now sanitized the document, it should be plain
 485         # text.  Blarg, we really want the sanitizer to tell us what the type
 486         # if the return data is. :(
 487         ext = '.txt'
 488         path = base + '.txt'
 489     # Is it a message/rfc822 attachment?
 490     elif ctype == 'message/rfc822':
 491         submsg = msg.get_payload()
 492         # BAW: I'm sure we can eventually do better than this. :(
 493         decodedpayload = Utils.websafe(str(submsg))
 494     fp = open(path, 'w')
 495     fp.write(decodedpayload)
 496     fp.close()
 497     # Now calculate the url
 498     baseurl = get_base_archive_url(mlist)
 499     # Private archives will likely have a trailing slash.  Normalize.
 500     if baseurl[-1] <> '/':
 501         baseurl += '/'
 502     # Trailing space will definitely be a problem with format=flowed.
 503     # Bracket the URL instead.
 504     url = '<' + baseurl + '%s/%s%s%s>' % (dir, filebase, extra, ext)
 505     return url