1 # Copyright (C) 2001-2008 by the Free Software Foundation, Inc.
3 # This file is part of GNU Mailman.
5 # GNU Mailman is free software: you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free
7 # Software Foundation, either version 3 of the License, or (at your option)
10 # GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 # You should have received a copy of the GNU General Public License along with
16 # GNU Mailman. If not, see <http://www.gnu.org/licenses/>.
18 """Cleanse a message for archiving."""
20 from __future__
import with_statement
23 __all__
= ['Scrubber']
34 from email
.charset
import Charset
35 from email
.generator
import Generator
36 from email
.utils
import make_msgid
, parsedate
37 from locknix
.lockfile
import Lock
38 from mimetypes
import guess_all_extensions
39 from zope
.interface
import implements
41 from mailman
import Utils
42 from mailman
.configuration
import config
43 from mailman
.core
.errors
import DiscardMessage
44 from mailman
.core
.plugins
import get_plugin
45 from mailman
.i18n
import _
46 from mailman
.interfaces
import IHandler
# Separator characters that delimit path components on common platforms:
# forward slash, backslash and colon.
pre = re.compile(r'[/\\:]')
# Everything that has to be removed from Content-Disposition: filenames, i.e.
# any character that is not a word character (alphanumeric or underscore), a
# dot, or a dash.
sre = re.compile(r'[^-\w.]')
# A (possibly empty) run of dots anchored at the start of the string.
dre = re.compile(r'^\.*')

# Module-level logger bound to the 'mailman.error' channel.
log = logging.getLogger('mailman.error')
64 def guess_extension(ctype
, ext
):
65 # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
66 # and .wiz are all mapped to application/msword. This sucks for finding
67 # the best reverse mapping. If the extension is one of the given
68 # mappings, we'll trust that, otherwise we'll just guess. :/
69 all
= guess_all_extensions(ctype
, strict
=False)
76 # We're using a subclass of the standard Generator because we want to suppress
77 # headers in the subparts of multiparts. We use a hack -- the ctor argument
78 # skipheaders to accomplish this. It's set to true for the outer Message
79 # object, but false for all internal objects. We recognize that
80 # sub-Generators will get created passing only mangle_from_ and maxheaderlen
83 # This isn't perfect because we still get stuff like the multipart boundaries,
84 # but see below for how we corrupt that to our nefarious goals.
class ScrubberGenerator(Generator):
    """Generator variant that can leave a message's headers out of its output.

    Whether headers are written is decided once, at construction time, by the
    `skipheaders` argument and is consulted every time `_write_headers()` is
    invoked.
    """

    def __init__(self, outfp, mangle_from_=True,
                 maxheaderlen=78, skipheaders=True):
        """Create a generator that writes to `outfp`.

        `mangle_from_` and `maxheaderlen` are accepted so that the signature
        stays compatible with `Generator`, but neither is forwarded: the base
        class is always initialized with `mangle_from_=False` and with its own
        default for the header length.  When `skipheaders` is true, no
        headers are written by this generator.
        """
        # Stored under a name-mangled attribute so it cannot collide with
        # anything the base class keeps on the instance.
        self.__skipheaders = skipheaders
        Generator.__init__(self, outfp, mangle_from_=False)

    def _write_headers(self, msg):
        """Write the headers of `msg`, unless header skipping is enabled."""
        if self.__skipheaders:
            return
        Generator._write_headers(self, msg)
96 def safe_strftime(fmt
, t
):
98 return time
.strftime(fmt
, t
)
99 except (TypeError, ValueError, OverflowError):
103 def calculate_attachments_dir(mlist
, msg
, msgdata
):
104 # Calculate the directory that attachments for this message will go
105 # under. To avoid inode limitations, the scheme will be:
106 # archives/private/<listname>/attachments/YYYYMMDD/<msgid-hash>/<files>
107 # Start by calculating the date-based and msgid-hash components.
109 datestr
= msg
.get('Date')
111 now
= parsedate(datestr
)
113 now
= time
.gmtime(msgdata
.get('received_time', time
.time()))
114 datedir
= safe_strftime(fmt
, now
)
116 datestr
= msgdata
.get('X-List-Received-Date')
118 datedir
= safe_strftime(fmt
, datestr
)
120 # What next? Unixfrom, I guess.
121 parts
= msg
.get_unixfrom().split()
123 month
= {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
124 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
128 except (IndexError, ValueError):
129 # Best we can do I think
130 month
= day
= year
= 0
131 datedir
= '%04d%02d%02d' % (year
, month
, day
)
133 # As for the msgid hash, we'll base this part on the Message-ID: so that
134 # all attachments for the same message end up in the same directory (we'll
135 # uniquify the filenames in that directory as needed). We use the first 2
136 # and last 2 bytes of the SHA1 hash of the message id as the basis of the
137 # directory name. Clashes here don't really matter too much, and that
138 # still gives us a 32-bit space to work with.
139 msgid
= msg
['message-id']
141 msgid
= msg
['Message-ID'] = make_msgid()
142 # We assume that the message id actually /is/ unique!
143 digest
= hashlib
.sha1(msgid
).hexdigest()
144 return os
.path
.join('attachments', datedir
, digest
[:4] + digest
[-4:])
def replace_payload_by_text(msg, text, charset):
    """Replace the payload of `msg` with `text`, in place.

    The existing Content-Type and Content-Transfer-Encoding headers are
    removed and the new payload is installed with
    `msg.set_payload(text, charset)`.

    `text` may be a byte string or a unicode string; unicode is encoded with
    `charset` before it is stored.  `charset` may be a string or any object
    whose `str()` is the charset name (for example an `email.charset.Charset`).

    Any exception raised while encoding `text` propagates to the caller; in
    that case `msg` has not been modified yet.
    """
    # TK: This is a common function in replacing the attachment and the main
    # message by a text (scrubbing).
    #
    # Normalize the charset first so that the very same name is used both for
    # encoding the text and for set_payload(); the encode() call needs a
    # plain string as the codec name.
    if not isinstance(charset, str):
        charset = str(charset)
    # Do the step that can fail before touching the message, so that a bogus
    # charset or unencodable text does not leave `msg` stripped of its
    # headers but still carrying the old payload.
    if isinstance(text, unicode):
        text = text.encode(charset)
    del msg['content-type']
    del msg['content-transfer-encoding']
    msg.set_payload(text, charset)
160 def process(mlist
, msg
, msgdata
=None):
161 sanitize
= config
.ARCHIVE_HTML_SANITIZER
166 # msgdata is available if it is in GLOBAL_PIPELINE
167 # ie. not in digest or archiver
168 # check if the list owner want to scrub regular delivery
169 if not mlist
.scrub_nondigest
:
171 dir = calculate_attachments_dir(mlist
, msg
, msgdata
)
172 charset
= format
= delsp
= None
173 lcset
= Utils
.GetCharSet(mlist
.preferred_language
)
174 lcset_out
= Charset(lcset
).output_charset
or lcset
175 # Now walk over all subparts of this message and scrub out various types
176 for part
in msg
.walk():
177 ctype
= part
.get_content_type()
178 # If the part is text/plain, we leave it alone
179 if ctype
== 'text/plain':
180 # We need to choose a charset for the scrubbed message, so we'll
181 # arbitrarily pick the charset of the first text/plain part in the
184 # Also get the RFC 3676 stuff from this part. This seems to
185 # work okay for scrub_nondigest. It will also work as far as
186 # scrubbing messages for the archive is concerned, but Pipermail
187 # doesn't pay any attention to the RFC 3676 parameters. The plain
188 # format digest is going to be a disaster in any case as some of
189 # messages will be format="flowed" and some not. ToDigest creates
190 # its own Content-Type: header for the plain digest which won't
191 # have RFC 3676 parameters. If the message Content-Type: headers
192 # are retained for display in the digest, the parameters will be
193 # there for information, but not for the MUA. This is the best we
194 # can do without having get_payload() process the parameters.
196 charset
= part
.get_content_charset(lcset
)
197 format
= part
.get_param('format')
198 delsp
= part
.get_param('delsp')
199 # TK: if part is attached then check charset and scrub if none
200 if part
.get('content-disposition') and \
201 not part
.get_content_charset():
202 url
= save_attachment(mlist
, part
, dir)
203 filename
= part
.get_filename(_('not available'))
204 filename
= Utils
.oneline(filename
, lcset
)
205 replace_payload_by_text(part
, _("""\
206 An embedded and charset-unspecified text was scrubbed...
210 elif ctype
== 'text/html' and isinstance(sanitize
, int):
214 replace_payload_by_text(part
,
215 _('HTML attachment scrubbed and removed'),
216 # Adding charset arg and removing content-type
217 # sets content-type to text/plain
220 # By leaving it alone, Pipermail will automatically escape it
223 # Pull it out as an attachment but leave it unescaped. This
224 # is dangerous, but perhaps useful for heavily moderated
226 url
= save_attachment(mlist
, part
, dir, filter_html
=False)
227 replace_payload_by_text(part
, _("""\
228 An HTML attachment was scrubbed...
232 # HTML-escape it and store it as an attachment, but make it
233 # look a /little/ bit prettier. :(
234 payload
= Utils
.websafe(part
.get_payload(decode
=True))
235 # For whitespace in the margin, change spaces into
236 # non-breaking spaces, and tabs into 8 of those. Then use a
237 # mono-space font. Still looks hideous to me, but then I'd
238 # just as soon discard them.
240 return s
.replace(' ', ' ').replace('\t', ' '*8)
241 lines
= [doreplace(s
) for s
in payload
.split('\n')]
242 payload
= '<tt>\n' + BR
.join(lines
) + '\n</tt>\n'
243 part
.set_payload(payload
)
244 # We're replacing the payload with the decoded payload so this
245 # will just get in the way.
246 del part
['content-transfer-encoding']
247 url
= save_attachment(mlist
, part
, dir, filter_html
=False)
248 replace_payload_by_text(part
, _("""\
249 An HTML attachment was scrubbed...
252 elif ctype
== 'message/rfc822':
253 # This part contains a submessage, so it too needs scrubbing
254 submsg
= part
.get_payload(0)
255 url
= save_attachment(mlist
, part
, dir)
256 subject
= submsg
.get('subject', _('no subject'))
257 date
= submsg
.get('date', _('no date'))
258 who
= submsg
.get('from', _('unknown sender'))
259 size
= len(str(submsg
))
260 replace_payload_by_text(part
, _("""\
261 An embedded message was scrubbed...
268 # If the message isn't a multipart, then we'll strip it out as an
269 # attachment that would have to be separately downloaded. Pipermail
270 # will transform the url into a hyperlink.
271 elif part
._payload
and not part
.is_multipart():
272 payload
= part
.get_payload(decode
=True)
273 ctype
= part
.get_content_type()
274 # XXX Under email 2.5, it is possible that payload will be None.
275 # This can happen when you have a Content-Type: multipart/* with
276 # only one part and that part has two blank lines between the
277 # first boundary and the end boundary. In email 3.0 you end up
278 # with a string in the payload. I think in this case it's safe to
283 url
= save_attachment(mlist
, part
, dir)
284 desc
= part
.get('content-description', _('not available'))
285 desc
= Utils
.oneline(desc
, lcset
)
286 filename
= part
.get_filename(_('not available'))
287 filename
= Utils
.oneline(filename
, lcset
)
288 replace_payload_by_text(part
, _("""\
289 A non-text attachment was scrubbed...
297 # We still have to sanitize multipart messages to flat text because
298 # Pipermail can't handle messages with list payloads. This is a kludge;
299 # def (n) clever hack ;).
300 if msg
.is_multipart() and sanitize
<> 2:
301 # By default we take the charset of the first text/plain part in the
302 # message, but if there was none, we'll use the list's preferred
303 # language's charset.
304 if not charset
or charset
== 'us-ascii':
307 # normalize to the output charset if input/output are different
308 charset
= Charset(charset
).output_charset
or charset
309 # We now want to concatenate all the parts which have been scrubbed to
310 # text/plain, into a single text/plain payload. We need to make sure
311 # all the characters in the concatenated string are in the same
312 # encoding, so we'll use the 'replace' key in the coercion call.
313 # BAW: Martin's original patch suggested we might want to try
314 # generalizing to utf-8, and that's probably a good idea (eventually).
317 for part
in msg
.walk():
318 # TK: bug-id 1099138 and multipart
319 # MAS test payload - if part may fail if there are no headers.
320 if not part
._payload
or part
.is_multipart():
322 # All parts should be scrubbed to text/plain by now.
323 partctype
= part
.get_content_type()
324 if partctype
<> 'text/plain':
325 text
.append(_('Skipped content of type $partctype\n'))
328 t
= part
.get_payload(decode
=True) or ''
329 # MAS: TypeError exception can occur if payload is None. This
330 # was observed with a message that contained an attached
331 # message/delivery-status part. Because of the special parsing
332 # of this type, this resulted in a text/plain sub-part with a
333 # null body. See bug 1430236.
334 except (binascii
.Error
, TypeError):
335 t
= part
.get_payload() or ''
336 # Email problem was solved by Mark Sapiro. (TK)
337 partcharset
= part
.get_content_charset('us-ascii')
339 t
= unicode(t
, partcharset
, 'replace')
340 except (UnicodeError, LookupError, ValueError, TypeError,
342 # We can get here if partcharset is bogus in some way.
343 # Replace funny characters. We use errors='replace'.
344 t
= unicode(t
, 'ascii', 'replace')
345 # Separation is useful
346 if isinstance(t
, basestring
):
347 if not t
.endswith('\n'):
350 if partcharset
not in charsets
:
351 charsets
.append(partcharset
)
352 # Now join the text and set the payload
353 sep
= _('-------------- next part --------------\n')
354 assert isinstance(sep
, unicode), (
355 'Expected a unicode separator, got %s' % type(sep
))
356 rept
= sep
.join(text
)
357 # Replace entire message with text and scrubbed notice.
358 # Try with message charsets and utf-8
359 if 'utf-8' not in charsets
:
360 charsets
.append('utf-8')
361 for charset
in charsets
:
363 replace_payload_by_text(msg
, rept
, charset
)
365 # Bogus charset can throw several exceptions
366 except (UnicodeError, LookupError, ValueError, TypeError,
370 msg
.set_param('format', format
)
372 msg
.set_param('delsp', delsp
)
378 # Create all the directories to store this attachment in and try to make
379 # sure that the permissions of the directories are set correctly.
381 os
.makedirs(dir, 02775)
383 if e
.errno
== errno
.EEXIST
:
385 # Some systems such as FreeBSD ignore mkdir's mode, so walk the just
386 # created directories and try to set the mode, ignoring any OSErrors that
388 for dirpath
, dirnames
, filenames
in os
.walk(dir):
390 os
.chmod(dirpath
, 02775)
396 def save_attachment(mlist
, msg
, dir, filter_html
=True):
397 fsdir
= os
.path
.join(config
.PRIVATE_ARCHIVE_FILE_DIR
,
398 mlist
.fqdn_listname
, dir)
400 # Figure out the attachment type and get the decoded data
401 decodedpayload
= msg
.get_payload(decode
=True)
402 # BAW: mimetypes ought to handle non-standard, but commonly found types,
403 # e.g. image/jpg (should be image/jpeg). For now we just store such
404 # things as application/octet-streams since that seems the safest.
405 ctype
= msg
.get_content_type()
406 # i18n file name is encoded
407 lcset
= Utils
.GetCharSet(mlist
.preferred_language
)
408 filename
= Utils
.oneline(msg
.get_filename(''), lcset
)
409 filename
, fnext
= os
.path
.splitext(filename
)
410 # For safety, we should confirm this is valid ext for content-type
411 # but we can use fnext if we introduce fnext filtering
412 if config
.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION
:
413 # HTML message doesn't have filename :-(
414 ext
= fnext
or guess_extension(ctype
, fnext
)
416 ext
= guess_extension(ctype
, fnext
)
418 # We don't know what it is, so assume it's just a shapeless
419 # application/octet-stream, unless the Content-Type: is
420 # message/rfc822, in which case we know we'll coerce the type to
422 if ctype
== 'message/rfc822':
426 # Allow only alphanumerics, dash, underscore, and dot
427 ext
= sre
.sub('', ext
)
429 # We need a lock to calculate the next attachment number
430 with
Lock(os
.path
.join(fsdir
, 'attachments.lock')):
431 # Now base the filename on what's in the attachment, uniquifying it if
433 if not filename
or config
.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME
:
434 filebase
= 'attachment'
436 # Sanitize the filename given in the message headers
437 parts
= pre
.split(filename
)
439 # Strip off leading dots
440 filename
= dre
.sub('', filename
)
441 # Allow only alphanumerics, dash, underscore, and dot
442 filename
= sre
.sub('', filename
)
443 # If the filename's extension doesn't match the type we guessed,
444 # which one should we go with? For now, let's go with the one we
445 # guessed so attachments can't lie about their type. Also, if the
446 # filename /has/ no extension, then tack on the one we guessed.
447 # The extension was removed from the name above.
449 # Now we're looking for a unique name for this file on the file
450 # system. If msgdir/filebase.ext isn't unique, we'll add a counter
451 # after filebase, e.g. msgdir/filebase-cnt.ext
455 path
= os
.path
.join(fsdir
, filebase
+ extra
+ ext
)
456 # Generally it is not a good idea to test for file existence
457 # before just trying to create it, but the alternatives aren't
458 # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
459 # NFS-safe). Besides, we have an exclusive lock now, so we're
460 # guaranteed that no other process will be racing with us.
461 if os
.path
.exists(path
):
463 extra
= '-%04d' % counter
466 # `path' now contains the unique filename for the attachment. There's
467 # just one more step we need to do. If the part is text/html and
468 # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
469 # here), then send the attachment through the filter program for
471 if filter_html
and ctype
== 'text/html':
472 base
, ext
= os
.path
.splitext(path
)
473 tmppath
= base
+ '-tmp' + ext
474 fp
= open(tmppath
, 'w')
476 fp
.write(decodedpayload
)
478 cmd
= config
.ARCHIVE_HTML_SANITIZER
% {'filename' : tmppath
}
479 progfp
= os
.popen(cmd
, 'r')
480 decodedpayload
= progfp
.read()
481 status
= progfp
.close()
483 log
.error('HTML sanitizer exited with non-zero status: %s',
487 # BAW: Since we've now sanitized the document, it should be plain
488 # text. Blarg, we really want the sanitizer to tell us what the type
489 # of the return data is. :(
492 # Is it a message/rfc822 attachment?
493 elif ctype
== 'message/rfc822':
494 submsg
= msg
.get_payload()
495 # BAW: I'm sure we can eventually do better than this. :(
496 decodedpayload
= Utils
.websafe(str(submsg
))
498 fp
.write(decodedpayload
)
500 # Now calculate the url to the list's archive.
501 baseurl
= get_plugin('mailman.scrubber').list_url(mlist
)
502 if not baseurl
.endswith('/'):
504 # Trailing space will definitely be a problem with format=flowed.
505 # Bracket the URL instead.
506 url
= '<' + baseurl
+ '%s/%s%s%s>' % (dir, filebase
, extra
, ext
)
512 """Cleanse a message for archiving."""
517 description
= _('Cleanse a message for archiving.')
519 def process(self
, mlist
, msg
, msgdata
):
520 """See `IHandler`."""
521 process(mlist
, msg
, msgdata
)