Content filtering preserves/forwards the original message.
[mailman.git] / src / mailman / handlers / mime_delete.py
blobcddd97f6b5a7079bb8e389b9770d248bf4d05acc
1 # Copyright (C) 2002-2023 by the Free Software Foundation, Inc.
3 # This file is part of GNU Mailman.
5 # GNU Mailman is free software: you can redistribute it and/or modify it under
6 # the terms of the GNU General Public License as published by the Free
7 # Software Foundation, either version 3 of the License, or (at your option)
8 # any later version.
10 # GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
11 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 # more details.
15 # You should have received a copy of the GNU General Public License along with
16 # GNU Mailman. If not, see <https://www.gnu.org/licenses/>.
18 """MIME-stripping filter for Mailman.
20 This module scans a message for MIME content, removing those sections whose
21 MIME types match one of a list of matches. multipart/alternative sections are
22 replaced by the first non-empty component, and multipart/mixed sections
23 wrapping only single sections after other processing are replaced by their
24 contents.
25 """
27 import os
28 import copy
29 import shutil
30 import logging
31 import tempfile
33 from contextlib import ExitStack, suppress
34 from email.iterators import typed_subpart_iterator
35 from email.mime.message import MIMEMessage
36 from email.mime.text import MIMEText
37 from itertools import count
38 from lazr.config import as_boolean
39 from mailman.config import config
40 from mailman.core.i18n import _
41 from mailman.email.message import OwnerNotification
42 from mailman.interfaces.action import FilterAction
43 from mailman.interfaces.handler import IHandler
44 from mailman.interfaces.pipeline import DiscardMessage, RejectMessage
45 from mailman.utilities.string import oneline
46 from mailman.version import VERSION
47 from public import public
48 from string import Template
49 from subprocess import CalledProcessError, check_output
50 from zope.interface import implementer
53 log = logging.getLogger('mailman.error')
def dispose(mlist, msg, msgdata, why):
    """Dispose of a message that content filtering decided to drop.

    What happens depends on ``mlist.filter_action``:

    * reject   -- `RejectMessage` is raised with `why`.
    * forward  -- a notification with the message attached is sent to the
                  list administrators, then the message is discarded.
    * preserve -- the message is queued in the 'bad' switchboard (when the
                  site configuration allows it), then discarded.
    * discard  -- the message is discarded.

    Forwarding and preserving are skipped when ``msgdata['fwd_preserve']``
    is false; a missing key counts as true.

    This function never returns normally.

    :raises RejectMessage: for the reject action.
    :raises DiscardMessage: in every other case.
    """
    # NOTE: keep the parameter name `mlist`; the translatable text below
    # refers to ${mlist.display_name}, which is presumably resolved by `_`
    # from the names visible in this function.
    action = mlist.filter_action
    if action is FilterAction.reject:
        # Bounce the message to the original author.
        raise RejectMessage(why)
    may_keep_copy = msgdata.get('fwd_preserve', True)
    if action is FilterAction.forward and may_keep_copy:
        # Forward it on to the list moderators.
        text = _("""\
The attached message matched the ${mlist.display_name} mailing list's content
filtering rules and was prevented from being forwarded on to the list
membership. You are receiving the only remaining copy of the discarded
message.

""")
        subject = _('Content filter message notification')
        notice = OwnerNotification(mlist, subject, roster=mlist.administrators)
        notice.set_type('multipart/mixed')
        notice.attach(MIMEText(text))
        notice.attach(MIMEMessage(msg))
        notice.send(mlist)
        # Fall through so that the original message still gets discarded.
    elif action is FilterAction.preserve and may_keep_copy:
        if as_boolean(config.mailman.filtered_messages_are_preservable):
            # Same as discarding, except that a copy lands in the 'bad'
            # queue in case the site administrator wants to inspect it.
            filebase = config.switchboards['bad'].enqueue(msg, msgdata)
            message_id = msg.get('message-id', 'n/a')
            log.info('{} preserved in file base {}'.format(
                message_id, filebase))
    elif action is FilterAction.discard:
        pass
    elif may_keep_copy:
        log.error(
            '{} invalid FilterAction: {}. Treating as discard'.format(
                mlist.fqdn_listname, action.name))
    # Every path that gets here ends with the message being discarded.
    raise DiscardMessage(why)
def _append_filter_report(msg, text):
    """Append the filtering report `text` to `msg`, modifying it in place.

    `msg` must remain the same object for the caller, so the message is
    never replaced, only rewritten:

    * a non-multipart message gets `text` appended to its decoded body;
    * a multipart/mixed message gets `text` attached as a new text part;
    * any other multipart is wrapped: a copy carrying only the Content-*
      headers becomes the first subpart of a new multipart/mixed and the
      report becomes the second.
    """
    if not msg.is_multipart():
        pl = msg.get_payload(decode=True)
        cset = msg.get_content_charset(None) or 'us-ascii'
        del msg['content-transfer-encoding']
        new_pl = pl.decode(cset)
        if not pl.endswith(b'\n'):
            new_pl += '\n'
        new_pl += text
        msg.set_payload(new_pl, cset)
        return
    if msg.get_content_type() == 'multipart/mixed':
        msg.attach(MIMEText(text))
        return
    # Some non-mixed multipart, we need to wrap it.  Clone the outer
    # message with all the header chrome intact, then delete the headers
    # that should not be repeated on the inner part.
    inner = copy.deepcopy(msg)
    # Which headers to keep?  Let's just do the Content-* headers.  keys()
    # returns a list, so deleting while looping over it is safe.
    for h in inner.keys():
        if not h.lower().startswith('content-'):
            del inner[h]
    # Now make the outer message contain two subparts: the wrapped message
    # and the report.
    msg.set_payload([inner, MIMEText(text)])
    del msg['content-type']
    del msg['content-transfer-encoding']
    del msg['content-disposition']
    msg['Content-Type'] = 'multipart/mixed'


def process(mlist, msg, msgdata):
    """Apply the content filtering settings of `mlist` to `msg` in place.

    The steps, in order:

    1. Dispose of the whole message (see `dispose`) when its outer content
       type or file extension is filtered or not in the pass lists.
    2. Remove matching subparts of a multipart message; dispose of the
       unfiltered copy if nothing is left.
    3. Optionally collapse multipart/alternative parts to their first
       alternative, then recast multiparts that hold a single subpart.
    4. Optionally convert text/html parts to text/plain.
    5. If anything changed, add an X-Content-Filtered-By header, and, when
       parts were removed and the site enables it, append a report.

    The module globals `report` and `attach_report` are reset here and
    updated by the helper functions while they remove parts.

    :raises RejectMessage: via `dispose`.
    :raises DiscardMessage: via `dispose`.
    """
    global attach_report, report
    report = _("""
___________________________________________
Mailman's content filtering has removed the
following MIME parts from this message.
""")
    attach_report = False
    ctype = msg.get_content_type()
    mtype = msg.get_content_maintype()
    # Check to see if the outer type matches one of the filter types
    filtertypes = set(mlist.filter_types)
    passtypes = set(mlist.pass_types)
    if ctype in filtertypes or mtype in filtertypes:
        dispose(mlist, msg, msgdata,
                _("The message's content type was explicitly disallowed"))
    # Check to see if there is a pass types and the outer type doesn't match
    # one of these types
    if passtypes and not (ctype in passtypes or mtype in passtypes):
        dispose(mlist, msg, msgdata,
                _("The message's content type was not explicitly allowed"))
    # Filter by file extensions
    filterexts = set(mlist.filter_extensions)
    passexts = set(mlist.pass_extensions)
    fext = get_file_ext(msg)
    if fext:
        if fext in filterexts:
            dispose(
                mlist, msg, msgdata,
                _("The message's file extension was explicitly disallowed"))
        if passexts and not (fext in passexts):
            dispose(
                mlist, msg, msgdata,
                _("The message's file extension was not explicitly allowed"))
    numparts = len(list(msg.walk()))
    # If the message is a multipart, filter out matching subparts
    if msg.is_multipart():
        # Recursively filter out any subparts that match the filter list
        prelen = len(msg.get_payload())
        # Keep an unfiltered copy so that dispose() can forward or preserve
        # the original message rather than the emptied one.
        premsg = copy.deepcopy(msg)
        filter_parts(msg, filtertypes, passtypes, filterexts, passexts)
        # If the outer message is now an empty multipart (and it wasn't
        # before!) then, again it gets discarded.
        postlen = len(msg.get_payload())
        if postlen == 0 and prelen > 0:
            dispose(mlist, premsg, msgdata,
                    _("After content filtering, the message was empty"))
    # Now replace all multipart/alternatives with just the first non-empty
    # alternative.  BAW: We have to special case when the outer part is a
    # multipart/alternative because we need to retain most of the outer part's
    # headers.  For now we'll move the subpart's payload into the outer part,
    # and then copy over its Content-Type: and Content-Transfer-Encoding:
    # headers (any others?).
    if mlist.collapse_alternatives:
        collapse_multipart_alternatives(msg)
        # A message can be labelled multipart/alternative and still have no
        # subparts (string payload or empty list); get_payload(0) would
        # raise TypeError or IndexError then, so leave such a message alone.
        if (ctype == 'multipart/alternative' and
                msg.is_multipart() and msg.get_payload()):
            firstalt = msg.get_payload(0)
            reset_payload(msg, firstalt)
            report += _("""
Replaced multipart/alternative part with first alternative.
""")
            # MAS Not setting attach_report True here will not report if the
            # only change is collapsing an outer MPA message.  On lists where
            # most people post from MUAs that compose HTML and send MPA,
            # setting this here would add this report to most messages which
            # can be annoying.
    # Now that we've collapsed the MPA parts, go through the message
    # and recast any multipart parts with only one sub-part as just
    # the sub-part.
    if msg.is_multipart():
        recast_multipart(msg)
    # If we removed some parts, make note of this
    changedp = 0
    if numparts != len(list(msg.walk())):
        changedp = 1
    # Now perhaps convert all text/html to text/plain.
    if mlist.convert_html_to_plaintext:
        changedp += to_plaintext(msg)
    # If we're left with only two parts, an empty body and one attachment,
    # recast the message to one of just that part
    if msg.is_multipart() and len(msg.get_payload()) == 2:
        if msg.get_payload(0).get_payload() == '':
            useful = msg.get_payload(1)
            reset_payload(msg, useful)
            changedp = 1
    if changedp:
        msg['X-Content-Filtered-By'] = 'Mailman/MimeDel {}'.format(VERSION)
    if attach_report and as_boolean(config.mailman.filter_report):
        _append_filter_report(msg, report)
def reset_payload(msg, subpart):
    """Make `msg` carry the content of `subpart`, in place.

    The payload of `subpart` replaces the payload of `msg`, and the
    Content-Type, Content-Disposition and Content-Description headers of
    `msg` are replaced by those of `subpart` (Content-Type defaults to
    text/plain; the other two are simply removed when `subpart` has none).
    All other headers of `msg` are left alone.
    """
    if subpart.is_multipart():
        msg.set_payload(subpart.get_payload())
    else:
        charset = subpart.get_content_charset() or 'us-ascii'
        raw = subpart.get_payload(decode=True)
        msg.set_payload(raw.decode(charset, errors='replace'),
                        charset=charset)
    # Content-Transfer-Encoding is deliberately not copied; set_payload sets
    # it based on the charset.
    for stale in ('content-type', 'content-disposition',
                  'content-description'):
        del msg[stale]
    msg['Content-Type'] = subpart.get('content-type', 'text/plain')
    # Optional headers are only copied when the subpart really has them.
    for header in ('Content-Disposition', 'Content-Description'):
        value = subpart.get(header.lower())
        if value:
            msg[header] = value
def filter_parts(msg, filtertypes, passtypes, filterexts, passexts):
    """Recursively remove the subparts of `msg` that the filters reject.

    A subpart is removed when its content type or main type is in
    `filtertypes`, when `passtypes` is non-empty and contains neither, when
    its file extension is in `filterexts`, or when `passexts` is non-empty
    and does not contain its extension.  Each such removal is noted in the
    global `report` and sets the global `attach_report`.  A subpart whose
    own subparts were all removed is dropped without a report entry.

    :return: False if `msg` had subparts and none survived, True otherwise
        (including when `msg` is not a multipart).
    """
    global attach_report, report
    if not msg.is_multipart():
        return True
    original_parts = msg.get_payload()
    kept = []
    for subpart in original_parts:
        # Depth first: a part emptied by filtering is dropped outright.
        if not filter_parts(subpart, filtertypes, passtypes,
                            filterexts, passexts):
            continue
        ctype = subpart.get_content_type()
        mtype = subpart.get_content_maintype()
        # NOTE: keep the name `fname`; the translatable string below refers
        # to ${fname}.
        fname = subpart.get_filename('') or subpart.get_param('name', '')
        # Content type checks first; the extension is only looked at for
        # parts that passed them.
        rejected = ctype in filtertypes or mtype in filtertypes
        if not rejected and passtypes:
            rejected = ctype not in passtypes and mtype not in passtypes
        if not rejected:
            fext = get_file_ext(subpart)
            if fext:
                rejected = (fext in filterexts or
                            bool(passexts and fext not in passexts))
        if rejected:
            # Throw this subpart away and mention it in the report.
            report += '\nContent-Type: %s\n' % ctype
            if fname:
                report += ' ' + _('Name: ${fname}\n')
            attach_report = True
            continue
        kept.append(subpart)
    msg.set_payload(kept)
    # Only report failure when we threw away everything there was.
    return bool(kept) or not original_parts
def collapse_multipart_alternatives(msg):
    """Replace nested multipart/alternative parts by their first alternative.

    Works in place on the subparts of `msg`, recursing into other
    multiparts.  Inside a message/rfc822 part the alternative's payload is
    moved into the existing subpart so that its headers are not lost;
    elsewhere the first alternative simply takes the subpart's place.  Every
    replacement is noted in the global `report` and sets the global
    `attach_report`.

    A multipart/alternative subpart with an empty list of alternatives is
    dropped.  A subpart that is only labelled multipart/alternative but
    whose payload is not a list of parts is kept unchanged.

    The outer message itself is not collapsed here, only its subparts.
    """
    global attach_report, report
    if not msg.is_multipart():
        return
    newpayload = []
    for subpart in msg.get_payload():
        if subpart.get_content_type() == 'multipart/alternative':
            if not subpart.is_multipart():
                # Malformed part: the header says multipart/alternative but
                # the payload is a plain string.  get_payload(0) would raise
                # TypeError, so pass the part through untouched.
                newpayload.append(subpart)
                continue
            # IndexError means the alternative has no subparts at all; the
            # part is then left out of the new payload.
            with suppress(IndexError):
                firstalt = subpart.get_payload(0)
                if msg.get_content_type() == 'message/rfc822':
                    # This is a multipart/alternative message in a
                    # message/rfc822 part.  We treat it specially so as not
                    # to lose the headers.
                    reset_payload(subpart, firstalt)
                    newpayload.append(subpart)
                else:
                    newpayload.append(firstalt)
                report += _("""
Replaced multipart/alternative part with first alternative.
""")
                attach_report = True
        elif subpart.is_multipart():
            collapse_multipart_alternatives(subpart)
            newpayload.append(subpart)
        else:
            newpayload.append(subpart)
    msg.set_payload(newpayload)
def recast_multipart(msg):
    """Recast multiparts that hold a single subpart as that subpart.

    Works recursively and in place.  Two kinds of part are never recast:

    * message/rfc822, because its headers would be lost;
    * multipart/signed, where processing stops altogether: the signature
      may still be valid and going further may break it.  (LP: #1551075)
    """
    if msg.get_content_type() == 'multipart/signed':
        return
    if not msg.is_multipart():
        return
    parts = msg.get_payload()
    only_child = (len(parts) == 1 and
                  msg.get_content_type() != 'message/rfc822')
    if only_child:
        reset_payload(msg, parts[0])
        # msg now stands in for its former subpart, which may itself need
        # recasting, so look at msg again.
        recast_multipart(msg)
        return
    # This part is fine as it is, but check deeper.
    for part in parts:
        recast_multipart(part)
def to_plaintext(msg):
    """Convert the text/html parts of `msg` to text/plain, in place.

    Each text/html part is written to a file in a temporary directory and
    run through the command configured as
    ``config.mailman.html_to_plain_text_command``, with ``$filename``
    replaced by the path of that file.  The command's output becomes the
    new payload of the part.  When the command fails or cannot be run, the
    error is logged and the part is left as it was.

    :return: The number of parts that were converted.
    """
    converted = 0
    with ExitStack() as resources:
        workdir = tempfile.mkdtemp()
        # Remove the scratch directory however this block is left.
        resources.callback(shutil.rmtree, workdir)
        html_parts = typed_subpart_iterator(msg, 'text', 'html')
        for index, subpart in enumerate(html_parts):
            html_path = os.path.join(workdir, '{}.html'.format(index))
            charset = subpart.get_content_charset('us-ascii')
            with open(html_path, 'w', encoding='utf-8') as fp:
                markup = subpart.get_payload(decode=True)
                fp.write(markup.decode(charset, errors='replace'))
            template = Template(config.mailman.html_to_plain_text_command)
            command_line = template.safe_substitute(filename=html_path)
            try:
                plain = check_output(command_line.split(),
                                     universal_newlines=True)
            except (CalledProcessError, FileNotFoundError, PermissionError):
                log.exception('HTML -> text/plain command error')
                continue
            # Swap in the converted text and relabel the part.
            del subpart['content-transfer-encoding']
            subpart.set_payload(plain, charset=charset)
            subpart.set_type('text/plain')
            converted += 1
    return converted
def get_file_ext(m):
    """Return the lower-cased filename extension of message part `m`.

    The filename comes from `m.get_filename()` or, failing that, from the
    ``name`` parameter of the Content-Type header.  Caution: some viruses
    don't put the filename in the Content-Disposition header.

    :return: The extension without its leading dot, or the empty string when
        there is no filename or the filename has no extension.
    """
    filename = m.get_filename('') or m.get_param('name', '')
    if not filename:
        return ''
    flattened = oneline(filename, 'utf-8', in_unicode=True)
    extension = os.path.splitext(flattened)[1]
    # splitext() keeps the leading dot; anything of length one or less
    # means there is no usable extension.
    if len(extension) <= 1:
        return ''
    return extension[1:].lower()
@public
@implementer(IHandler)
class MIMEDelete:
    """Handler that applies a list's MIME content filtering to messages."""

    name = 'mime-delete'
    description = _('Filter the MIME content of messages.')

    def process(self, mlist, msg, msgdata):
        """Filter `msg` unless filtering is off or the message is a digest.

        Nothing is done when ``mlist.filter_content`` is false or when
        ``msgdata['isdigest']`` is set; otherwise the work is delegated to
        the module-level `process` function.
        """
        if mlist.filter_content and not msgdata.get('isdigest'):
            process(mlist, msg, msgdata)