Merge from gnulib
[emacs.git] / lisp / mail / rmail-spam-filter.el
blobef6b6d0d683f200fe27d40dd690721e950918af6
1 ;;; rmail-spam-filter.el --- spam filter for Rmail, the Emacs mail reader
3 ;; Copyright (C) 2002-2015 Free Software Foundation, Inc.
4 ;; Keywords: email, spam, filter, rmail
5 ;; Author: Eli Tziperman <eli AT deas.harvard.edu>
6 ;; Package: rmail
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 ;;; Commentary:
24 ;;; -----------
26 ;;; Automatically recognize and delete junk email before it is
27 ;;; displayed in rmail/rmail-summary. Spam emails are defined by
28 ;;; specifying one or more of the sender, subject and contents.
29 ;;; URL: http://www.weizmann.ac.il/~eli/Downloads/rmail-spam-filter/
31 ;;; Usage:
32 ;;; ------
34 ;;; put in your .emacs:
36 ;;; (require 'rmail-spam-filter)
38 ;;; and use customize (in rmail-spam-filter group) to:
40 ;;; (*) turn on the variable rmail-use-spam-filter,
42 ;;; (*) specify in variable rsf-definitions-alist what sender,
43 ;;; subject and contents make an email be considered spam.
45 ;;; in addition, you may:
47 ;;; (*) Block future mail with the subject or sender of a message
48 ;;; while reading it in RMAIL: just click on the "Spam" item on the
49 ;;; menubar, and add the subject or sender to the list of spam
50 ;;; definitions using the mouse and the appropriate menu item. You
51 ;;; need to later also save the list of spam definitions using the
52 ;;; same menu item, or alternatively, see variable
53 ;;; `rsf-autosave-newly-added-definitions'.
;;; (*) specify if blind-cc'ed mail (neither a "To:" nor a "Cc:" header
;;; field) is to be treated as spam (variable rsf-no-blind-cc; Thanks
;;; to Ethan Brown <ethan@gso.saic.com> for this).
59 ;;; (*) specify if rmail-spam-filter should ignore case of spam
60 ;;; definitions (variable rsf-ignore-case; Thanks to
61 ;;; Ethan Brown <ethan@gso.saic.com> for the suggestion).
63 ;;; (*) Specify a "white-list" of trusted senders. If any
64 ;;; rsf-white-list string matches a substring of the "From"
65 ;;; header, the message is flagged as a valid, non-spam message (Ethan
66 ;;; Brown <ethan@gso.saic.com>).
68 ;;; (*) rmail-spam-filter is best used with a general purpose spam
69 ;;; filter such as the procmail-based http://www.spambouncer.org/.
70 ;;; Spambouncer is set to only mark messages as spam/blocked/bulk/OK
71 ;;; via special headers, and these headers may then be defined in
72 ;;; rmail-spam-filter such that the spam is rejected by
73 ;;; rmail-spam-filter itself.
75 (require 'rmail)
76 (require 'rmailsum)
;; Customization group that holds every option defined in this file.
(defgroup rmail-spam-filter ()
  "Spam filter for Rmail, the Emacs mail reader."
  :group 'rmail)
(defcustom rmail-use-spam-filter nil
  "Whether the Rmail spam filter is active.
A non-nil value turns the filter on.  What counts as spam is
defined by the rules in `rsf-definitions-alist'."
  :group 'rmail-spam-filter
  :type 'boolean)
(defcustom rsf-file "~/XRMAIL-SPAM"
  "Name of Rmail file for optionally saving some of the spam.
You can either just delete spam, or save it in this file for
later review.  Which action to take for each spam definition is
specified by the \"action\" element of the definition."
  ;; `file' rather than `string': the value is a file name, so
  ;; Customize can offer file-name completion.  Existing string values
  ;; remain valid.
  :type 'file
  :group 'rmail-spam-filter)
(defcustom rsf-no-blind-cc nil
  "Whether blind-cc'ed mail is treated as spam.
When non-nil, a message that has neither a To: nor a Cc: header
is considered spam."
  :group 'rmail-spam-filter
  :type 'boolean)
(defcustom rsf-ignore-case nil
  "Whether matching of `rsf-definitions-alist' ignores case.
A non-nil value makes the spam definitions case-insensitive."
  :group 'rmail-spam-filter
  :type 'boolean)
(defcustom rsf-beep nil
  "Whether to beep when spam is found.
A non-nil value rings the bell before the spam report is shown."
  :group 'rmail-spam-filter
  :type 'boolean)
(defcustom rsf-sleep-after-message 2.0
  "How long to pause after reporting that spam was found.
The value is a number of seconds."
  :group 'rmail-spam-filter
  :type 'number)
(defcustom rsf-min-region-to-spam-list 7
  "Smallest region, in characters, that may be added to the spam list.
Shorter regions are refused, because a very short \"contents\"
definition could easily match valid messages and flag them as spam."
  :group 'rmail-spam-filter
  :type 'integer)
(defcustom rsf-autosave-newly-added-definitions nil
  "Whether new spam entries are saved automatically.
When non-nil, the custom file is saved immediately each time an
entry is added via the \"Spam\" menu."
  :group 'rmail-spam-filter
  :type 'boolean)
(defcustom rsf-white-list nil
  "List of regexps to identify valid senders.
If any element matches the \"From\" header, the message is
flagged as a valid, non-spam message.  E.g., if your domain is
\"emacs.com\" then including \"emacs\\\\.com\" in this list would
flag all mail (purporting to be) from your colleagues as valid."
  ;; The elements are handed to `string-match', so declare them as
  ;; regexps: Customize then rejects a malformed pattern at edit time
  ;; rather than letting it raise an error while mail is scanned.
  :type '(repeat regexp)
  :group 'rmail-spam-filter)
(defcustom rsf-definitions-alist nil
  "A list of rules (definitions) matching spam messages.
Each rule is an alist, with elements of the form (FIELD . REGEXP).
The recognized FIELDS are: from, to, subject, content-type,
x-spam-status, and contents.  The \"contents\" element refers to
the entire text of the message; all the other elements refer to
message headers of the same name, except that \"to\" is matched
against the To and Cc headers taken together.

Using an empty-string for REGEXP is the same as omitting that
element altogether.

Each rule should contain one \"action\" element, saying what to do
if the rule is matched.  This has the form (action . CHOICE), where
CHOICE may be either `output-and-delete' (save to `rsf-file', then delete),
or `delete-spam' (just delete).

A rule matches only if all the specified elements match."
  ;; Every field value is used as a regexp, so the `regexp' widget is
  ;; used instead of `string': an invalid pattern is then caught by
  ;; Customize rather than signaling an error during filtering.
  :type '(repeat
          (list :format "%v"
                (cons :format "%v" :value (from . "")
                      (const :format "" from)
                      (regexp :tag "From" ""))
                (cons :format "%v" :value (to . "")
                      (const :format "" to)
                      (regexp :tag "To" ""))
                (cons :format "%v" :value (subject . "")
                      (const :format "" subject)
                      (regexp :tag "Subject" ""))
                (cons :format "%v" :value (content-type . "")
                      (const :format "" content-type)
                      (regexp :tag "Content-Type" ""))
                (cons :format "%v" :value (contents . "")
                      (const :format "" contents)
                      (regexp :tag "Contents" ""))
                (cons :format "%v" :value (x-spam-status . "")
                      (const :format "" x-spam-status)
                      (regexp :tag "X-Spam-Status" ""))
                (cons :format "%v" :value (action . output-and-delete)
                      (const :format "" action)
                      (choice :tag "Action selection"
                              (const :tag "Output and delete" output-and-delete)
                              (const :tag "Delete" delete-spam)))))
  :group 'rmail-spam-filter)
184 ;; FIXME nothing uses this, and it could just be let-bound.
;; Bound to t by `rmail-spam-filter' for the duration of a scan.
(defvar rsf-scanning-messages-now nil
  "Non-nil when `rmail-spam-filter' scans messages.")
188 ;; the advantage over the automatic filter definitions is the AND conjunction
189 ;; of in-one-definition-elements
(defun rsf-check-field (field-symbol message-data definition result)
  "Test one field of a message against one spam rule.
FIELD-SYMBOL names the rule entry to test (e.g. `from' or `to'; see
`rsf-definitions-alist').  MESSAGE-DATA is the value of that field
in the current message, as a string, or nil if the message has no
such field.  DEFINITION is the rule being checked.

RESULT is a cons (MAYBE-SPAM . IS-SPAM) that is updated in place.
Nothing happens when MAYBE-SPAM is already nil, or when DEFINITION
has no entry for FIELD-SYMBOL or only an empty string.

Otherwise IS-SPAM becomes t if MESSAGE-DATA is non-nil and the
rule's regexp matches it; if not, MAYBE-SPAM becomes nil."
  (let ((pattern (cdr (assoc field-symbol definition))))
    ;; The rule can only be ruled out while it is still a candidate
    ;; and actually specifies something for this field.
    (when (and (car result)
               (> (length pattern) 0))
      (if (and message-data
               (string-match pattern message-data))
          ;; This field agrees with the rule; unless some other field
          ;; contradicts it, the message is spam.
          (setcdr result t)
        ;; Mismatch, or the header is missing from the message
        ;; altogether: this rule cannot apply.
        (setcar result nil)))))
(defun rmail-spam-filter (msg)
  "Return nil if message number MSG is spam based on `rsf-definitions-alist'.
If spam, optionally output message to a file `rsf-file' and delete
it from rmail file.  Called for each new message retrieved by
`rmail-get-new-mail'.

The checks are made in this order.  A message with neither a To:
nor a Cc: header is spam if `rsf-no-blind-cc' is non-nil.  A sender
matching any regexp in `rsf-white-list' is never spam, which
overrides the blind-cc check.  Otherwise the rules in
`rsf-definitions-alist' are tried in order, and the first rule whose
specified fields all match selects the action to perform."
  (let (;; Do we want to ignore case in spam definitions.
        (case-fold-search rsf-ignore-case)
        ;; Make sure bbdb does not create entries for messages while
        ;; the spam filter is scanning the rmail file.
        (bbdb/mail_auto_create_p nil)
        ;; Other things may wish to know if we are running (nothing
        ;; uses this at present).
        (rsf-scanning-messages-now t)
        ;; Rules still to be tried; when the scan stops on a match, the
        ;; car is the rule that matched.
        (pending-rules rsf-definitions-alist)
        ;; (MAYBE-SPAM . IS-SPAM), see `rsf-check-field'.  Always a
        ;; fresh cons, never a quoted constant.
        (verdict (cons nil nil))
        (stop-scanning nil)
        msg-from msg-recipients msg-subject msg-content-type
        msg-spam-status msg-text)
    (save-excursion
      ;; Narrow to the header of the message and collect the fields
      ;; that the rules can refer to.
      (save-restriction
        (goto-char (rmail-msgbeg msg))
        (narrow-to-region (point) (progn (search-forward "\n\n") (point)))
        (let ((to (mail-fetch-field "To"))
              (cc (mail-fetch-field "Cc")))
          (setq msg-from (mail-fetch-field "From")
                msg-recipients (or (and to cc (concat to ", " cc))
                                   to
                                   cc)
                msg-subject (mail-fetch-field "Subject")
                msg-content-type (mail-fetch-field "Content-Type")
                msg-spam-status (mail-fetch-field "X-Spam-Status"))))
      ;; Blind CC: skip the rules and flag the message as spam.
      (when (and rsf-no-blind-cc (null msg-recipients))
        (setq stop-scanning t
              verdict (cons t t)))
      ;; White list: skip the rules and flag the message as valid.
      (when (and msg-from
                 (catch 'trusted
                   (dolist (regexp rsf-white-list)
                     (when (string-match regexp msg-from)
                       (throw 'trusted t)))))
        (setq stop-scanning t
              verdict (cons nil nil)))
      ;; Try each rule in turn until one matches.
      (while (and (consp pending-rules) (not stop-scanning))
        (let ((rule (car pending-rules)))
          ;; Start each rule as "possibly spam, nothing matched yet".
          (setq verdict (cons t nil))
          (rsf-check-field 'from msg-from rule verdict)
          (rsf-check-field 'to msg-recipients rule verdict)
          (rsf-check-field 'subject msg-subject rule verdict)
          (rsf-check-field 'content-type msg-content-type rule verdict)
          ;; The full message text is only extracted when a rule that
          ;; is still a candidate has a non-empty `contents' entry,
          ;; and then only once per message.  In every other case
          ;; `rsf-check-field' ignores its data argument anyway.
          (rsf-check-field
           'contents
           (and (car verdict)
                (> (length (cdr (assq 'contents rule))) 0)
                (or msg-text
                    (setq msg-text
                          (buffer-substring-no-properties
                           (rmail-msgbeg msg) (rmail-msgend msg)))))
           rule verdict)
          ;; You will typically look for the "Yes" string in this
          ;; header field.
          (rsf-check-field 'x-spam-status msg-spam-status rule verdict)
          (if (and (car verdict) (cdr verdict))
              (setq stop-scanning t)
            (setq pending-rules (cdr pending-rules)))))
      (let ((spam (and (car verdict) (cdr verdict))))
        (when spam
          ;; Temporarily set rmail-current-message in order to output
          ;; and delete the spam msg if needed.
          ;; NOTE(review): for blind-cc spam no rule was scanned, so
          ;; the action comes from the first rule in the list, and no
          ;; action at all is taken when the list is empty.  Kept as
          ;; is; confirm whether that is the intended policy.
          (let ((rmail-current-message msg)
                (action (cdr (assq 'action (car-safe pending-rules))))
                (newfile (not (file-exists-p rsf-file))))
            (cond
             ((eq action 'output-and-delete)
              ;; Else the prompt to write a new file leaves the raw
              ;; mbox buffer visible.
              (and newfile
                   (rmail-show-message (rmail-first-unseen-message) t))
              (rmail-output rsf-file)
              ;; Swap back, else rmail-get-new-mail-1 gets confused.
              (when newfile
                (rmail-swap-buffers-maybe)
                (widen))
              ;; Don't delete if automatic deletion after output is on.
              (or rmail-delete-after-output (rmail-delete-message)))
             ((eq action 'delete-spam)
              (rmail-delete-message)))))
        ;; nil means "spam", t means "keep".
        (not spam)))))
(defun rmail-get-new-mail-filter-spam (nnew)
  "Check the most NNEW recent messages for spam.
This is called at the end of `rmail-get-new-mail-1' if there is new mail.

Each new message is passed to `rmail-spam-filter'.  If that signals
an error, scanning stops, `rmail-use-spam-filter' is set to nil and
the error is reported.  Otherwise, if spam was found, the user is
told how many messages were affected and they are expunged,
subject to `rmail-expunge-confirmed'.

The return value is a string describing the outcome, which is the
empty string when no spam was found."
  (let* ((nold (- rmail-total-messages nnew))
         (nspam 0)
         (nscan (1+ nold))
         ;; Save the original deleted state of all the messages.
         (rdv-old rmail-deleted-vector)
         errflag errtext)
    ;; Set all messages undeleted so that the expunge only affects spam.
    (setq rmail-deleted-vector (make-string (1+ rmail-total-messages) ?\s))
    (while (and (not errflag) (<= nscan rmail-total-messages))
      (condition-case err
          (or (rmail-spam-filter nscan)
              (setq nspam (1+ nspam)))
        ;; Remember which message failed and why, so that the report
        ;; below can say what went wrong.
        (error (setq errflag nscan
                     errtext (error-message-string err))))
      (setq nscan (1+ nscan)))
    (unwind-protect
        (if errflag
            (progn
              (setq rmail-use-spam-filter nil)
              (if rsf-beep (ding t))
              (message "Spam filter error for new message %d, disabled: %s"
                       errflag errtext)
              (sleep-for rsf-sleep-after-message))
          (when (> nspam 0)
            ;; Otherwise sleep or expunge prompt leaves raw mbox buffer showing.
            (rmail-show-message (or (rmail-first-unseen-message) 1) t)
            (unwind-protect
                (progn
                  (if rsf-beep (ding t))
                  (message "Rmail spam-filter detected and deleted %d spam message%s"
                           nspam (if (= 1 nspam) "" "s"))
                  (sleep-for rsf-sleep-after-message)
                  (if (rmail-expunge-confirmed) (rmail-only-expunge t)))
              ;; Swap back, else get-new-mail-1 gets confused.
              (rmail-swap-buffers-maybe)
              (widen))))
      ;; Restore the original deleted state.  Character N refers to message N.
      (setq rmail-deleted-vector
            (concat (substring rdv-old 0 (1+ nold))
                    ;; This still works if we deleted all the new mail.
                    (substring rmail-deleted-vector (1+ nold)))))
    ;; Return a message based on the number of spam messages found.
    (cond
     (errflag ", error in spam filter")
     ((zerop nspam) "")
     ((= 1 nnew) ", and it appears to be spam")
     ((= nspam nnew) ", and all appear to be spam")
     (t (format ", and %d appear%s to be spam" nspam
                (if (= 1 nspam) "s" ""))))))
406 ;; define functions for interactively adding sender/subject of a
407 ;; specific message to the spam definitions while reading it, using
408 ;; the menubar:
(defun rsf-add-subject-to-spam-list ()
  "Add the \"Subject\" header to the spam list.
The subject of the current message is quoted with `regexp-quote'
and appended to `rsf-definitions-alist' as a new rule whose action
is `output-and-delete'.  The rule is saved at once if
`rsf-autosave-newly-added-definitions' is non-nil.

Signal a user error if the message has no Subject, or an empty one."
  (interactive)
  (let ((subject (rmail-get-header "Subject")))
    ;; A missing header would make `regexp-quote' fail with a type
    ;; error, and an empty one would create a rule that can never match.
    (when (or (null subject) (string= subject ""))
      (user-error "Current message has no Subject to add to the spam list"))
    (let ((message-subject (regexp-quote subject)))
      (add-to-list 'rsf-definitions-alist
                   ;; Empty elements are equivalent to absent ones, but
                   ;; all of them are listed (including x-spam-status)
                   ;; so that the rule has the shape declared by the
                   ;; customization type of `rsf-definitions-alist'.
                   (list '(from . "")
                         '(to . "")
                         `(subject . ,message-subject)
                         '(content-type . "")
                         '(contents . "")
                         '(x-spam-status . "")
                         '(action . output-and-delete))
                   ;; Append, so that existing rules are tried first.
                   t)
      (customize-mark-to-save 'rsf-definitions-alist)
      (if rsf-autosave-newly-added-definitions
          (progn
            (custom-save-all)
            (message "Added subject `%s' to spam list, and saved it"
                     message-subject))
        (message "Added subject `%s' to spam list (remember to save it)"
                 message-subject)))))
(defun rsf-add-sender-to-spam-list ()
  "Add the \"From\" address to the spam list.
The From header of the current message is quoted with
`regexp-quote' and appended to `rsf-definitions-alist' as a new
rule whose action is `output-and-delete'.  The rule is saved at
once if `rsf-autosave-newly-added-definitions' is non-nil.

Signal a user error if the message has no From header, or an empty one."
  (interactive)
  (let ((sender (rmail-get-header "From")))
    ;; A missing header would make `regexp-quote' fail with a type
    ;; error, and an empty one would create a rule that can never match.
    (when (or (null sender) (string= sender ""))
      (user-error "Current message has no sender to add to the spam list"))
    (let ((message-sender (regexp-quote sender)))
      (add-to-list 'rsf-definitions-alist
                   ;; Every field is listed (including x-spam-status)
                   ;; so that the rule has the shape declared by the
                   ;; customization type of `rsf-definitions-alist'.
                   (list `(from . ,message-sender)
                         '(to . "")
                         '(subject . "")
                         '(content-type . "")
                         '(contents . "")
                         '(x-spam-status . "")
                         '(action . output-and-delete))
                   ;; Append, so that existing rules are tried first.
                   t)
      (customize-mark-to-save 'rsf-definitions-alist)
      (if rsf-autosave-newly-added-definitions
          (progn
            (custom-save-all)
            (message "Added sender `%s' to spam list, and saved it"
                     message-sender))
        (message "Added sender `%s' to spam list (remember to save it)"
                 message-sender)))))
(defun rsf-add-region-to-spam-list ()
  "Add the marked region in the Rmail buffer to the spam list.
Adds to spam definitions as a \"contents\" field.

The region must be active and contain at least
`rsf-min-region-to-spam-list' characters; otherwise a message
explains the problem and nothing is added.  The new rule is
appended to `rsf-definitions-alist', and saved at once if
`rsf-autosave-newly-added-definitions' is non-nil."
  (interactive)
  (set-buffer rmail-buffer)
  (cond
   ;; Region is inactive or has zero size.
   ((not (and mark-active (not (= (region-beginning) (region-end)))))
    (message "You must highlight some text in the Rmail buffer"))
   ((< (- (region-end) (region-beginning)) rsf-min-region-to-spam-list)
    (message "Region is too small (minimum %d characters)"
             rsf-min-region-to-spam-list))
   ;; Region is active and long enough: add it to the spam definitions.
   (t
    (let ((region-to-spam-list (regexp-quote
                                (buffer-substring-no-properties
                                 (region-beginning) (region-end)))))
      (add-to-list 'rsf-definitions-alist
                   ;; Every field is listed (including x-spam-status)
                   ;; so that the rule has the shape declared by the
                   ;; customization type of `rsf-definitions-alist'.
                   (list '(from . "")
                         '(to . "")
                         '(subject . "")
                         '(content-type . "")
                         `(contents . ,region-to-spam-list)
                         '(x-spam-status . "")
                         '(action . output-and-delete))
                   ;; Append, so that existing rules are tried first.
                   t)
      (customize-mark-to-save 'rsf-definitions-alist)
      (if rsf-autosave-newly-added-definitions
          (progn
            (custom-save-all)
            (message "Added highlighted text:\n%s\nto the spam list, and saved it"
                     region-to-spam-list))
        (message "Added highlighted text:\n%s\nto the spam list (remember to save it)"
                 region-to-spam-list))))))
(defun rsf-customize-spam-definitions ()
  "Customize `rsf-definitions-alist'."
  (interactive)
  ;; Open a Customize buffer on the list of spam rules.
  (customize-variable
   'rsf-definitions-alist))
(defun rsf-customize-group ()
  "Customize the rmail-spam-filter group."
  (interactive)
  ;; Open a Customize buffer showing every option of this package.
  (customize-group
   'rmail-spam-filter))
;; `custom-save-all' is not a command, so it cannot be bound to a key
;; or put in a menu directly; this wrapper makes that possible.
(defun rsf-custom-save-all ()
  "Interactive version of `custom-save-all'."
  (interactive)
  (custom-save-all))
504 ;; Add menu items (and keyboard shortcuts) to both rmail and rmail-summary.
;; Install the "Spam" menu and the matching `C-c S' key bindings in
;; both the Rmail summary map and the Rmail map.
(dolist (map (list rmail-summary-mode-map rmail-mode-map))
  (easy-menu-define nil map nil
    '("Spam"
      ["Add subject to spam list" rsf-add-subject-to-spam-list]
      ["Add sender to spam list" rsf-add-sender-to-spam-list]
      ["Add region to spam list" rsf-add-region-to-spam-list]
      ["Save spam definitions" rsf-custom-save-all]
      "--"
      ["Customize spam definitions" rsf-customize-spam-definitions]
      ["Browse spam customizations" rsf-customize-group]))
  (define-key map "\C-cSt" 'rsf-add-subject-to-spam-list)
  (define-key map "\C-cSr" 'rsf-add-sender-to-spam-list)
  (define-key map "\C-cSn" 'rsf-add-region-to-spam-list)
  (define-key map "\C-cSa" 'rsf-custom-save-all)
  (define-key map "\C-cSd" 'rsf-customize-spam-definitions)
  (define-key map "\C-cSg" 'rsf-customize-group))
(defun rsf-add-content-type-field ()
  "Maintain backward compatibility for `rmail-spam-filter'.
The most recent version of `rmail-spam-filter' checks the content-type
field of the incoming mail to see if it is spam.  The format of
`rsf-definitions-alist' has therefore changed.  This function
checks to see if the old format is used, and updates it if necessary.

The old format is assumed when the first rule has no content-type
element.  Each rule is then rebuilt with all the fields of the
current format: existing elements are kept, and missing fields
are added with an empty string.  The result is saved at once if
`rsf-autosave-newly-added-definitions' is non-nil."
  (interactive)
  (when (and rsf-definitions-alist
             (not (assoc 'content-type (car rsf-definitions-alist))))
    (setq rsf-definitions-alist
          (mapcar
           (lambda (rule)
             ;; A rule with no action keeps none; `delq' removes the
             ;; nil that `assq' leaves in that case.
             (delq nil
                   (append
                    (mapcar (lambda (field)
                              ;; Keep what the rule already says for
                              ;; FIELD, else add it as "unspecified".
                              (or (assq field rule) (cons field "")))
                            '(from to subject content-type contents
                                   x-spam-status))
                    (list (assq 'action rule)))))
           rsf-definitions-alist))
    (customize-mark-to-save 'rsf-definitions-alist)
    (if rsf-autosave-newly-added-definitions
        (progn
          (custom-save-all)
          (message "Spam definitions converted to new format, and saved"))
      (message "Spam definitions converted to new format (remember to save)"))))
555 (provide 'rmail-spam-filter)
;;; rmail-spam-filter.el ends here