mailman/pipeline/tagger.py

   1 # Copyright (C) 2001-2008 by the Free Software Foundation, Inc.
   2 #
   3 # This program is free software; you can redistribute it and/or
   4 # modify it under the terms of the GNU General Public License
   5 # as published by the Free Software Foundation; either version 2
   6 # of the License, or (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  16 # USA.
  17
  18 """Extract topics from the original mail message."""
  19
# Python 2 idiom: make every class defined in this module a new-style class.
__metaclass__ = type
# Names exported by `from <module> import *`; only the handler class is public.
__all__ = ['Tagger']
  22
  23
  24 import re
  25 import email
  26 import email.Errors
  27 import email.Iterators
  28 import email.Parser
  29
  30 from zope.interface import implements
  31
  32 from mailman.i18n import _
  33 from mailman.interfaces import IHandler
  34
  35
# String constants shared by the functions below.
#
# Joins the individual lines of a topic pattern into one regexp alternation.
OR = '|'
# Carriage return + newline; not referenced by the code shown in this module.
CRNL = '\r\n'
# Joiner used to glue the scanned body lines back into a single string.
EMPTYSTRING = ''
# Newline + tab: joins the pieces of a multi-part header value so that each
# following piece starts with whitespace, i.e. looks like a continuation line.
NLTAB = '\n\t'
  40
  41
  42 \f
  43 def process(mlist, msg, msgdata):
  44     if not mlist.topics_enabled:
  45         return
  46     # Extract the Subject:, Keywords:, and possibly body text
  47     matchlines = []
  48     matchlines.append(msg.get('subject', None))
  49     matchlines.append(msg.get('keywords', None))
  50     if mlist.topics_bodylines_limit == 0:
  51         # Don't scan any body lines
  52         pass
  53     elif mlist.topics_bodylines_limit < 0:
  54         # Scan all body lines
  55         matchlines.extend(scanbody(msg))
  56     else:
  57         # Scan just some of the body lines
  58         matchlines.extend(scanbody(msg, mlist.topics_bodylines_limit))
  59     matchlines = filter(None, matchlines)
  60     # For each regular expression in the topics list, see if any of the lines
  61     # of interest from the message match the regexp.  If so, the message gets
  62     # added to the specific topics bucket.
  63     hits = {}
  64     for name, pattern, desc, emptyflag in mlist.topics:
  65         pattern = OR.join(pattern.splitlines())
  66         cre = re.compile(pattern, re.IGNORECASE)
  67         for line in matchlines:
  68             if cre.search(line):
  69                 hits[name] = 1
  70                 break
  71     if hits:
  72         msgdata['topichits'] = hits.keys()
  73         msg['X-Topics'] = NLTAB.join(hits.keys())
  74
  75
  76 \f
  77 def scanbody(msg, numlines=None):
  78     # We only scan the body of the message if it is of MIME type text/plain,
  79     # or if the outer type is multipart/alternative and there is a text/plain
  80     # part.  Anything else, and the body is ignored for header-scan purposes.
  81     found = None
  82     if msg.get_content_type() == 'text/plain':
  83         found = msg
  84     elif msg.is_multipart()\
  85          and msg.get_content_type() == 'multipart/alternative':
  86         for found in msg.get_payload():
  87             if found.get_content_type() == 'text/plain':
  88                 break
  89         else:
  90             found = None
  91     if not found:
  92         return []
  93     # Now that we have a Message object that meets our criteria, let's extract
  94     # the first numlines of body text.
  95     lines = []
  96     lineno = 0
  97     reader = list(email.Iterators.body_line_iterator(msg))
  98     while numlines is None or lineno < numlines:
  99         try:
 100             line = reader.pop(0)
 101         except IndexError:
 102             break
 103         # Blank lines don't count
 104         if not line.strip():
 105             continue
 106         lineno += 1
 107         lines.append(line)
 108     # Concatenate those body text lines with newlines, and then create a new
 109     # message object from those lines.
 110     p = _ForgivingParser()
 111     msg = p.parsestr(EMPTYSTRING.join(lines))
 112     return msg.get_all('subject', []) + msg.get_all('keywords', [])
 113
 114
 115 \f
 116 class _ForgivingParser(email.Parser.HeaderParser):
 117     # Be a little more forgiving about non-header/continuation lines, since
 118     # we'll just read as much as we can from "header-like" lines in the body.
 119     #
 120     # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
 121     # specialize the way it returns?
 122     def _parseheaders(self, container, fp):
 123         # Parse the headers, returning a list of header/value pairs.  None as
 124         # the header means the Unix-From header.
 125         lastheader = ''
 126         lastvalue = []
 127         lineno = 0
 128         while 1:
 129             # Don't strip the line before we test for the end condition,
 130             # because whitespace-only header lines are RFC compliant
 131             # continuation lines.
 132             line = fp.readline()
 133             if not line:
 134                 break
 135             line = line.splitlines()[0]
 136             if not line:
 137                 break
 138             # Ignore the trailing newline
 139             lineno += 1
 140             # Check for initial Unix From_ line
 141             if line.startswith('From '):
 142                 if lineno == 1:
 143                     container.set_unixfrom(line)
 144                     continue
 145                 else:
 146                     break
 147             # Header continuation line
 148             if line[0] in ' \t':
 149                 if not lastheader:
 150                     break
 151                 lastvalue.append(line)
 152                 continue
 153             # Normal, non-continuation header.  BAW: this should check to make
 154             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
 155             # should expose the header matching algorithm in the API, and
 156             # allow for a non-strict parsing mode (that ignores the line
 157             # instead of raising the exception).
 158             i = line.find(':')
 159             if i < 0:
 160                 break
 161             if lastheader:
 162                 container[lastheader] = NLTAB.join(lastvalue)
 163             lastheader = line[:i]
 164             lastvalue = [line[i+1:].lstrip()]
 165         # Make sure we retain the last header
 166         if lastheader:
 167             container[lastheader] = NLTAB.join(lastvalue)
 168
 169
 170 \f
class Tagger:
    """Tag messages with topic matches.

    Handler wrapper that exposes the module-level `process()` function
    through the `IHandler` interface.
    """

    # Declare (via zope.interface) that this class provides IHandler.
    implements(IHandler)

    # Handler name, and a description marked for translation with `_`.
    name = 'tagger'
    description = _('Tag messages with topic matches.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`."""
        # Inside a method body the bare name `process` resolves to the
        # module-level function, not to this method, so this delegates.
        process(mlist, msg, msgdata)