thread merge
[mailman.git] / mailman / pipeline / tagger.py
blob6899e1141f69613389455c9642d908bdbb0b0dfe
1 # Copyright (C) 2001-2008 by the Free Software Foundation, Inc.
3 # This program is free software; you can redistribute it and/or
4 # modify it under the terms of the GNU General Public License
5 # as published by the Free Software Foundation; either version 2
6 # of the License, or (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
16 # USA.
18 """Extract topics from the original mail message."""
# Python 2 idiom: setting the module-level metaclass to `type` makes every
# class defined in this module without explicit bases a new-style class.
__metaclass__ = type
# Names exported by `from <this module> import *`.
__all__ = ['Tagger']
24 import re
25 import email
26 import email.Errors
27 import email.Iterators
28 import email.Parser
30 from zope.interface import implements
32 from mailman.i18n import _
33 from mailman.interfaces import IHandler
# String constants used for joining text throughout this module.
# Regexp alternation operator; process() uses it to join the lines of a
# multi-line topic pattern into a single pattern.
OR = '|'
# Carriage return + newline pair (not referenced in the code visible here).
CRNL = '\r\n'
# Separator for concatenating lines that already carry their own newlines.
EMPTYSTRING = ''
# Newline followed by a tab; used to join multiple values into one folded
# header value.
NLTAB = '\n\t'
def process(mlist, msg, msgdata):
    """Tag `msg` with the names of the list topics it matches.

    Does nothing when `mlist.topics_enabled` is false.  Otherwise the
    Subject: and Keywords: header values, plus any Subject:/Keywords: lines
    that scanbody() finds in the body, are tested against every topic regexp
    in `mlist.topics`.  `mlist.topics_bodylines_limit` controls the body
    scan: 0 disables it, a negative value scans the whole body, and a
    positive value scans at most that many non-blank lines.

    When at least one topic matches, the matching topic names are stored in
    `msgdata['topichits']` and written to an X-Topics: header on `msg`.
    """
    if not mlist.topics_enabled:
        return
    limit = mlist.topics_bodylines_limit
    # Start with the two real headers of interest; missing ones show up as
    # None and are weeded out below.
    candidates = [msg.get('subject', None), msg.get('keywords', None)]
    if limit != 0:
        # A negative limit means "scan every body line", which is what
        # scanbody() does when it is not given a line count.
        scan_args = (msg,) if limit < 0 else (msg, limit)
        candidates.extend(scanbody(*scan_args))
    candidates = [text for text in candidates if text]
    # A topic is hit as soon as any one candidate line matches its regexp.
    # The dictionary is only used as a set of topic names.
    hits = {}
    for name, pattern, desc, emptyflag in mlist.topics:
        # A multi-line pattern is treated as an alternation of its lines.
        cre = re.compile(OR.join(pattern.splitlines()), re.IGNORECASE)
        if any(cre.search(text) for text in candidates):
            hits[name] = 1
    if hits:
        topic_names = hits.keys()
        msgdata['topichits'] = topic_names
        msg['X-Topics'] = NLTAB.join(topic_names)
def scanbody(msg, numlines=None):
    """Return Subject:/Keywords: values found at the top of the message body.

    The body is only examined when `msg` itself is text/plain, or when it is
    a multipart/alternative message containing a text/plain part; in every
    other case an empty list is returned.

    The first `numlines` non-blank lines of the selected text/plain part (all
    of them when `numlines` is None) are parsed as if they were a header
    block, and the values of any subject and keywords pseudo-headers found
    there are returned as a single list, subjects first.
    """
    part = None
    if msg.get_content_type() == 'text/plain':
        part = msg
    elif (msg.is_multipart()
          and msg.get_content_type() == 'multipart/alternative'):
        for subpart in msg.get_payload():
            if subpart.get_content_type() == 'text/plain':
                part = subpart
                break
    # Compare against None explicitly.  Message objects define __len__ as
    # their number of headers, so a text/plain part that carries no headers
    # of its own (text/plain is the default type) would be falsy and wrongly
    # ignored by a plain truth test.
    if part is None:
        return []
    # Collect the leading non-blank lines of the selected part only, so that
    # other alternatives (e.g. text/html) never contribute lines.  Iterating
    # lazily lets us stop as soon as the limit is reached.
    lines = []
    for line in email.Iterators.body_line_iterator(part):
        if numlines is not None and len(lines) >= numlines:
            break
        # Blank lines don't count against the limit and are dropped, so they
        # cannot terminate the pseudo-header block early.
        if line.strip():
            lines.append(line)
    # The collected lines still carry their own newlines; glue them together
    # and let the forgiving parser pull out whatever looks like a header.
    parser = _ForgivingParser()
    parsed = parser.parsestr(EMPTYSTRING.join(lines))
    return parsed.get_all('subject', []) + parsed.get_all('keywords', [])
class _ForgivingParser(email.Parser.HeaderParser):
    """Header parser that quietly stops at the first non-header line.

    Body text only loosely resembles a header block, so rather than raising
    on a malformed line, parsing simply ends there and the header-like lines
    read so far are kept.

    NOTE(review): this only takes effect if the base parser actually calls
    `_parseheaders()`; confirm that for the email package version in use.
    """

    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
    # specialize the way it returns?
    def _parseheaders(self, container, fp):
        """Read header-like lines from `fp` and set them on `container`.

        Reading stops at end of input, at an empty line, at a Unix From_
        line that is not the very first line, at a continuation line with no
        header to continue, or at a line that contains no colon.
        """
        name = ''
        chunks = []
        seen = 0
        while True:
            # Test for end of input before trimming anything, because
            # whitespace-only header lines are RFC compliant continuation
            # lines.
            raw = fp.readline()
            if not raw:
                break
            # Drop the line ending; an empty remainder is the blank line
            # that ends the header block.
            line = raw.splitlines()[0]
            if not line:
                break
            seen += 1
            if line.startswith('From '):
                # A Unix From_ line is only legitimate as the first line.
                if seen != 1:
                    break
                container.set_unixfrom(line)
                continue
            if line.startswith((' ', '\t')):
                # Continuation line; useless without a header to extend.
                if not name:
                    break
                chunks.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to
            # make sure it's a legal header, e.g. doesn't contain spaces.
            # Also, we should expose the header matching algorithm in the
            # API, and allow for a non-strict parsing mode (that ignores the
            # line instead of raising the exception).
            head, colon, rest = line.partition(':')
            if not colon:
                break
            # A new header begins, so flush the one collected so far.
            if name:
                container[name] = NLTAB.join(chunks)
            name = head
            chunks = [rest.lstrip()]
        # Don't lose the header that was still being collected.
        if name:
            container[name] = NLTAB.join(chunks)
class Tagger:
    """Tag messages with topic matches."""

    # Declare that this class provides the IHandler interface.
    implements(IHandler)

    # Handler name, and a human-readable description passed through the i18n
    # `_` function (presumably marking it for translation -- confirm against
    # mailman.i18n).
    name = 'tagger'
    description = _('Tag messages with topic matches.')

    def process(self, mlist, msg, msgdata):
        """See `IHandler`."""
        # Delegate to the module-level process() function; inside this method
        # the bare name resolves to the module global, not to this method.
        process(mlist, msg, msgdata)