mail: Be more paranoid about 255-character limit
[trackgit.git] / mail.py
blob7f258ae19693447d4f548c0fe97b33e885f137b9
1 #!/usr/bin/python
3 import sys
4 import re
5 import time
6 import random
7 import mailbox
8 import email.Iterators
9 import email.Parser
10 import email.utils
11 import sqlalchemy
12 from sqlalchemy.orm import join
13 import cStringIO as StringIO
15 import db
16 import patch
17 from git import git
18 from blobtracker import BlobTracker
19 import dbcache
21 _msg_id_regex = re.compile(r'<([^<>]+)>')
22 def _parse_msg_id(str):
23 m = _msg_id_regex.search(str)
24 if m:
25 return m.group(1)
# Module-wide parser instance; used to parse stored mail data and as the
# message factory when reading mbox files.
parser = email.Parser.Parser()


def _detect_reply_id(msg):
    """Return the id of the message that *msg* replies to, or None.

    The In-Reply-To header decides when it is present.  Otherwise the
    last id listed in the References header(s) is used.  None is returned
    when neither header yields an id -- including a References header
    that contains no <...> token, which used to raise IndexError.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        return _parse_msg_id(in_reply_to)
    if msg['References']:
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        if ref_ids:
            # The last reference is the direct parent.
            return ref_ids[-1]
    return None
37 def _get_text_payloads(msg):
38 if not msg.is_multipart():
39 yield msg.get_payload()
40 return
41 for part in email.Iterators.typed_subpart_iterator(msg):
42 if part.is_multipart():
43 yield part.get_payload(0)
44 else:
45 yield part.get_payload()
# A payload that has a line consisting of "---" followed later by a line
# starting with "diff --git".
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A payload with a scissors line ("-- 8< --" / "-- >8 --" style) followed
# by a diff; group 1 is everything after the scissors line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* serialized with only its patch as payload, or None.

    The first text payload that looks like a patch wins: either the whole
    payload (format-patch style) or the part after a scissors line.

    Side effect: when a patch is found, the payload of *msg* itself is
    replaced via set_payload() before serializing.
    """
    for payload in _get_text_payloads(msg):
        if _format_patch_regex.match(payload):
            patch_text = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if not snipped:
                continue
            patch_text = snipped.group(1)
        msg.set_payload(patch_text)
        return msg.as_string()
    # no patch found
    return None
def try_patch(m, pp, base_sha1):
    """Try to apply patch *pp* (taken from mail *m*) on top of *base_sha1*.

    Checks out base_sha1, applies the patch and computes the patch id of
    what `git show` reports afterwards.  If the resulting commit is not
    in the database yet, a db.Commit is created and handed to the global
    blobtracker.  A db.Patch linking the commit to the mail is added to
    the session.

    Returns the db.Patch, or None if the patch does not apply or results
    in no diff.
    """
    git('checkout', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = db.session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer date and %at the author date, in that
        # order; unpack them under matching names.
        # NOTE(review): assumes db.Commit takes (sha1, cdate, adate, ...)
        # as the argument names below suggest -- confirm against db.py.
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        blobtracker.scan_commit_tree(c)
    p = db.Patch(c, m.id, pp.notes)
    db.session.add(p)
    return p
def find_blobs(upstream, blobs):
    """Pick a commit on which all of *blobs* are known, or return None.

    For every blob sha1 prefix, the blob's newest commit with the given
    *upstream* flag is looked up (latest cdate first).  If any prefix has
    no match, a diagnostic is printed and None is returned.  Otherwise
    the minimum of the collected commits is returned (None when *blobs*
    is empty).
    """
    found = []
    for prefix in blobs:
        query = db.session.query(db.Blob, db.Commit)
        query = query.filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
        query = query.filter(db.Blob.sha1.like(prefix+'%'))
        query = query.filter(db.Commit.upstream == upstream)
        row = query.order_by(db.Commit.cdate.desc()).first()
        if not row:
            print('blob %s not found?!' % prefix)
            return None
        found.append(row[1])
    if not found:
        return None
    # all blobs found
    return min(found)
def try_patch_anywhere(msg, m):
    """Find a commit on which the patch in *msg* applies and record it.

    *msg* is the parsed message and *m* the corresponding db.Mail row.
    Candidates are tried in this order:

      1. a commit already known under the patch's fast patch id,
      2. the commit determined by the patch's pre-image blobs, first
         among upstream commits, then among local ones (if such a commit
         exists and the patch does not apply there, we give up),
      3. the commit of the parent mail's patch,
      4. origin/master as of the mail's post date,
      5. origin/next.

    Returns the resulting db.Patch, or None if nothing worked.  Sets
    m.has_patch to False when the patch touches files that are missing.
    """
    print('* %s' % (m.message_id,))
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        m.has_patch = False
        return None  # probably for another project
    # perhaps we already know about this patch id
    patch_id = pp.fast_patch_id()
    if patch_id:
        # Use db.session.query like every other query in this module.
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id==patch_id).first())
        if cmt:
            p = db.Patch(cmt, m.id, pp.notes)
            db.session.add(p)
            return p
    # first try on the commit given by the blobs: upstream, then local
    for upstream, label in ((True, 'upstream'), (False, 'local')):
        cmt = find_blobs(upstream, pp.blobs_pre)
        if cmt:
            print('trying canonical commit %s (%s)' % (cmt.sha1, label))
            applied = try_patch(m, pp, cmt.sha1)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        print("no canonical commit found (%s)" % label)
    # if we have a parent, try on the parent
    parent = db.session.query(db.Mail).filter(db.Mail.message_id==m.in_reply_to).first()
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id==parent.patch_id)
               .order_by(db.Commit.cdate.desc()).first())
    if cmt:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(m, pp, cmt.sha1)
        if applied:
            return applied
    else:
        # Also reached when the parent has a patch id but no commit is
        # stored for it; dereferencing cmt.sha1 would fail there.
        print("no parent commit found")
    # try on origin/master
    sha1 = git('rev-list', '--first-parent', '-1', '--before=%d' % m.post_date,
               'origin/master')[0].strip()
    if not sha1:
        sha1 = 'origin/master'
    print('trying on master at time of patch (%s)' % sha1)
    applied = try_patch(m, pp, sha1)
    if applied:
        return applied
    # same for origin/next
    print('trying on origin/next')
    applied = try_patch(m, pp, 'origin/next')
    if applied:
        return applied
    # all out of ideas!
    return None
# Subject prefix of the "What's cooking in git.git" status mails.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A line that is entirely a bracketed category name, e.g. "[Cooking]".
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header: "* xx/topic-name (...) N commit(s)"; group 1 is the branch.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A section separator: five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(msg, mail):
    """Store per-topic notes from a "What's cooking" mail in db.Topic rows.

    Does nothing unless the subject of *msg* matches
    _whats_cooking_subject.  The text is split into chunks at separator,
    category and topic-header lines.  Each chunk is stored as the
    cooking_notes of the db.Topic named after the branch it belongs to
    (created on demand), with mail_id pointing at *mail*.  Text before
    the first break is attributed to 'pu'.  Every chunk starts with the
    "[category]" line of the category currently in effect, if any.
    """
    if not (msg["Subject"] and _whats_cooking_subject.match(msg["Subject"])):
        return
    category = None
    branch = 'pu' # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        """Save *notes* on *branch* (if any) and return a fresh notes list."""
        if branch:
            t = db.session.query(db.Topic).filter(db.Topic.name==branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                db.session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    text = ''.join(_get_text_payloads(msg))
    for line in text.splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            # The previous topic has just been saved; forget it so the
            # next rotation cannot overwrite its notes with the lines
            # that follow the category heading.
            branch = None
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)
    # Flush the last chunk: the text need not end with a separator, and
    # without this the final topic's notes would be dropped.
    _rotate_notes(category, branch, notes)
def process_mail(mail):
    """Process one stored mail and clear its stale flag.

    The raw data of *mail* is parsed, scanned for "What's cooking" notes
    and tried as a patch.  If a patch could be applied, its commit's
    patch id is stored on the mail.
    """
    raw = StringIO.StringIO(mail.data)
    msg = parser.parse(raw)
    parse_whats_cooking(msg, mail)
    # Named 'applied' so the local does not shadow the 'patch' module.
    applied = try_patch_anywhere(msg, mail)
    if applied:
        mail.patch_id = applied.commit.patch_id
    mail.stale = False
def _query_stale_mail():
    """Return a query for all stale mails, oldest first (ties by subject)."""
    # '== True' builds an SQL expression here; it must not become 'is True'.
    stale_mails = db.session.query(db.Mail).filter(db.Mail.stale == True)
    return stale_mails.order_by(db.Mail.post_date.asc(),
                                db.Mail.subject.asc())
def walk_stale_mail():
    """Process every stale mail, printing a countdown of mails left.

    Also (re)creates the module-global blobtracker that try_patch uses.
    """
    global blobtracker
    blobtracker = BlobTracker()
    total = _query_stale_mail().count()
    for done, mail in enumerate(_query_stale_mail()):
        print("** %6d\n" % (total - done))
        process_mail(mail)
def get_mail_by_id(msg_id):
    """Return the entry stored in dbcache.mail_cache under *msg_id*.

    Returns None when the lookup raises KeyError, since the mail is not
    known to exist.
    """
    try:
        cached = dbcache.mail_cache[msg_id]
    except KeyError:
        cached = None
    return cached
# Any run of whitespace, including newlines from folded headers.
_space_regex = re.compile(r'\s+')


def sanitize_single_line(s, max_len=255):
    """Collapse whitespace runs in *s* to single spaces and truncate.

    The result is cut to at most *max_len* characters (255 by default).
    None is passed through unchanged, so optional values (e.g. a missing
    reply id) can be sanitized without a TypeError.
    """
    if s is None:
        return None
    return _space_regex.sub(' ', s)[:max_len]
# Archived-At value pointing at the gmane archive of the git list; group 1
# is the numeric gmane article id.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def insert_mail_into_db(msg):
    """Store the parsed message *msg* as a db.Mail unless already known.

    The mail is keyed by its Message-Id; if that is missing or
    unparsable, an id is synthesized from the gmane article number or,
    failing that, from a random number.  Single-line fields go through
    sanitize_single_line.  The mail is marked stale when it contains a
    patch or is a "What's cooking" mail.  Unapplied, non-stale patch
    mails that reference the thread parent (or this mail, when there is
    no known parent) are marked stale again.  One db.Reference is added
    per id in the References header.
    """
    raw_msgid = msg.get('Message-Id', None)
    parsed_msgid = None
    if raw_msgid:
        parsed_msgid = _parse_msg_id(raw_msgid)
    if parsed_msgid and get_mail_by_id(parsed_msgid):
        return # already exists
    gmane_id = None
    archived_at = msg['Archived-At']
    if archived_at:
        m = _gmane_id_regex.match(archived_at)
        if m:
            gmane_id = int(m.group(1))
    if parsed_msgid:
        msgid = parsed_msgid
    elif gmane_id:
        msgid = 'gmane-%d@mailnotes.trast.ch' % gmane_id
    else:
        msgid = 'fallback-%X@mailnotes.trast.ch' % random.randrange(2**32)
    mail = dbcache.mail_cache.get(msgid)
    mail.gmane_id = gmane_id
    mail.message_id = sanitize_single_line(msgid)
    if msg['From']:
        mail.author = sanitize_single_line(msg['From'])
    tm = None
    if msg['Date']:
        parsed_date = email.utils.parsedate_tz(msg['Date'])
        if parsed_date:
            # mktime_tz takes the parsedate_tz tuple itself; do not run it
            # through time.mktime first.
            try:
                tm = email.utils.mktime_tz(parsed_date)
            except (ValueError, OverflowError):
                # Date parsed but is out of range: treat like no date.
                tm = None
    if tm is None:
        tm = time.time()
    mail.post_date = tm
    if msg['Subject']:
        mail.subject = sanitize_single_line(msg['Subject'])
    # Mails that are not replies have no reply id; keep None in that case.
    reply_id = _detect_reply_id(msg)
    if reply_id:
        mail.in_reply_to = sanitize_single_line(reply_id)
    else:
        mail.in_reply_to = None
    # Serialize before _guess_patch_contents(), which replaces the payload
    # of msg when it finds a patch.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    if msg['Subject'] and _whats_cooking_subject.match(msg['Subject']):
        mail.stale = True
    db.session.add(mail)
    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    for child in (db.session.query(db.Mail)
                  .select_from(join(db.Mail, db.Reference,
                                    db.Mail.id == db.Reference.mail_id))
                  .filter(db.Reference.reference_id == starter.message_id)
                  .filter(db.Mail.has_patch == True)
                  .filter(db.Mail.patch_id == None)
                  .filter(db.Mail.message_id != mail.message_id)
                  .filter(db.Mail.stale == False)):
        child.stale = True
    if msg['References']:
        for m in _msg_id_regex.finditer(' '.join(msg.get_all('References'))):
            db.session.add(db.Reference(mail.id,
                                        sanitize_single_line(m.group(1))))
def parse_mbox(fname):
    """Insert every message of the mbox file *fname* into the database.

    All messages are parsed up front so a countdown of the messages left
    can be written to stderr.  The mail cache is flushed at the end.
    """
    messages = list(mailbox.mbox(fname, parser.parse))
    total = len(messages)
    for index, msg in enumerate(messages):
        insert_mail_into_db(msg)
        sys.stderr.write("%6d\r" % (total - index - 1))
    sys.stderr.write("\n")
    dbcache.mail_cache.flush()
if __name__ == '__main__':
    # Every command-line argument is an mbox file to import; afterwards
    # all stale mails are processed and the session is committed once.
    mbox_paths = sys.argv[1:]
    for mbox_path in mbox_paths:
        parse_mbox(mbox_path)
    walk_stale_mail()
    db.session.commit()