try ripping out expensive computations
[trackgit.git] / mail.py
blob9dcc110f0bbecab0625c811155a49a5d38367050
1 #!/usr/bin/python
3 import sys
4 import re
5 import time
6 import random
7 import mailbox
8 import email.Iterators
9 import email.Parser
10 import email.utils
11 import email.header
12 import sqlalchemy
13 from sqlalchemy.orm import join
14 import cStringIO as StringIO
16 import db
17 import patch
18 from git import git
19 from blobtracker import BlobTracker
20 import dbcache
# Matches one angle-bracketed token such as "<id@host>" and captures the text
# between the brackets.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Return the first bracketed id found in a header value.

    `str` is the raw header text (the name shadows the builtin; it is kept
    so that existing callers are unaffected).  The result is the text between
    the first "<" / ">" pair, or None when the text contains no such pair.
    """
    found = _msg_id_regex.search(str)
    if found is None:
        return None
    return found.group(1)
# Module-level message parser shared by process_mail() and _parse_mail().
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the id of the message that `msg` replies to, or None.

    The bracketed id in the In-Reply-To header is preferred.  When that
    header is missing or carries no "<...>" token, the last id listed in the
    References header(s) is used instead.  None is returned when neither
    header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        reply_id = _parse_msg_id(in_reply_to)
        if reply_id:
            return reply_id
    if msg['References']:
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        # The header may be present without any "<...>" token; do not index
        # into an empty list in that case.
        if ref_ids:
            return ref_ids[-1]
    return None
def _get_text_payloads(msg):
    """Generate the payloads of `msg` that are candidates for text scanning.

    A non-multipart message produces exactly one item: its own payload.  For
    a multipart message, one item is produced per subpart returned by
    email.Iterators.typed_subpart_iterator(msg): the first sub-payload if the
    subpart is itself multipart, otherwise the subpart's payload.  Payloads
    are passed on exactly as get_payload() returns them; no transfer
    decoding is requested.
    """
    if msg.is_multipart():
        for part in email.Iterators.typed_subpart_iterator(msg):
            if part.is_multipart():
                yield part.get_payload(0)
            else:
                yield part.get_payload()
    else:
        yield msg.get_payload()
# Text containing a line that is exactly "---" and, later on, a line that
# starts with "diff --git".
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# Text containing a scissors line (dashes around "8<" or ">8"); group 1 is
# everything after that line and must include a "diff --git" line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return `msg` as a string reduced to its patch, or None without one.

    The payloads from _get_text_payloads(msg) are examined in order.  The
    first payload that matches _format_patch_regex is used whole; failing
    that, the part after a scissors line (_snip_patch_regex) is used.

    Note that this modifies `msg`: the chosen text replaces its payload via
    msg.set_payload() before msg.as_string() is returned.
    """
    for payload in _get_text_payloads(msg):
        if _format_patch_regex.match(payload):
            patch_text = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if not snipped:
                continue
            patch_text = snipped.group(1)
        msg.set_payload(patch_text)
        return msg.as_string()
    # no patch found
    return None
def try_patch(m, pp, base_sha1):
    """Try to apply patch `pp` on top of `base_sha1` and record the result.

    Steps: force-checkout `base_sha1`, call pp.apply(), then pipe `git show`
    into `git patch-id` to obtain a patch id and a commit id.  The db.Commit
    with that sha1 is looked up and, if absent, created from `git log` data
    and handed to blobtracker.scan_commit_tree().  Finally a db.Patch tying
    the commit to mail `m` is added to the session.

    Relies on the module-level `blobtracker` that walk_stale_mail() assigns.

    Returns the new db.Patch, or None when the patch does not apply
    (patch.PatchError) or `git patch-id` yields no output.
    """
    git('checkout', '-f', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    # NOTE(review): presumably pp.apply() leaves the applied patch as the
    # commit that `git show` prints -- confirm against patch.Patch.apply().
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = db.session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        # Commit is not recorded yet: fetch author date, committer date and
        # author identity, separated by tabs.
        output = git('log', '-1', '--pretty=format:%aD\t%cD\t%an <%ae>', commit_id)[0]
        adate_s, cdate_s, author = output.split('\t', 2)
        adate = email.utils.mktime_tz(email.utils.parsedate_tz(adate_s))
        cdate = email.utils.mktime_tz(email.utils.parsedate_tz(cdate_s))
        # Author names are tried as UTF-8 first, then as Latin-1.
        try:
            author = author.decode('utf8')
        except UnicodeDecodeError:
            author = author.decode('latin1')
        c = db.Commit(commit_id, cdate, adate, author, patch_id, False)
        blobtracker.scan_commit_tree(c)
    p = db.Patch(c, m.id, pp.notes)
    db.session.add(p)
    return p
def find_blobs(upstream, blobs):
    """Pick a commit that contains every blob named in `blobs`.

    For each sha1 prefix in `blobs`, the db.Blob whose sha1 starts with that
    prefix is joined to its newest commit, restricted to commits whose
    `upstream` flag equals `upstream`; the row with the latest commit date
    wins.  If any prefix has no match, a message is printed and None is
    returned.  If `blobs` is empty, None is returned as well.  Otherwise the
    result is min() over the collected commits (the ordering is whatever
    db.Commit defines -- TODO confirm).
    """
    commits = []
    for prefix in blobs:
        candidates = (db.session.query(db.Blob, db.Commit)
                      .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
                      .filter(db.Blob.sha1.like(prefix+'%'))
                      .filter(db.Commit.upstream == upstream)
                      .order_by(db.Commit.cdate.desc()))
        row = candidates.first()
        if not row:
            print('blob %s not found?!' % prefix)
            return None
        commits.append(row[1])
    if not commits:
        return None
    # all blobs found
    return min(commits)
def try_patch_anywhere(msg, m):
    """Find a base on which the patch in `msg` applies and record it.

    `msg` is the parsed message and `m` the matching db.Mail.  Bases are
    tried in this order, stopping at the first success:

    1. an upstream commit already known under the patch's fast patch id
       (no application needed);
    2. the commit implied by the patch's pre-image blobs, first among
       upstream commits, then among local ones -- if such a commit exists
       but the patch does not apply to it, the search is abandoned;
    3. the commit recorded for the parent mail's patch id;
    4. origin/master as of the mail's post date;
    5. origin/next.

    Returns the resulting db.Patch, or None when the mail carries no patch,
    the patch touches files that are missing (m.has_patch is then cleared),
    or nothing applied.
    """
    print('* %s' % m.message_id)
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        m.has_patch = False
        return None  # probably for another project

    # perhaps we already know about this patch id
    patch_id = pp.fast_patch_id()
    if patch_id:
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id == patch_id)
               .filter(db.Commit.upstream == True).first())
        if cmt:
            print("patch id %s" % patch_id)
            print("matches commit %s" % cmt.sha1)
            p = db.Patch(cmt, m.id, pp.notes)
            db.session.add(p)
            return p

    # first try on the commit given by the blobs
    for upstream, label in ((True, 'upstream'), (False, 'local')):
        cmt = find_blobs(upstream, pp.blobs_pre)
        if cmt:
            print('trying canonical commit %s (%s)' % (cmt.sha1, label))
            # If it does not apply to the commit it should, this is just
            # hopeless: give up instead of trying other bases.
            return try_patch(m, pp, cmt.sha1)
        print("no canonical commit found (%s)" % label)

    # if we have a parent, try on the parent
    parent = (db.session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id == parent.patch_id)
               .order_by(db.Commit.cdate.desc()).first())
    # The parent may name a patch id for which no commit row exists, so
    # check the query result before using it.
    if cmt:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(m, pp, cmt.sha1)
        if applied:
            return applied
    else:
        print("no parent commit found")

    # try on origin/master
    sha1 = git('rev-list', '--first-parent', '-1', '--before=%d' % m.post_date,
               'origin/master')[0].strip()
    if not sha1:
        sha1 = 'origin/master'
    print('trying on master at time of patch (%s)' % sha1)
    applied = try_patch(m, pp, sha1)
    if applied:
        return applied

    # same for origin/next
    print('trying on origin/next')
    applied = try_patch(m, pp, 'origin/next')
    if applied:
        return applied
    # all out of ideas!
    return None
# Subject prefix that identifies a "What's cooking" status mail.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A line consisting of a bracketed category name, e.g. "[Cooking]".
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header line: "* xx/topic-name (...) N commit(s)"; group 1 is the
# branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(msg, mail):
    """Store per-topic notes from a "What's cooking" mail.

    Does nothing unless the Subject of `msg` matches _whats_cooking_subject.
    Otherwise the text payloads are scanned line by line.  Lines are
    collected into the notes of the current topic; text before the first
    separator is attributed to the topic 'pu'.  A topic header starts a new
    topic, a category line sets the "[category]" prefix for the topics that
    follow, and a separator ends the current topic and clears the category.

    Whenever a topic ends, its db.Topic row (looked up by name, created if
    missing) gets `mail_id` set to mail.id and `cooking_notes` set to the
    collected lines joined by newlines.
    """
    subject = msg["Subject"]
    if not (subject and _whats_cooking_subject.match(subject)):
        return
    category = None
    branch = 'pu'  # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        # Store `notes` on the topic named `branch` (if any) and return the
        # list that the next topic starts with.
        if branch:
            t = db.session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                db.session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    text = ''.join(_get_text_payloads(msg))
    for line in text.splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            # The previous topic has just been stored.  Detach from it so
            # that the next header does not overwrite its notes with only
            # the category line.
            branch = None
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)
    # Store the topic that was still open when the text ended; without this
    # the last topic of a mail lacking a trailing separator is never saved.
    _rotate_notes(category, branch, notes)
def process_mail(mail):
    """Process one stored mail and clear its `stale` flag.

    The mail's raw text (mail.data) is parsed and passed to
    parse_whats_cooking().  Applying patches is currently switched off; the
    disabled calls are kept below for reference.
    """
    raw = StringIO.StringIO(mail.data)
    msg = parser.parse(raw)
    parse_whats_cooking(msg, mail)
    #patch = try_patch_anywhere(msg, mail)
    #if patch:
    #    mail.patch_id = patch.commit.patch_id
    mail.stale = False
def _query_stale_mail():
    """Build the query for all mails flagged stale.

    Results are ordered by ascending post date, then by ascending subject.
    The query is returned unevaluated.
    """
    stale_mail = db.session.query(db.Mail).filter(db.Mail.stale == True)
    return stale_mail.order_by(db.Mail.post_date.asc(), db.Mail.subject.asc())
def walk_stale_mail():
    """Run process_mail() on every stale mail, printing a countdown.

    Also (re)creates the module-level `blobtracker` that try_patch() uses.
    The number printed before each mail is the count of stale mails still
    to be processed, including the current one.
    """
    global blobtracker
    blobtracker = BlobTracker()
    remaining = _query_stale_mail().count()
    for mail in _query_stale_mail():
        print("** %6d\n" % remaining)
        process_mail(mail)
        remaining -= 1
def get_mail_by_id(msg_id):
    """Return the cached mail stored under `msg_id`, or None if there is none.

    The lookup uses item access on dbcache.mail_cache and treats KeyError as
    "not present".
    """
    try:
        cached = dbcache.mail_cache[msg_id]
    except KeyError:
        return None
    return cached
def decode_quoted(s):
    """Decode the RFC 2047 encoded words in header value `s`.

    Each chunk reported by email.header.decode_header() that names a charset
    is decoded with it; the chunks are then concatenated.

    This is best effort: if the header cannot be parsed, names an unknown
    charset, or holds bytes that are invalid for its charset, the original
    `s` is returned unchanged.
    """
    from email.errors import HeaderParseError
    try:
        chunks = []
        # Distinct loop names keep `s` intact, so the fallback below returns
        # the caller's value rather than the last decoded chunk.
        for text, charset in email.header.decode_header(s):
            if charset:
                text = text.decode(charset)
            chunks.append(text)
        return ''.join(chunks)
    except (UnicodeError, LookupError, HeaderParseError):
        return s
# One or more consecutive whitespace characters (newlines included).
_space_regex = re.compile(r'\s+')


def sanitize_single_line(s):
    """Squeeze `s` onto one line of at most 255 characters.

    Every run of whitespace is replaced by a single space and the result is
    cut to its first 255 characters.  None is passed through as None.
    """
    if s is None:
        return None
    collapsed = _space_regex.sub(' ', s)
    return collapsed[:255]
# Archived-At value pointing at the gmane permalink of the git list; group 1
# is the numeric article id.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def insert_mail_into_db(msg):
    """Record the parsed message `msg` as a db.Mail, unless already known.

    A message whose Message-Id is already in the mail cache is skipped.
    Otherwise a mail object is obtained from dbcache.mail_cache and filled
    in from the headers (gmane id, message id, author, post date, subject,
    reply target, raw text).  The mail is flagged `has_patch` and `stale`
    when a patch is detected, and `stale` when the subject marks it as a
    "What's cooking" mail.  Mails in the same thread that carry a patch
    without a known patch id are flagged stale again, and one db.Reference
    is added per id listed in the References header(s).
    """
    raw_msgid = msg.get('Message-Id', None)
    parsed_msgid = None
    if raw_msgid:
        parsed_msgid = _parse_msg_id(raw_msgid)
        if get_mail_by_id(parsed_msgid):
            return  # already exists

    gmane_id = None
    archived_at = msg['Archived-At']
    if archived_at:
        permalink = _gmane_id_regex.match(archived_at)
        if permalink:
            gmane_id = int(permalink.group(1))

    # Without a usable Message-Id, make one up: from the gmane id when there
    # is one, from a random number otherwise.
    if parsed_msgid:
        msgid = parsed_msgid
    elif gmane_id:
        msgid = 'gmane-%d@mailnotes.thomasrast.ch' % gmane_id
    else:
        msgid = 'fallback-%X@mailnotes.thomasrast.ch' % random.randrange(2**32)

    # NOTE(review): mail_cache.get() is used as if it always returns a mail
    # object, also for unseen ids -- confirm against dbcache.
    mail = dbcache.mail_cache.get(msgid)
    mail.gmane_id = gmane_id
    mail.message_id = sanitize_single_line(msgid)

    sender = msg['From']
    if sender:
        name, addr = email.utils.parseaddr(sender)
        if name and addr:
            author = "%s <%s>" % (decode_quoted(name), addr)
        else:
            author = decode_quoted(sender)
        mail.author = sanitize_single_line(author)

    # Fall back to the current time when Date is absent or unparsable.
    parsed_date = None
    if msg['Date']:
        parsed_date = email.utils.parsedate_tz(msg['Date'])
    if parsed_date:
        mail.post_date = email.utils.mktime_tz(parsed_date)
    else:
        mail.post_date = time.time()

    if msg['Subject']:
        mail.subject = sanitize_single_line(decode_quoted(msg['Subject']))
    mail.in_reply_to = sanitize_single_line(_detect_reply_id(msg))
    # Take the raw text before looking for a patch: _guess_patch_contents()
    # replaces the payload of msg when it finds one.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    if msg['Subject'] and _whats_cooking_subject.match(msg['Subject']):
        mail.stale = True
    db.session.add(mail)

    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    unapplied = (db.session.query(db.Mail)
                 .select_from(join(db.Mail, db.Reference,
                                   db.Mail.id == db.Reference.mail_id))
                 .filter(db.Reference.reference_id == starter.message_id)
                 .filter(db.Mail.has_patch == True)
                 .filter(db.Mail.patch_id == None)
                 .filter(db.Mail.message_id != mail.message_id)
                 .filter(db.Mail.stale == False))
    for child in unapplied:
        child.stale = True

    if msg['References']:
        all_refs = ' '.join(msg.get_all('References'))
        for ref in _msg_id_regex.finditer(all_refs):
            db.session.add(db.Reference(mail.id,
                                        sanitize_single_line(ref.group(1))))
def _parse_mail(s):
    """Parse message text `s` with the module-level parser; return the result."""
    buf = StringIO.StringIO(s)
    return parser.parse(buf)
def parse_mbox(fname):
    """Import every message of the mbox file `fname` into the database.

    The file is split at lines starting with 'From news@gmane.org'; the
    separator lines themselves are dropped.  Each chunk is parsed with
    _parse_mail() and passed to insert_mail_into_db(), while a countdown of
    the remaining messages is written to stderr.  The mail cache is flushed
    at the end.

    Text that precedes the first separator is treated as a message of its
    own, so a file that does not start with a separator line is still
    imported.
    """
    mbox_parsed = []
    cur = []
    mbox = open(fname)
    try:
        for line in mbox:
            if line.startswith('From news@gmane.org'):
                if cur:
                    mbox_parsed.append(_parse_mail(''.join(cur)))
                cur = []
            else:
                cur.append(line)
    finally:
        # Release the file handle even if reading or parsing fails.
        mbox.close()
    if cur:
        mbox_parsed.append(_parse_mail(''.join(cur)))

    count = len(mbox_parsed)
    for msg in mbox_parsed:
        insert_mail_into_db(msg)
        count -= 1
        sys.stderr.write("%6d\r" % count)
    sys.stderr.write("\n")
    dbcache.mail_cache.flush()
if __name__ == '__main__':
    # Import each mbox named on the command line, then process all mails
    # flagged stale and commit the session once at the end.
    for mbox_path in sys.argv[1:]:
        parse_mbox(mbox_path)
    walk_stale_mail()
    db.session.commit()