make the blobtracker... not a blobtracker any more.
[trackgit.git] / mail.py
blob2ce69d23baec1e2c18b1f4c045f0a5c43993611f
1 #!/usr/bin/python
3 import sys
4 import re
5 import time
6 import random
7 import mailbox
8 import email.Iterators
9 import email.Parser
10 import email.utils
11 import email.header
12 import sqlalchemy
13 from sqlalchemy.orm import join
14 import cStringIO as StringIO
16 import db
17 import patch
18 from git import git
19 from blobtracker import BlobTracker
20 import dbcache
22 _msg_id_regex = re.compile(r'<([^<>]+)>')
23 def _parse_msg_id(str):
24 m = _msg_id_regex.search(str)
25 if m:
26 return m.group(1)
# Module-wide message parser; process_mail() and _parse_mail() both call
# parser.parse() on a file-like object wrapping the raw mail text.
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the message id that *msg* replies to, or None.

    The id in In-Reply-To is preferred.  If that header is missing, or is
    present but carries no ``<id>`` token, the last id listed in the
    References header(s) is used instead.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        parent_id = _parse_msg_id(in_reply_to)
        if parent_id:
            return parent_id
        # In-Reply-To held free text only; fall through to References
        # rather than giving up on threading this mail.
    if msg['References']:
        # Several References headers may be present; treat them as one list.
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        if ref_ids:
            # The last entry is the most direct ancestor.
            return ref_ids[-1]
    return None
39 def _get_text_payloads(msg):
40 if not msg.is_multipart():
41 yield msg.get_payload()
42 return
43 for part in email.Iterators.typed_subpart_iterator(msg):
44 if part.is_multipart():
45 yield part.get_payload(0)
46 else:
47 yield part.get_payload()
# A payload that has a line consisting of '---' and, somewhere after it, a
# line starting with 'diff --git'.
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A payload with a scissors line ('-- 8< --' / '-- >8 --' style) followed by
# text containing a 'diff --git' line; group 1 is everything after the
# scissors line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* as a string reduced to its patch payload, or None.

    The first text payload that looks like a patch wins.  NOTE: on success
    the payload of *msg* itself is replaced (``set_payload``), so the
    caller's message object is modified.
    """
    for payload in _get_text_payloads(msg):
        patch_text = None
        if _format_patch_regex.match(payload):
            # The whole payload is the patch.
            patch_text = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if snipped:
                # Keep only what follows the scissors line.
                patch_text = snipped.group(1)
        if patch_text is not None:
            msg.set_payload(patch_text)
            return msg.as_string()
    # no patch found
    return None
def try_patch(m, pp, base_sha1):
    """Try to apply patch *pp* on top of *base_sha1* and record the result.

    m         -- mail row the patch came from (only ``m.id`` is read here)
    pp        -- parsed patch; ``pp.apply()`` is invoked and ``pp.notes``
                 is stored with the resulting db.Patch
    base_sha1 -- commit-ish that is force-checked-out before applying

    Returns the new db.Patch (already added to the session), or None when
    the patch fails to apply or yields no diff.
    """
    git('checkout', '-f', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    # Pipe `git show` into `git patch-id`; the output is read below as
    # "<patch-id> <commit-id>".
    # NOTE(review): presumably pp.apply() commits, so `git show` describes
    # the freshly applied patch -- confirm in patch.py.
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = db.session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        # Commit not in the database yet: read its dates and author from git.
        output = git('log', '-1', '--pretty=format:%aD\t%cD\t%an <%ae>', commit_id)[0]
        adate_s, cdate_s, author = output.split('\t', 2)
        adate = email.utils.mktime_tz(email.utils.parsedate_tz(adate_s))
        cdate = email.utils.mktime_tz(email.utils.parsedate_tz(cdate_s))
        try:
            author = author.decode('utf8')
        except UnicodeDecodeError:
            # latin1 can decode any byte sequence, so this cannot fail.
            author = author.decode('latin1')
        # NOTE(review): the trailing False is presumably the 'upstream'
        # flag -- confirm against db.Commit.
        c = db.Commit(commit_id, cdate, adate, author, patch_id, False)
        # `blobtracker` is the module global set up by walk_stale_mail().
        blobtracker.scan_commit_tree(c)
    p = db.Patch(c, m.id, pp.notes)
    db.session.add(p)
    return p
def find_blobs(upstream, blobs):
    """Find a commit that contains every blob in *blobs*.

    upstream -- value that ``db.Commit.upstream`` must have
    blobs    -- iterable of blob sha1 prefixes

    For each prefix the matching blob whose newest commit has the latest
    commit date is looked up.  If any prefix has no match, or *blobs* is
    empty, None is returned.  Otherwise the result is ``min()`` of the
    collected commits (ordering as defined by db.Commit, not shown here).
    """
    found = []
    for prefix in blobs:
        row = (db.session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix+'%'))
               .filter(db.Commit.upstream == upstream)
               .order_by(db.Commit.cdate.desc())
               .first())
        if not row:
            print('blob %s not found?!' % prefix)
            return None
        # row is a (Blob, Commit) pair; only the commit is of interest.
        found.append(row[1])
    if not found:
        return None
    # all blobs found
    return min(found)
def try_patch_anywhere(msg, m):
    """Try to match or apply the patch carried by *msg* in several places.

    msg -- parsed email message (its payload is rewritten by
           _guess_patch_contents when a patch is found)
    m   -- the corresponding mail row

    Strategies, in order:
      1. an upstream commit that already has the same patch id,
      2. the commit indicated by the patch's pre-image blobs (upstream
         first, then local); a failure here is final,
      3. the commit recorded for the parent mail's patch,
      4. origin/master as of the mail's post date,
      5. origin/next.

    Returns the resulting db.Patch, or None if there is no patch or
    nothing worked.
    """
    print('* %s' % m.message_id)
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        m.has_patch = False
        return None  # probably for another project

    # perhaps we already know about this patch id
    patch_id = pp.fast_patch_id()
    if patch_id:
        # '== True' is deliberate: it builds an SQL expression.
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id == patch_id)
               .filter(db.Commit.upstream == True)
               .first())
        if cmt:
            print("patch id %s" % patch_id)
            print("matches commit %s" % cmt.sha1)
            p = db.Patch(cmt, m.id, pp.notes)
            db.session.add(p)
            return p

    # first try on the commit given by the blobs
    for upstream, label in ((True, 'upstream'), (False, 'local')):
        cmt = find_blobs(upstream, pp.blobs_pre)
        if cmt:
            print('trying canonical commit %s (%s)' % (cmt.sha1, label))
            # If this fails it is just hopeless: the patch doesn't apply
            # to the commit it should, so no further strategy is tried.
            return try_patch(m, pp, cmt.sha1) or None
        print("no canonical commit found (%s)" % label)

    # if we have a parent, try on the parent
    parent = (db.session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to)
              .first())
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (db.session.query(db.Commit)
               .filter(db.Commit.patch_id == parent.patch_id)
               .order_by(db.Commit.cdate.desc())
               .first())
    if cmt:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(m, pp, cmt.sha1)
        if applied:
            return applied
    else:
        # Also reached when the parent's patch id has no commit row.
        print("no parent commit found")

    # try on origin/master
    sha1 = git('rev-list', '--first-parent', '-1', '--before=%d' % m.post_date,
               'origin/master')[0].strip()
    if not sha1:
        sha1 = 'origin/master'
    print('trying on master at time of patch (%s)' % sha1)
    applied = try_patch(m, pp, sha1)
    if applied:
        return applied

    # same for origin/next
    print('trying on origin/next')
    applied = try_patch(m, pp, 'origin/next')
    if applied:
        return applied

    # all out of ideas!
    return None
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A line that is exactly "[Some Category]".
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header such as "* ab/topic-name (2010-01-01) 3 commits";
# group 1 is the branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the signature marker "-- ".
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(msg, mail):
    """Store per-topic notes from a "What's cooking in git.git" mail.

    msg  -- parsed email message
    mail -- the corresponding mail row; its id is recorded on each topic

    Mails whose subject does not match are ignored.  Otherwise the text is
    walked line by line and split at separator lines, category lines and
    topic headers.  Each run of lines is saved as ``cooking_notes`` on the
    db.Topic named after the current branch (created if missing).  The
    text before the first boundary is attributed to 'pu'.
    """
    subject = msg["Subject"]
    if not (subject and _whats_cooking_subject.match(subject)):
        return

    def _rotate_notes(category, branch, notes):
        # Save what was collected for `branch` (if any) and return a fresh
        # notes list, seeded with the current category line.
        if branch:
            t = db.session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                db.session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    category = None
    branch = 'pu'  # initial part goes on 'pu'
    notes = []
    text = ''.join(_get_text_payloads(msg))
    for line in text.splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            # Nothing is collected until the next topic header.
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)
    # Flush the last topic: without this its notes were lost whenever the
    # mail did not end in a separator line.  A no-op if branch is None.
    _rotate_notes(category, branch, notes)
def process_mail(mail):
    """Re-process one stored mail row and clear its 'stale' flag.

    The raw text in ``mail.data`` is parsed and handed to
    parse_whats_cooking().  Patch matching is currently disabled (see the
    commented-out lines).
    """
    raw = StringIO.StringIO(mail.data)
    msg = parser.parse(raw)
    parse_whats_cooking(msg, mail)
    #patch = try_patch_anywhere(msg, mail)
    #if patch:
    #    mail.patch_id = patch.commit.patch_id
    mail.stale = False
def _query_stale_mail():
    """Return a query for all stale mails, ordered by post date, then subject."""
    # '== True' is deliberate: it builds an SQL expression.
    stale_only = db.session.query(db.Mail).filter(db.Mail.stale == True)
    return stale_only.order_by(db.Mail.post_date.asc(), db.Mail.subject.asc())
def walk_stale_mail():
    """Run process_mail() over every stale mail, printing a countdown.

    Also (re)creates the module-global ``blobtracker``, which try_patch()
    reads.
    """
    global blobtracker
    blobtracker = BlobTracker()
    remaining = _query_stale_mail().count()
    for stale in _query_stale_mail():
        print("** %6d\n" % remaining)
        process_mail(stale)
        remaining -= 1
def get_mail_by_id(msg_id):
    """Return the mail stored under *msg_id* in dbcache.mail_cache, or None.

    The id may well be unknown; a KeyError from the cache lookup is
    translated into None.
    """
    try:
        cached = dbcache.mail_cache[msg_id]
    except KeyError:
        cached = None
    return cached
def decode_quoted(s):
    """Decode the RFC 2047 encoded words in header value *s*.

    Returns the decoded text.  Decoding is best effort: if the header is
    malformed, names an unknown charset, or holds bytes that are invalid
    for its charset, the original *s* is returned unchanged.
    """
    from email.errors import HeaderParseError
    try:
        decoded = []
        # Distinct loop names matter: reusing `s` here used to make the
        # fallback below return a raw fragment instead of the input.
        for fragment, charset in email.header.decode_header(s):
            if charset:
                fragment = fragment.decode(charset)
            decoded.append(fragment)
        return ''.join(decoded)
    except (UnicodeError, LookupError, HeaderParseError):
        # LookupError: unknown charset; HeaderParseError: malformed word.
        return s
_space_regex = re.compile(r'\s+')


def sanitize_single_line(s):
    """Collapse each whitespace run in *s* to one space and cut to 255 chars.

    None is passed through as None.
    """
    if s is None:
        return None
    collapsed = _space_regex.sub(' ', s)
    return collapsed[:255]
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def _gmane_id_from(msg):
    """Return the gmane article number from Archived-At as int, or None."""
    archived_at = msg['Archived-At']
    if archived_at:
        m = _gmane_id_regex.match(archived_at)
        if m:
            return int(m.group(1))
    return None


def _post_date_of(msg):
    """Return the Date header of *msg* as a timestamp, or the current time."""
    date_header = msg['Date']
    if date_header:
        parsed = email.utils.parsedate_tz(date_header)
        if parsed:
            try:
                return email.utils.mktime_tz(parsed)
            except (ValueError, OverflowError):
                # Out-of-range date: treat it like a missing one instead
                # of aborting the whole import.
                pass
    return time.time()


def _mark_unapplied_replies_stale(mail):
    """Flag not-yet-applied patch mails in *mail*'s thread for a retry."""
    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    # The '== True' / '== None' comparisons are deliberate: they build
    # SQL expressions.
    candidates = (db.session.query(db.Mail)
                  .select_from(join(db.Mail, db.Reference,
                                    db.Mail.id == db.Reference.mail_id))
                  .filter(db.Reference.reference_id == starter.message_id)
                  .filter(db.Mail.has_patch == True)
                  .filter(db.Mail.patch_id == None)
                  .filter(db.Mail.message_id != mail.message_id)
                  .filter(db.Mail.stale == False))
    for child in candidates:
        child.stale = True


def _record_references(msg, mail):
    """Add one db.Reference per message id found in References."""
    if msg['References']:
        refs = ' '.join(msg.get_all('References'))
        for m in _msg_id_regex.finditer(refs):
            db.session.add(db.Reference(mail.id,
                                        sanitize_single_line(m.group(1))))


def insert_mail_into_db(msg):
    """Store the parsed email *msg* as a mail row, unless already known.

    Fills in gmane id, message id, author, post date, subject, reply
    target, raw data and the has_patch/stale flags.  Then marks related
    unapplied patch mails as stale and records the References entries.

    When the mail has no usable Message-Id, an id is synthesized from the
    gmane article number, or from a random number as a last resort.
    """
    header_id = msg.get('Message-Id', None)
    parsed_id = _parse_msg_id(header_id) if header_id else None
    if header_id and get_mail_by_id(parsed_id):
        return  # already exists

    gmane_id = _gmane_id_from(msg)
    if parsed_id:
        msgid = parsed_id
    elif gmane_id:
        msgid = 'gmane-%d@mailnotes.thomasrast.ch' % gmane_id
    else:
        msgid = 'fallback-%X@mailnotes.thomasrast.ch' % random.randrange(2**32)

    # NOTE(review): mail_cache.get() evidently returns a mail object to
    # fill in even for an unknown id (presumably creating it) -- confirm
    # in dbcache.
    mail = dbcache.mail_cache.get(msgid)
    mail.gmane_id = gmane_id
    mail.message_id = sanitize_single_line(msgid)

    sender = msg['From']
    if sender:
        name, addr = email.utils.parseaddr(sender)
        if name and addr:
            author = "%s <%s>" % (decode_quoted(name), addr)
        else:
            author = decode_quoted(sender)
        mail.author = sanitize_single_line(author)

    mail.post_date = _post_date_of(msg)

    subject = msg['Subject']
    if subject:
        mail.subject = sanitize_single_line(decode_quoted(subject))
    mail.in_reply_to = sanitize_single_line(_detect_reply_id(msg))

    # Serialize before patch detection: _guess_patch_contents() replaces
    # the payload of msg when it finds a patch.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    if subject and _whats_cooking_subject.match(subject):
        # "What's cooking" mails are processed even without a patch.
        mail.stale = True
    db.session.add(mail)

    _mark_unapplied_replies_stale(mail)
    _record_references(msg, mail)
def _parse_mail(s):
    """Parse the raw mail text *s* with the module-level parser."""
    buf = StringIO.StringIO(s)
    return parser.parse(buf)
def parse_mbox(fname):
    """Import every message of the gmane mbox file *fname* into the database.

    Messages are delimited by lines starting with 'From news@gmane.org'.
    All messages are parsed first, then inserted one by one while a
    countdown is written to stderr.  Finally the mail cache is flushed.
    """
    mbox_parsed = []
    # Lines of the message being collected; None until the first
    # separator line has been seen.
    cur = None
    with open(fname) as mbox:
        for line in mbox:
            if line.startswith('From news@gmane.org'):
                if cur:
                    mbox_parsed.append(_parse_mail(''.join(cur)))
                cur = []
            elif cur is not None:
                cur.append(line)
            # else: text before the first separator belongs to no message
            # and is skipped (this used to crash on None.append).
    if cur:
        mbox_parsed.append(_parse_mail(''.join(cur)))

    count = len(mbox_parsed)
    for msg in mbox_parsed:
        insert_mail_into_db(msg)
        count -= 1
        sys.stderr.write("%6d\r" % count)
    sys.stderr.write("\n")
    dbcache.mail_cache.flush()
if __name__ == '__main__':
    # Import every mbox named on the command line, re-process whatever is
    # flagged stale, then commit the session once at the end.
    mbox_paths = sys.argv[1:]
    for mbox_path in mbox_paths:
        parse_mbox(mbox_path)
    walk_stale_mail()
    db.session.commit()