Track non-upstream commits in blob/file cache
[trackgit.git] / mail.py
blobce3ad5bfeb8d1cca3397a00192f809aaa28d27c0
1 import sys
2 import re
3 import time
4 import random
5 import mailbox
6 import email.Iterators
7 import email.Parser
8 import email.utils
9 import sqlalchemy
10 from sqlalchemy.orm import join
11 import cStringIO as StringIO
13 import db
14 import patch
15 from git import git
17 _msg_id_regex = re.compile(r'<([^<>]+)>')
18 def _parse_msg_id(str):
19 m = _msg_id_regex.search(str)
20 if m:
21 return m.group(1)
# Module-wide parser instance; process_mail() and parse_mbox() both use it
# to turn raw mail text into message objects.
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the id of the message that *msg* replies to, or None.

    The In-Reply-To header is preferred.  If it is absent or contains no
    angle-bracketed id, the last id listed in the References header(s) is
    used instead.  None is returned when neither header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        parent_id = _parse_msg_id(in_reply_to)
        if parent_id:
            return parent_id
    if msg['References']:
        # There may be several References headers; treat them as one list.
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        # A References header without any "<id>" token must not crash us.
        if ref_ids:
            return ref_ids[-1]
    return None
33 def _get_text_payloads(msg):
34 if not msg.is_multipart():
35 yield msg.get_payload()
36 return
37 for part in email.Iterators.typed_subpart_iterator(msg):
38 if part.is_multipart():
39 yield part.get_payload(0)
40 else:
41 yield part.get_payload()
# A "---" line followed (anywhere later) by a line starting "diff --git".
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A scissors line ("-- 8< --" or "-- >8 --" style); group 1 is everything
# after it, which must contain a line starting "diff --git".
_snip_patch_regex = re.compile(
    '.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
    re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* rendered as a string with a patch as its payload.

    The first text payload that looks like a patch wins: either the whole
    payload, or only the portion below a scissors line.  Note that the
    payload of *msg* itself is replaced in the process.  Returns None if
    no payload looks like a patch.
    """
    for text in _get_text_payloads(msg):
        if _format_patch_regex.match(text):
            body = text
        else:
            snipped = _snip_patch_regex.match(text)
            if not snipped:
                continue
            body = snipped.group(1)
        msg.set_payload(body)
        return msg.as_string()
    # no patch found
    return None
# Shared tracker instance; try_patch() calls its scan_commit() method.
# NOTE(review): no import visible in this file brings BlobTracker into
# scope -- confirm where it is defined, otherwise this line raises
# NameError as soon as the module is loaded.
blobtracker = BlobTracker()
def try_patch(session, m, pp, base_sha1):
    """Try to apply patch *pp* on top of *base_sha1* and record the result.

    session   -- database session used for lookups and for adding rows
    m         -- the mail row the patch came from (only m.id is used)
    pp        -- patch object; provides apply() and notes
    base_sha1 -- commit-ish that is checked out before applying

    Returns the newly added db.Patch on success.  Returns None if the
    patch does not apply or if the result has no patch id.
    """
    git('checkout', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    # Feed "git show" into "git patch-id" to learn the patch id and the id
    # of the commit being shown.
    # NOTE(review): presumably pp.apply() commits, so this is the new HEAD.
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer timestamp and %at the author timestamp, so
        # the committer date comes first in the output.
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
        blobtracker.scan_commit(commit_id)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Find a base commit on which the patch contained in *msg* applies.

    session -- database session used for all lookups
    msg     -- parsed message object of the mail
    m       -- the mail row (message_id, in_reply_to and id are used)

    Bases are tried in this order:
      1. the commit derived from the patch's pre-image blobs, if every
         blob is known; when this fails nothing else is tried,
      2. the newest commit carrying the patch id of the mail replied to,
      3. origin/master, then origin/next.

    Returns whatever try_patch() returned for the first base that worked,
    or None if there is no usable patch or it applies nowhere.
    """
    print('* %s' % (m.message_id,))
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        return None  # probably for another project

    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix+'%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        # Only reached when the loop did not break, i.e. no blob is missing.
        if commits:
            # all blobs found
            # NOTE(review): min() depends on how db.Commit instances
            # compare; confirm that this orders them as intended (by
            # commit date?).
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            applied = try_patch(session, m, pp, cmt.sha1)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        print("no canonical commit found")

    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    parent_commit = None
    if parent and parent.has_patch and parent.patch_id:
        parent_commit = (session.query(db.Commit)
                         .filter(db.Commit.patch_id == parent.patch_id)
                         .order_by(db.Commit.cdate.desc()).first())
    if parent_commit is not None:
        print('trying to apply on parent %s' % parent_commit.sha1)
        applied = try_patch(session, m, pp, parent_commit.sha1)
        if applied:
            return applied
    else:
        # Also covers a parent whose patch id matches no stored commit.
        print("no parent commit found")

    # try on origin/master, then the same for origin/next
    for ref in ('origin/master', 'origin/next'):
        print('trying on %s' % ref)
        applied = try_patch(session, m, pp, ref)
        if applied:
            return applied
    # all out of ideas!
    return None
# Subject prefix that identifies a status mail.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A line consisting solely of "[Category name]".
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header such as "* xx/topic-name (...) 3 commits"; group 1 is the
# topic branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(session, msg, mail):
    """Store per-topic notes from a "What's cooking in git.git" mail.

    session -- database session used to look up and add db.Topic rows
    msg     -- parsed message object of the mail
    mail    -- the mail row; its id is recorded on every topic touched

    Mails whose subject does not match are ignored.  Otherwise the text is
    walked line by line; each topic header starts a new set of notes, and
    the lines collected so far are saved as the cooking_notes of the
    previous topic.  Text before the first separator is stored on 'pu'.
    """
    subject = msg["Subject"]
    if not (subject and _whats_cooking_subject.match(subject)):
        return
    category = None
    branch = 'pu' # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        # Save *notes* on the topic named *branch* (if any), then return a
        # fresh list that starts with the current category tag (if any).
        if branch:
            t = session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    for payload in _get_text_payloads(msg):
        for line in payload.splitlines():
            if _whats_cooking_separator.match(line):
                category = None
                notes = _rotate_notes(category, branch, notes)
                branch = None
                continue
            m = _whats_cooking_category.match(line)
            if m:
                category = m.group(1)
                notes = _rotate_notes(category, branch, notes)
                continue
            m = _whats_cooking_header.match(line)
            if m:
                notes = _rotate_notes(category, branch, notes)
                notes.append(line)
                branch = m.group(1)
                continue
            notes.append(line)
    # The mail need not end with a separator; save what is still pending.
    # This is a no-op when branch was already reset to None.
    _rotate_notes(category, branch, notes)
def process_mail(session, mail):
    """Re-examine one stored mail and clear its stale flag.

    The raw text in mail.data is parsed, scanned for "What's cooking"
    notes, and then tried as a patch.  If a patch could be applied, its
    patch id is recorded on the mail.
    """
    raw = StringIO.StringIO(mail.data)
    msg = parser.parse(raw)
    parse_whats_cooking(session, msg, mail)
    applied = try_patch_anywhere(session, msg, mail)
    if applied:
        mail.patch_id = applied.commit.patch_id
    mail.stale = False
def _query_stale_mail(session):
    """Build the query for all mails that carry a patch and are stale.

    Results are ordered by post date, then subject, both ascending.
    """
    pending = session.query(db.Mail)
    # "== True" builds an SQL expression here; do not replace with "is".
    pending = pending.filter(db.Mail.has_patch == True)
    pending = pending.filter(db.Mail.stale == True)
    return pending.order_by(db.Mail.post_date.asc(), db.Mail.subject.asc())
def walk_stale_mail():
    """Process every stale patch mail, showing a countdown on stderr.

    The stale mails are fetched once up front, and the session is
    committed after each mail so that recorded patches and cleared stale
    flags are persisted.
    """
    session = db.Session()
    stale_mails = list(_query_stale_mail(session))
    remaining = len(stale_mails)
    for mail in stale_mails:
        process_mail(session, mail)
        # Persist per mail, matching insert_mail_into_db(); progress made
        # so far then survives an interruption.
        session.commit()
        remaining -= 1
        sys.stderr.write("%6d\r" % remaining)
    sys.stderr.write("\n")
def get_mail_by_id(session, msg_id):
    """Return the stored mail whose message id is *msg_id*, or None."""
    # Note: first() rather than one(), because the mail may not exist.
    # The DB guarantees uniqueness anyway.
    matches = session.query(db.Mail).filter(db.Mail.message_id == msg_id)
    return matches.first()
# Archived-At value pointing at the gmane archive of the git list; group 1
# is the numeric article id.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def insert_mail_into_db(msg):
    """Store the parsed message *msg* as a db.Mail row, unless present.

    A mail whose Message-Id is already stored is skipped.  Otherwise the
    headers are copied onto a new row, the patch flags are set, unapplied
    patch mails in the same thread are flagged stale again, and one
    db.Reference row is added per id in the References header(s).

    Note that the payload of *msg* may be replaced as a side effect of the
    patch detection.
    """
    session = db.Session()
    raw_msgid = msg.get('Message-Id', None)
    msgid = _parse_msg_id(raw_msgid) if raw_msgid else None
    if msgid and get_mail_by_id(session, msgid):
        return # already exists

    mail = db.Mail()
    archived_at = msg['Archived-At']
    if archived_at:
        m = _gmane_id_regex.match(archived_at)
        if m:
            mail.gmane_id = int(m.group(1))

    if not msgid:
        # No usable Message-Id: synthesize one, from the gmane id if known.
        if mail.gmane_id:
            msgid = 'gmane-%d@mailnotes.trast.ch' % mail.gmane_id
        else:
            msgid = 'fallback-%X@mailnotes.trast.ch' % random.randrange(2**32)
    mail.message_id = msgid

    if msg['From']:
        mail.author = msg['From'][:255]

    tm = None
    if msg['Date']:
        # parsedate_tz/mktime_tz honour the UTC offset in the header;
        # parsedate + time.mktime would read the time as local time.
        parsed = email.utils.parsedate_tz(msg['Date'])
        if parsed:
            try:
                tm = email.utils.mktime_tz(parsed)
            except (ValueError, OverflowError):
                tm = None  # nonsensical date; fall back to "now"
    if tm is None:
        tm = time.time()
    mail.post_date = tm

    if msg['Subject']:
        mail.subject = msg['Subject'][:255]
    mail.in_reply_to = _detect_reply_id(msg)
    # Capture the raw text before the patch detection below touches msg.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    session.add(mail)

    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(session, mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    # ("== True" / "== None" build SQL expressions; do not use "is".)
    unapplied = (session.query(db.Mail)
                 .select_from(join(db.Mail, db.Reference,
                                   db.Mail.id == db.Reference.mail_id))
                 .filter(db.Reference.reference_id == starter.message_id)
                 .filter(db.Mail.has_patch == True)
                 .filter(db.Mail.patch_id == None)
                 .filter(db.Mail.message_id != mail.message_id)
                 .filter(db.Mail.stale == False))
    for child in unapplied:
        child.stale = True
    # mail.id is read below, so the new row has to be written out first.
    session.commit()

    if msg['References']:
        refs = ' '.join(msg.get_all('References'))
        for m in _msg_id_regex.finditer(refs):
            session.add(db.Reference(mail.id, m.group(1)))
        session.commit()
def parse_mbox(fname):
    """Insert every message of the mbox file *fname* into the database.

    Messages are parsed and inserted one at a time; a countdown of the
    messages still to go is written to stderr.
    """
    mbox = mailbox.mbox(fname, parser.parse)
    try:
        remaining = len(mbox)
        # Iterate the mailbox directly so that only one parsed message is
        # held in memory at a time.
        for msg in mbox:
            insert_mail_into_db(msg)
            remaining -= 1
            sys.stderr.write("%6d\r" % remaining)
    finally:
        mbox.close()
    sys.stderr.write("\n")
# Script entry point: import every mbox file named on the command line,
# then retry all mails that are currently flagged as stale.
if __name__ == '__main__':
    for mbox in sys.argv[1:]:
        parse_mbox(mbox)
    walk_stale_mail()