Refactor blob cache code into separate module
[trackgit.git] / mail.py
blobe18eff56fa28ed51db07e05f34333c267ef82b18
1 import sys
2 import re
3 import time
4 import random
5 import mailbox
6 import email.Iterators
7 import email.Parser
8 import email.utils
9 import sqlalchemy
10 from sqlalchemy.orm import join
12 import db
13 import patch
14 from git import git
# Matches one bracketed message id, e.g. "<abc@example.com>", capturing the
# text between the angle brackets.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Extract the bare message id from a header value like ``<id@host>``.

    Returns the text between the first ``<``/``>`` pair found in `str`, or
    None when `str` is None/empty or contains no bracketed id.

    The parameter name shadows the builtin ``str``; it is kept as-is so the
    signature does not change.
    """
    # Looking up an absent header on a message yields None; treat that as
    # "no id" instead of letting re.search() raise TypeError.
    if not str:
        return None
    m = _msg_id_regex.search(str)
    if m:
        return m.group(1)
    return None
# Shared message parser; its parse() method is passed to mailbox.mbox() as
# the message factory in parse_mbox().
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the id of the message that `msg` replies to, or None.

    The In-Reply-To header is preferred.  When it is absent or holds no
    parseable ``<id>``, the last id found in the References header(s) is
    used instead.  None is returned when neither header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        parent_id = _parse_msg_id(in_reply_to)
        if parent_id:
            return parent_id
    if msg['References']:
        # There may be several References headers; scan all of them.
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        # A References header without any <id> would otherwise make the
        # [-1] lookup raise IndexError.
        if ref_ids:
            return ref_ids[-1]
    return None
def _get_text_payloads(msg):
    """Generate the text payloads of `msg`.

    For a non-multipart message the single payload is yielded.  Otherwise
    every subpart whose main content type is ``text`` is visited and its
    payload is yielded; if such a subpart is itself multipart, its first
    sub-payload is yielded instead.
    """
    # The lowercase module name is the spelling shared by Python 2.5+ and 3.
    from email.iterators import typed_subpart_iterator

    if not msg.is_multipart():
        yield msg.get_payload()
        return
    for part in typed_subpart_iterator(msg):
        # Use the loop variable here: the previous code referred to an
        # undefined name, which raised NameError for every multipart mail.
        if part.is_multipart():
            yield part.get_payload(0)
        else:
            yield part.get_payload()
# A format-patch style body: a line consisting of "---" followed, somewhere
# later, by a line starting with "diff --git".
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A patch placed after a scissors line ("-- 8< --" / "-- >8 --"); group 1 is
# everything after that line, which must contain a "diff --git" line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return `msg` serialised with only its patch as payload, or None.

    Each text payload of `msg` is checked in turn.  The first one that looks
    like a format-patch mail, or that carries a patch below a scissors line,
    wins.  Note that `msg` itself is modified: its payload is replaced by the
    patch text before it is serialised with as_string().
    """
    for payload in _get_text_payloads(msg):
        patch_text = None
        if _format_patch_regex.match(payload):
            # The whole payload is the patch.
            patch_text = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if snipped:
                # Keep only what follows the scissors line.
                patch_text = snipped.group(1)
        if patch_text is not None:
            msg.set_payload(patch_text)
            return msg.as_string()
    # no patch found
    return None
def later_unapplied_patches(session, msg):
    """Return the patch-carrying mails that list `msg` among their references.

    `msg` must expose a ``message_id`` attribute.  The lookup joins db.Mail
    to db.Reference on ``Mail.id == Reference.mail_id`` and matches
    ``Reference.reference_id`` against that message id.

    Returns a list of db.Mail objects.
    """
    # Mail must be qualified with the db module, and sqlalchemy has no
    # module-level in_() helper, so the membership test is expressed through
    # the reference table instead.
    # NOTE(review): despite the function name, no patch_id filter was ever
    # applied here; confirm whether callers expect only unapplied mails.
    referencing = join(db.Mail, db.Reference,
                       db.Mail.id == db.Reference.mail_id)
    return (session.query(db.Mail)
            .select_from(referencing)
            .filter(db.Reference.reference_id == msg.message_id)
            .filter(db.Mail.has_patch == True)
            .all())
def try_patch(session, m, pp, commit):
    """Try to apply patch `pp` on top of `commit` and record the result.

    Checks out ``commit.sha1`` and calls ``pp.apply()``.  On success the
    output of ``git show`` is piped through ``git patch-id`` to obtain the
    patch id and commit id.  A db.Commit row for that commit id is created
    if none exists yet, and a db.Patch linking it to mail `m` (via ``m.id``)
    is added to `session`.

    Returns the new db.Patch, or None when the patch did not apply
    (patch.PatchError) or ``git patch-id`` produced no output.
    """
    git('checkout', commit.sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # The format prints the committer date (%ct) first and the author
        # date (%at) second, so unpack in that order.
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def _try_patch_on_ref(session, m, pp, ref):
    """Try patch `pp` on the commit that git ref `ref` resolves to."""
    print('trying on %s' % ref)
    sha1 = git('rev-parse', ref)[0].strip()
    # one() raises unless exactly one db.Commit row has this sha1.
    cmt = session.query(db.Commit).filter(db.Commit.sha1 == sha1).one()
    return try_patch(session, m, pp, cmt)


def try_patch_anywhere(session, msg, m):
    """Try to apply the patch contained in `msg` on several candidate commits.

    Returns None right away when no patch can be extracted from `msg`.
    Otherwise the candidates are tried in this order:

    1. The "canonical" commit: for every blob prefix in ``pp.blobs_pre`` the
       newest upstream commit recorded for a matching blob is looked up.  If
       all prefixes are found, ``min()`` of those commits is tried and the
       outcome of that attempt is final (no further candidates).
    2. The newest commit carrying the patch id of the parent mail (the mail
       whose message id equals ``m.in_reply_to``).
    3. ``origin/master``, then ``origin/next``.

    Returns the db.Patch produced by try_patch(), or None if nothing applied.
    Progress is reported on stdout; single-argument parenthesised print
    behaves identically as a Python 2 statement.
    """
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)

    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix+'%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        # Only reached when no blob lookup failed (the loop did not break).
        if commits:
            # all blobs found
            # NOTE(review): min() relies on how db.Commit objects compare,
            # which is not visible here; confirm the intended ordering.
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            # If it doesn't apply to the commit it should, this is just
            # hopeless: return the result without trying anything else.
            return try_patch(session, m, pp, cmt)
        print("no canonical commit found")

    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (session.query(db.Commit)
               .filter(db.Commit.patch_id == parent.patch_id)
               .order_by(db.Commit.cdate.desc()).first())
    # The query may find no commit for the parent's patch id; guard against
    # dereferencing None.
    if cmt is not None:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(session, m, pp, cmt)
        if applied:
            return applied
    else:
        print("no parent commit found")

    # try on origin/master, then the same for origin/next
    for ref in ('origin/master', 'origin/next'):
        applied = _try_patch_on_ref(session, m, pp, ref)
        if applied:
            return applied

    # all out of ideas!
    return None
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# "[Category name]" on a line of its own.
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# Topic header such as "* ab/topic-name (2010-01-01) 3 commits"; group 1 is
# the branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(session, msg, mail):
    """Record per-topic notes from a "What's cooking in git.git" mail.

    Does nothing unless the Subject of `msg` starts with that phrase.
    Otherwise the text payloads of `msg` are read line by line and split
    into chunks at category lines, topic header lines and separator lines.
    Each chunk is stored as ``cooking_notes`` on the db.Topic named after
    the branch it belongs to (created and added to `session` on demand), and
    that topic's ``mail_id`` is set to ``mail.id``.  Text before the first
    boundary line is attributed to 'pu'; text following a separator is
    dropped until the next topic header.

    Returns None.
    """
    subject = msg["Subject"]
    if not (subject and _whats_cooking_subject.match(subject)):
        return

    def _rotate_notes(category, branch, notes):
        """Store `notes` on topic `branch` (if any) and start a new chunk.

        The returned list is the beginning of the next chunk: it holds the
        "[category]" line when a category is active and is empty otherwise.
        """
        if branch:
            t = session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    category = None
    branch = 'pu'  # initial part goes on 'pu'
    notes = []
    # _get_text_payloads() is the payload helper this module defines; it
    # yields one string per text payload.
    for payload in _get_text_payloads(msg):
        for line in payload.splitlines():
            if _whats_cooking_separator.match(line):
                category = None
                notes = _rotate_notes(category, branch, notes)
                branch = None
                continue
            m = _whats_cooking_category.match(line)
            if m:
                category = m.group(1)
                notes = _rotate_notes(category, branch, notes)
                continue
            m = _whats_cooking_header.match(line)
            if m:
                notes = _rotate_notes(category, branch, notes)
                notes.append(line)
                branch = m.group(1)
                continue
            notes.append(line)
    # Store the final chunk too; otherwise the last topic of a mail that
    # does not end with a separator would be lost.
    if notes:
        _rotate_notes(None, branch, notes)
def parse_mail(session, msg):
    """Store `msg` as a db.Mail row and try to match it to a commit.

    Nothing is done, and an empty list is returned, when a mail with the
    same Message-Id is already stored.  Otherwise a db.Mail is filled in
    from the headers, added to `session`, tried as a patch
    (try_patch_anywhere) and as a "what's cooking" report
    (parse_whats_cooking), and the session is committed.

    Returns a list of ``(mail, referenced_message_id)`` pairs, one per id
    found in the References header(s).  No db.Reference rows are created
    here; parse_mbox() creates them from the returned pairs.
    """
    message_id = _parse_msg_id(msg['Message-Id'])
    if (session.query(db.Mail.message_id)
            .filter(db.Mail.message_id == message_id)
            .first()):
        return []  # already exists

    m = db.Mail()
    m.message_id = message_id
    m.author = msg['From']
    m.in_reply_to = _detect_reply_id(msg)
    # A missing or unparseable Date must not abort the import: fall back to
    # the current time, the same policy insert_mail_into_db() uses.
    tm = None
    if msg['Date']:
        tm = email.utils.parsedate(msg['Date'])
    m.post_date = time.mktime(tm) if tm else time.time()
    # Serialise before _guess_patch_contents(), which replaces the payload
    # of msg when it finds a patch.
    m.data = msg.as_string()
    m.has_patch = bool(_guess_patch_contents(msg))
    session.add(m)

    references = []
    if msg['References']:
        all_refs = ' '.join(msg.get_all('References'))
        references = [(m, ref_id)
                      for ref_id in _msg_id_regex.findall(all_refs)]

    # try patching (the local is not called `patch`, to avoid shadowing the
    # imported patch module)
    applied = try_patch_anywhere(session, msg, m)
    if applied:
        m.patch_id = applied.commit.patch_id
    # try reading a what's cooking message
    parse_whats_cooking(session, msg, m)
    session.commit()
    return references
def get_mail_by_id(session, msg_id):
    """Look up the stored mail whose message id equals `msg_id`.

    Only the ``db.Mail.message_id`` column is selected.  Returns the first
    matching result row, or None when there is no match.
    """
    # first() rather than one(): we don't know that the mail exists, and
    # the DB guarantees uniqueness anyway.
    matching = session.query(db.Mail.message_id).filter(
        db.Mail.message_id == msg_id)
    return matching.first()
# Archived-At value pointing at the gmane permalink of a git list mail;
# group 1 is the numeric gmane article id.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def insert_mail_into_db(msg):
    """Store `msg` as a db.Mail row together with its references.

    Opens its own db.Session.  When a mail with the same (parsed)
    Message-Id already exists, nothing is stored and ``[]`` is returned.
    Otherwise:

    * the mail gets the parsed Message-Id, or a generated
      ``gmane-<n>@...`` / ``fallback-<hex>@...`` id when the header is
      missing or unparseable;
    * author, date (current time if missing or unparseable), subject,
      in-reply-to, raw data and the has_patch/stale flags are filled in;
    * one db.Reference per id in the References header(s) is added;
    * every mail that references the thread starter (the parent if it is
      known, else this mail), has a patch, has no patch_id yet and is not
      already stale, is flagged stale so it will be tried again.

    The session is committed at the end; in that case the function returns
    None.
    """
    session = db.Session()

    msgid = None
    if msg.get('Message-Id', None):
        msgid = _parse_msg_id(msg['Message-Id'])
    if msgid and get_mail_by_id(session, msgid):
        return []  # already exists

    mail = db.Mail()
    gmane_id = None
    if msg['Archived-At']:
        archived = _gmane_id_regex.match(msg['Archived-At'])
        if archived:
            gmane_id = int(archived.group(1))
            mail.gmane_id = gmane_id
    if not msgid:
        # No usable Message-Id: synthesise one.
        if gmane_id:
            msgid = 'gmane-%d@mailnotes.trast.ch' % gmane_id
        else:
            msgid = 'fallback-%X@mailnotes.trast.ch' % random.randrange(2**32)
    # Store the parsed id (without angle brackets) so that it matches what
    # the existence check above and parse_mail() use.
    mail.message_id = msgid

    if msg['From']:
        mail.author = msg['From']
    tm = None
    if msg['Date']:
        tm = email.utils.parsedate(msg['Date'])
    if tm:
        tm = time.mktime(tm)
    else:
        tm = time.time()
    mail.post_date = tm
    if msg['Subject']:
        mail.subject = msg['Subject']
    mail.in_reply_to = _detect_reply_id(msg)
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    session.add(mail)
    # Flush so that mail.id is available for the Reference rows below
    # (presumably the id is assigned by the database; parse_mbox likewise
    # only builds References after a commit).
    session.flush()
    if msg['References']:
        for ref in _msg_id_regex.finditer(' '.join(msg.get_all('References'))):
            session.add(db.Reference(mail.id, ref.group(1)))

    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(session, mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    for child in (session.query(db.Mail)
                  .select_from(join(db.Mail, db.Reference,
                                    db.Mail.id == db.Reference.mail_id))
                  .filter(db.Reference.reference_id == starter.message_id)
                  .filter(db.Mail.has_patch == True)
                  .filter(db.Mail.patch_id == None)
                  .filter(db.Mail.stale == False)):
        child.stale = True
    session.commit()
def parse_mbox(fname):
    """Import every message of the mbox file `fname` into the database.

    All messages are parsed first, then each is handed to parse_mail().
    The (mail, referenced id) pairs it returns are turned into db.Reference
    rows only after the mails have been committed, and a second commit
    stores those rows.
    """
    session = db.Session()
    # Materialise the whole mailbox before touching the database.
    messages = list(mailbox.mbox(fname, parser.parse))
    pending = []
    for message in messages:
        pending += parse_mail(session, message)
    session.commit()
    for mail, ref_id in pending:
        session.add(db.Reference(mail.id, ref_id))
    session.commit()
if __name__ == '__main__':
    # Every command-line argument names an mbox file to import.
    mbox_files = sys.argv[1:]
    for mbox in mbox_files:
        parse_mbox(mbox)