Re-add dropped line to parse_mbox()
[trackgit.git] / mail.py
blob421289fc5fc6025f218930d51b794194f40bbdab
1 import sys
2 import re
3 import time
4 import random
5 import mailbox
6 import email.Iterators
7 import email.Parser
8 import email.utils
9 import sqlalchemy
10 from sqlalchemy.orm import join
12 import db
13 import patch
14 from git import git
# Matches one angle-bracketed message id; group 1 is the id without
# the surrounding '<' and '>'.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Return the first ``<...>``-delimited id found in *str*.

    The returned id has its angle brackets stripped.  Returns None when
    *str* contains no bracketed id.
    """
    found = _msg_id_regex.search(str)
    return found.group(1) if found is not None else None
# Shared module-level email parser instance; the functions below use it
# to turn stored mail data and mbox entries into message objects.
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the message id that *msg* replies to, or None.

    The In-Reply-To header is preferred.  If it is missing, or contains
    no ``<...>`` id, the last id listed in the References header(s) is
    used instead.  Returns None when neither header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        parent_id = _parse_msg_id(in_reply_to)
        if parent_id:
            return parent_id
        # Unparsable In-Reply-To: fall through to References rather
        # than giving up.
    references = msg.get_all('References')
    if references:
        ref_ids = _msg_id_regex.findall(' '.join(references))
        # A References header without any bracketed id used to raise
        # IndexError here.
        if ref_ids:
            return ref_ids[-1]
    return None
def _get_text_payloads(msg):
    """Yield the payload of each text part of *msg*.

    A non-multipart message yields its single payload.  For a multipart
    message, every ``text/*`` subpart found by
    ``typed_subpart_iterator`` is visited; a subpart that is itself
    multipart yields its first sub-payload, any other yields its own
    payload.
    """
    if not msg.is_multipart():
        yield msg.get_payload()
        return
    # Imported locally under the lowercase module name (the same naming
    # scheme as the email.utils import this file already uses); the
    # capitalized email.Iterators alias does not exist in Python 3.
    from email.iterators import typed_subpart_iterator
    for part in typed_subpart_iterator(msg):
        # The original referenced an undefined name 'textpart' here,
        # raising NameError for every multipart message.
        if part.is_multipart():
            yield part.get_payload(0)
        else:
            yield part.get_payload()
# A format-patch style mail: a '---' line followed later by a
# 'diff --git' line.
_format_patch_regex = re.compile('.*^---$.*^diff --git',
                                 re.MULTILINE | re.DOTALL)
# A patch that follows a scissors line ('-- 8< --' / '-- >8 --');
# group 1 is everything after that line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE | re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* as a string reduced to its patch, or None.

    The text payloads of *msg* are inspected in order.  The first one
    that looks like a patch (either as a whole, or after a scissors
    line) replaces the payload of *msg*, and the re-serialized message
    is returned.  Note that this modifies *msg* in place.  Returns None
    when no payload contains a patch.
    """
    for payload in _get_text_payloads(msg):
        if _format_patch_regex.match(payload):
            patch_text = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if not snipped:
                continue
            patch_text = snipped.group(1)
        msg.set_payload(patch_text)
        return msg.as_string()
    # no patch found
    return None
def try_patch(session, m, pp, base_sha1):
    """Try to apply patch *pp* on top of *base_sha1* and record the result.

    Checks out *base_sha1*, applies *pp*, and runs ``git show`` piped
    into ``git patch-id`` to obtain the patch id and commit id of the
    result (presumably ``pp.apply()`` leaves the applied patch as the
    HEAD commit -- confirm against the patch module).  A db.Commit row
    is created if none exists for that commit id, and a db.Patch
    linking it to mail *m* is added to *session*.

    Returns the new db.Patch, or None if the patch did not apply or
    produced no diff.
    """
    git('checkout', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer date and %at the author date, so unpack
        # in that order (the original had the two names swapped).
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Try to apply the patch contained in *msg* on a series of bases.

    *m* is the db.Mail row for *msg*.  The bases are tried in this
    order:

    1. the "canonical" commit derived from the pre-image blobs named in
       the patch (if all blobs are known, this is the only attempt);
    2. the newest commit carrying the patch id of the mail that *m*
       replies to;
    3. ``origin/master``, then ``origin/next``.

    Returns the db.Patch produced by the first successful try_patch()
    call, or None if the mail has no usable patch or nothing applied.
    """
    print('* %s' % m.message_id)
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        return None  # probably for another project
    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix + '%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        # only reached when no blob lookup failed
        if commits:
            # all blobs found
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            applied = try_patch(session, m, pp, cmt.sha1)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        print("no canonical commit found")
    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (session.query(db.Commit)
               .filter(db.Commit.patch_id == parent.patch_id)
               .order_by(db.Commit.cdate.desc()).first())
    # cmt is also None when no commit carries the parent's patch id;
    # the original dereferenced it unconditionally in that case.
    if cmt:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(session, m, pp, cmt.sha1)
        if applied:
            return applied
    else:
        print("no parent commit found")
    # finally fall back to the well-known integration branches
    for base in ('origin/master', 'origin/next'):
        print('trying on %s' % base)
        applied = try_patch(session, m, pp, base)
        if applied:
            return applied
    # all out of ideas!
    return None
# Subject prefix identifying a "What's cooking" status mail.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A "[Category]" line; group 1 is the category name.
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header such as "* xx/topic-name (date) N commits"; group 1 is
# the topic branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(session, msg, mail):
    """Extract per-topic notes from a "What's cooking" mail.

    Does nothing unless the Subject of *msg* matches
    ``_whats_cooking_subject``.  Otherwise the text of *msg* is split
    into sections: each topic header starts a new section, and the
    lines collected for a section are stored as ``cooking_notes`` on
    the db.Topic named after the branch (created and added to *session*
    if missing), together with ``mail.id``.  The text before the first
    separator is stored on the topic 'pu'.
    """
    if not (msg["Subject"] and _whats_cooking_subject.match(msg["Subject"])):
        return
    category = None
    branch = 'pu'  # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        # Store the collected notes on the current topic (if any) and
        # return a fresh list, seeded with the category tag if set.
        if branch:
            t = session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    # The original called an undefined _get_text_payload(); use the
    # existing generator and scan all text payloads as one body.
    body = '\n'.join(_get_text_payloads(msg))
    for line in body.splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            # The previous topic is finished; without this reset the
            # next rotation would overwrite its notes.
            branch = None
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)
    # Flush the last section; it is otherwise lost when the mail does
    # not end with a separator line.
    _rotate_notes(category, branch, notes)
def process_mail(session, mail):
    """Process one stored db.Mail row.

    Re-parses the raw message kept in ``mail.data``, records any
    "What's cooking" notes it contains, and tries to apply its patch.
    On success the resulting patch id is stored on *mail*.  The mail is
    marked as no longer stale either way; insert_mail_into_db() flags
    it stale again when a related mail arrives.
    """
    # mail.data holds the message as a string, so parsestr() is needed;
    # parse() expects a file object.
    msg = parser.parsestr(mail.data)
    parse_whats_cooking(session, msg, mail)
    # Named 'applied' so the imported 'patch' module is not shadowed.
    applied = try_patch_anywhere(session, msg, mail)
    if applied:
        # The original assigned to an undefined name 'm'.
        mail.patch_id = applied.commit.patch_id
    mail.stale = False
def _query_stale_mail(session):
    """Return a query for all stale mails that contain a patch.

    Results are ordered by ascending post date, then ascending subject.
    """
    query = session.query(db.Mail)
    # '== True' builds an SQL expression here; it must not be rewritten
    # as 'is True'.
    query = query.filter(db.Mail.has_patch == True)
    query = query.filter(db.Mail.stale == True)
    return query.order_by(db.Mail.post_date.asc(), db.Mail.subject.asc())
def walk_stale_mail():
    """Process every stale patch mail, printing a countdown to stderr.

    Each mail is handed to process_mail() and the session is committed
    afterwards, so that results are persisted and survive a failure on
    a later mail.  (The original never committed; insert_mail_into_db()
    commits explicitly, so the session is assumed not to autocommit.)
    """
    session = db.Session()
    # Load the work list once; its length replaces a separate COUNT
    # query, and committing cannot disturb the iteration.
    stale_mails = _query_stale_mail(session).all()
    count = len(stale_mails)
    for mail in stale_mails:
        process_mail(session, mail)
        session.commit()
        count = count - 1
        sys.stderr.write("%6d\r" % count)
    sys.stderr.write("\n")
def get_mail_by_id(session, msg_id):
    """Look up the mail whose message id equals *msg_id*.

    Only the ``message_id`` column is selected, so the result is a
    one-column row rather than a full db.Mail object; None is returned
    when no such mail is stored.
    """
    # Note: first() rather than one(), because the mail may not exist.
    # The DB guarantees uniqueness anyway.
    matching = session.query(db.Mail.message_id)
    matching = matching.filter(db.Mail.message_id == msg_id)
    return matching.first()
# Archived-At value of a gmane permalink; group 1 is the gmane article
# number.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def insert_mail_into_db(msg):
    """Store the parsed message *msg* as a db.Mail row and commit.

    The message id is stored without angle brackets; when the message
    has no usable Message-Id, an id is synthesized from the gmane
    article number or, failing that, from a random number.  References
    are stored as db.Reference rows, and earlier unapplied patch mails
    that reference this mail's parent (or this mail itself when it has
    no known parent) are flagged stale so that they are retried.

    Returns ``[]`` without storing anything when a mail with the same
    message id already exists, and None otherwise.
    """
    session = db.Session()
    raw_msgid = msg.get('Message-Id', None)
    # Parse the header value itself (the original passed the whole
    # message object) and keep the bracket-less form, which is what
    # in_reply_to and the reference ids are compared against.
    msgid = _parse_msg_id(raw_msgid) if raw_msgid else None
    if msgid and get_mail_by_id(session, msgid):
        return []  # already exists
    mail = db.Mail()
    if msg['Archived-At']:
        gmane = _gmane_id_regex.match(msg['Archived-At'])
        if gmane:
            mail.gmane_id = int(gmane.group(1))
    if not msgid:
        if mail.gmane_id:
            msgid = 'gmane-%d@mailnotes.trast.ch' % mail.gmane_id
        else:
            msgid = 'fallback-%X@mailnotes.trast.ch' % random.randrange(2**32)
    mail.message_id = msgid
    if msg['From']:
        mail.author = msg['From']
    tm = None
    if msg['Date']:
        # parsedate_tz/mktime_tz honour the timezone of the Date
        # header; parsedate + time.mktime treated it as local time.
        parsed_date = email.utils.parsedate_tz(msg['Date'])
        if parsed_date:
            tm = email.utils.mktime_tz(parsed_date)
    if tm is None:
        # missing or unparsable Date: fall back to the current time
        tm = time.time()
    mail.post_date = tm
    if msg['Subject']:
        mail.subject = msg['Subject']
    # The original assigned to 'm', which is the regex match or undefined.
    mail.in_reply_to = _detect_reply_id(msg)
    # Serialize before _guess_patch_contents(), which rewrites the
    # payload of msg in place.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    session.add(mail)
    # Flush so that mail.id is assigned before it is used as the
    # Reference key below (presumably a database-generated primary
    # key -- confirm against db.Mail).
    session.flush()
    references = msg.get_all('References')
    if references:
        for ref in _msg_id_regex.finditer(' '.join(references)):
            session.add(db.Reference(mail.id, ref.group(1)))
    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(session, mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    for child in (session.query(db.Mail)
                  .select_from(join(db.Mail, db.Reference,
                                    db.Mail.id == db.Reference.mail_id))
                  .filter(db.Reference.reference_id == starter.message_id)
                  .filter(db.Mail.has_patch == True)
                  .filter(db.Mail.patch_id == None)
                  .filter(db.Mail.stale == False)):
        child.stale = True
    session.commit()
def parse_mbox(fname):
    """Insert every message of the mbox file *fname* into the database.

    All messages are parsed up front with the shared parser, then
    inserted one at a time while a countdown of the remaining messages
    is written to stderr.
    """
    session = db.Session()
    messages = list(mailbox.mbox(fname, parser.parse))
    remaining = len(messages)
    for message in messages:
        insert_mail_into_db(message)
        remaining -= 1
        sys.stderr.write("%6d\r" % remaining)
    sys.stderr.write("\n")
if __name__ == '__main__':
    # Script entry point: import every mbox named on the command line,
    # then retry all stale patch mails.
    # NOTE(review): the source's indentation was lost; walk_stale_mail()
    # is assumed to run once after the loop, not per mbox -- confirm.
    for mbox_path in sys.argv[1:]:
        parse_mbox(mbox_path)
    walk_stale_mail()