Fix mail importer into shape again
[trackgit.git] / mail.py
blob8043f93b4ecd7e8e3264d7fa0803eca101f5f32d
1 import sys
2 import re
3 import time
4 import random
5 import mailbox
6 import email.Iterators
7 import email.Parser
8 import email.utils
9 import sqlalchemy
10 from sqlalchemy.orm import join
11 import cStringIO as StringIO
13 import db
14 import patch
15 from git import git
17 _msg_id_regex = re.compile(r'<([^<>]+)>')
18 def _parse_msg_id(str):
19 m = _msg_id_regex.search(str)
20 if m:
21 return m.group(1)
# Module-wide message parser; used by process_mail() to parse stored mail
# text and handed to mailbox.mbox as the message factory in parse_mbox().
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the id of the message that *msg* replies to, or None.

    The In-Reply-To header is preferred.  If it is absent or contains no
    bracketed id, the last id listed in the References header(s) is used.
    Returns None when neither header yields an id (previously an id-less
    References header raised IndexError).
    """
    if msg['In-Reply-To']:
        reply_id = _parse_msg_id(msg['In-Reply-To'])
        if reply_id:
            return reply_id
    if msg['References']:
        # There may be several References headers; treat them as one list.
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        if ref_ids:
            return ref_ids[-1]
    return None
33 def _get_text_payloads(msg):
34 if not msg.is_multipart():
35 yield msg.get_payload()
36 return
37 for part in email.Iterators.typed_subpart_iterator(msg):
38 if part.is_multipart():
39 yield part.get_payload(0)
40 else:
41 yield part.get_payload()
# A payload that has a line consisting solely of '---' followed, somewhere
# later, by a line starting with 'diff --git'.
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A payload with a scissors line (dashes around '8<' or '>8'); the capture
# group is everything after that line, which must contain a 'diff --git'.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* rendered as a string with a patch as its body, or None.

    The first payload that matches either patch pattern wins: the whole
    payload for the '---' form, or only the text after the scissors line
    for the snipped form.

    NOTE: this replaces the payload of *msg* in place (set_payload) before
    rendering it, so the caller's message object is modified on success.
    """
    for payload in _get_text_payloads(msg):
        if _format_patch_regex.match(payload):
            body = payload
        else:
            snipped = _snip_patch_regex.match(payload)
            if not snipped:
                continue
            body = snipped.group(1)
        msg.set_payload(body)
        return msg.as_string()
    # no patch found
    return None
def try_patch(session, m, pp, base_sha1):
    """Try to apply patch *pp* on top of *base_sha1* and record the result.

    session   -- database session used for lookups and for adding rows
    m         -- the db.Mail row the patch came from (only ``m.id`` is read)
    pp        -- patch object offering ``apply()`` and ``notes``
    base_sha1 -- revision to check out before applying

    Returns the newly added db.Patch, or None when the patch does not apply
    or when ``git patch-id`` produces no output.

    NOTE(review): the code reads ``git show`` / ``git log -1`` right after
    ``pp.apply()``, so apply() presumably leaves a new commit at HEAD --
    confirm against patch.py.
    """
    git('checkout', base_sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    # git patch-id prints "<patch-id> <commit-id>"
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer timestamp and %at the author timestamp, in
        # that order.  They used to be unpacked as "adate, cdate", which
        # stored the two dates swapped.
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Try to apply the patch contained in *msg* on a series of likely bases.

    session -- database session
    msg     -- parsed email message (its payload may be rewritten by
               _guess_patch_contents)
    m       -- the db.Mail row belonging to *msg*

    Bases are tried in this order:
      1. the upstream commit derived from the patch's pre-image blobs; if
         such a commit exists and the patch fails there, give up entirely,
      2. the commit recorded for the parent mail's patch, if any,
      3. origin/master, then origin/next.

    Returns the db.Patch produced by try_patch() on the first base that
    works, or None.
    """
    print('* %s' % (m.message_id,))
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    if pp.missing_files:
        return None  # probably for another project

    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix+'%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        if commits:
            # all blobs found: use the oldest of the per-blob commits.
            # Compare on the commit date explicitly; a bare min() over ORM
            # objects is only meaningful if db.Commit defines an ordering.
            cmt = min(commits, key=lambda commit: commit.cdate)
            print('trying canonical commit %s' % cmt.sha1)
            applied = try_patch(session, m, pp, cmt.sha1)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        else:
            print("no canonical commit found")

    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    cmt = None
    if parent and parent.has_patch and parent.patch_id:
        cmt = (session.query(db.Commit)
               .filter(db.Commit.patch_id == parent.patch_id)
               .order_by(db.Commit.cdate.desc()).first())
    # The commit lookup itself can come back empty; guard it so we do not
    # dereference None.
    if cmt:
        print('trying to apply on parent %s' % cmt.sha1)
        applied = try_patch(session, m, pp, cmt.sha1)
        if applied:
            return applied
    else:
        print("no parent commit found")

    # try on origin/master, then the same for origin/next
    for ref in ('origin/master', 'origin/next'):
        print('trying on %s' % ref)
        applied = try_patch(session, m, pp, ref)
        if applied:
            return applied

    # all out of ideas!
    return None
# Subject line of a "What's cooking" status mail.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A category line such as "[New Topics]"; group 1 is the category name.
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header such as "* ab/some-topic (...) 3 commits"; group 1 is the
# topic branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A section separator: five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(session, msg, mail):
    """Store per-topic notes from a "What's cooking" mail as db.Topic rows.

    Does nothing unless the Subject of *msg* matches the what's-cooking
    pattern.  The first text payload is read line by line; the lines
    belonging to each topic are joined and saved in ``cooking_notes`` of
    the db.Topic named after the branch (created if missing), together with
    ``mail.id``.  Text before the first separator is filed under 'pu'.

    session -- database session
    msg     -- parsed email message
    mail    -- the db.Mail row belonging to *msg*
    """
    if not (msg["Subject"] and _whats_cooking_subject.match(msg["Subject"])):
        return

    # This used to call a non-existent _get_text_payload(); take the first
    # payload produced by _get_text_payloads() instead.
    text = next(_get_text_payloads(msg), None)
    if text is None:
        return

    category = None
    branch = 'pu'  # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        """Save *notes* on *branch* (if any) and return a fresh notes list."""
        if branch:
            t = session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        # Lines collected while no branch is active are dropped here.
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    for line in text.splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)

    # Flush the final topic: without this its notes were lost whenever the
    # mail did not end with a separator line.
    _rotate_notes(category, branch, notes)
def process_mail(session, mail):
    """Re-parse a stored mail and run the what's-cooking and patch importers.

    session -- database session
    mail    -- db.Mail row; ``mail.data`` holds the raw message text

    When the patch could be applied somewhere, the resulting patch id is
    stored on ``mail.patch_id``.
    """
    msg = parser.parse(StringIO.StringIO(mail.data))
    parse_whats_cooking(session, msg, mail)
    # Named 'applied' rather than 'patch' so the imported patch module is
    # not shadowed.
    applied = try_patch_anywhere(session, msg, mail)
    if applied:
        # This used to assign to an undefined name 'm' (NameError).
        mail.patch_id = applied.commit.patch_id
    # NOTE(review): mail.stale is not reset here, so the mail stays in the
    # stale queue -- confirm whether it should be cleared after processing.
def _query_stale_mail(session):
    """Build the query for mails that carry a patch and are flagged stale.

    Results are ordered by post date, then subject, both ascending.
    """
    query = session.query(db.Mail)
    # '== True' is intentional: it builds an SQL expression.
    query = query.filter(db.Mail.has_patch == True)
    query = query.filter(db.Mail.stale == True)
    return query.order_by(db.Mail.post_date.asc(), db.Mail.subject.asc())
def walk_stale_mail():
    """Run process_mail() on every stale mail, with a countdown on stderr.

    NOTE(review): nothing here commits the session or clears the stale
    flag -- confirm whether that is handled elsewhere.
    """
    session = db.Session()
    remaining = _query_stale_mail(session).count()
    for stale_mail in _query_stale_mail(session):
        process_mail(session, stale_mail)
        remaining -= 1
        # '\r' keeps the countdown on a single terminal line.
        sys.stderr.write("%6d\r" % remaining)
    sys.stderr.write("\n")
def get_mail_by_id(session, msg_id):
    """Return the db.Mail whose message_id equals *msg_id*, or None."""
    # first() rather than one(): the mail may not exist.  The original
    # comment notes that the DB guarantees uniqueness anyway.
    matches = session.query(db.Mail).filter(db.Mail.message_id == msg_id)
    return matches.first()
# Archived-At value pointing at a gmane permalink; group 1 is the numeric id.
_gmane_id_regex = re.compile(r'<http://permalink\.gmane\.org/gmane\.comp\.version-control\.git/(\d+)>')


def _post_date_of(msg):
    """Return the Date header of *msg* as epoch seconds, or the current time.

    The header's UTC offset is honoured (parsedate_tz + mktime_tz); the
    previous parsedate + time.mktime combination interpreted the wall-clock
    fields as local time.  A missing, unparsable or out-of-range date falls
    back to time.time().
    """
    if msg['Date']:
        parsed = email.utils.parsedate_tz(msg['Date'])
        if parsed:
            try:
                return email.utils.mktime_tz(parsed)
            except (OverflowError, ValueError):
                # Out-of-range date: treat it like a missing one below.
                pass
    return time.time()


def insert_mail_into_db(msg):
    """Store the parsed message *msg* as a db.Mail row, unless already known.

    A mail whose Message-Id is already in the database is skipped.  When
    the message has no usable Message-Id, one is synthesised from the gmane
    id if available, otherwise from a random number.  After the mail is
    added, patch mails recorded as referencing the thread starter that are
    still unapplied are flagged stale again, and one db.Reference row is
    added per id in the References header(s).
    """
    session = db.Session()

    raw_msgid = msg.get('Message-Id', None)
    msgid = _parse_msg_id(raw_msgid) if raw_msgid else None
    if msgid and get_mail_by_id(session, msgid):
        return  # already exists

    mail = db.Mail()
    if msg['Archived-At']:
        m = _gmane_id_regex.match(msg['Archived-At'])
        if m:
            mail.gmane_id = int(m.group(1))
    if not msgid:
        if mail.gmane_id:
            msgid = 'gmane-%d@mailnotes.trast.ch' % mail.gmane_id
        else:
            msgid = 'fallback-%X@mailnotes.trast.ch' % random.randrange(2**32)
    mail.message_id = msgid
    if msg['From']:
        mail.author = msg['From']
    mail.post_date = _post_date_of(msg)
    if msg['Subject']:
        mail.subject = msg['Subject']
    mail.in_reply_to = _detect_reply_id(msg)
    # Serialise before _guess_patch_contents(): that call may replace the
    # payload of msg in place.
    mail.data = msg.as_string()
    mail.stale = mail.has_patch = bool(_guess_patch_contents(msg))
    session.add(mail)

    starter = mail
    if mail.in_reply_to:
        parent = get_mail_by_id(session, mail.in_reply_to)
        if parent:
            starter = parent
    # Flag all so-far unapplied patches downwards of this one as
    # 'stale' so they'll be tried again.  XXX should use an sql UPDATE
    # here!
    for child in (session.query(db.Mail)
                  .select_from(join(db.Mail, db.Reference,
                                    db.Mail.id == db.Reference.mail_id))
                  .filter(db.Reference.reference_id == starter.message_id)
                  .filter(db.Mail.has_patch == True)
                  .filter(db.Mail.patch_id == None)
                  .filter(db.Mail.stale == False)):
        child.stale = True
    # Commit before reading mail.id for the reference rows below.
    session.commit()

    if msg['References']:
        for m in _msg_id_regex.finditer(' '.join(msg.get_all('References'))):
            session.add(db.Reference(mail.id, m.group(1)))
        session.commit()
def parse_mbox(fname):
    """Insert every message of the mbox file *fname* into the database.

    A countdown of the messages still to go is written to stderr.

    Messages are parsed one at a time while iterating instead of being
    collected into a list first, and the mailbox is closed when done.  The
    previously created but unused database session is gone;
    insert_mail_into_db() opens its own.
    """
    mbox = mailbox.mbox(fname, parser.parse)
    try:
        count = len(mbox)
        for msg in mbox:
            insert_mail_into_db(msg)
            count -= 1
            # '\r' keeps the countdown on a single terminal line.
            sys.stderr.write("%6d\r" % count)
    finally:
        mbox.close()
    sys.stderr.write("\n")
if __name__ == '__main__':
    # Import each mbox file named on the command line, then retry every
    # mail that is flagged stale.
    for mbox_path in sys.argv[1:]:
        parse_mbox(mbox_path)
    walk_stale_mail()