let mbox parser handle several
[trackgit.git] / mail.py
blobf8d2c6450f64b48bdde2bf800bb960a2e1f48f4b
1 import sys
2 import re
3 import time
4 import mailbox
5 import email.Iterators
6 import email.Parser
7 import email.utils
8 import sqlalchemy
10 import db
11 import patch
12 from git import git
# Matches one angle-bracketed message id and captures the text between
# the brackets.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Return the first ``<...>``-delimited id found in ``str``.

    The surrounding angle brackets are stripped.  Raises AttributeError
    when ``str`` contains no bracketed id, because the failed search
    yields None.
    """
    first_match = _msg_id_regex.search(str)
    return first_match.group(1)
# Shared e-mail parser; parse_mbox() passes its ``parse`` method to
# mailbox.mbox as the message factory.
parser = email.Parser.Parser()


def _detect_reply_id(msg):
    """Return the id of the message that ``msg`` replies to, or None.

    The first bracketed id in the In-Reply-To header wins.  If that
    header is missing or holds no ``<...>`` id, the last id listed across
    all References headers is used instead.  None is returned when
    neither header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        match = _msg_id_regex.search(in_reply_to)
        # A header without a bracketed id (free-form text) is ignored
        # rather than crashing on the failed match.
        if match:
            return match.group(1)
    if msg['References']:
        refs = ' '.join(msg.get_all('References'))
        ref_ids = _msg_id_regex.findall(refs)
        # The header can be present yet contain no parsable id.
        if ref_ids:
            return ref_ids[-1]
    return None
def _get_text_payload(msg):
    """Return the text body of ``msg`` as stored in the message.

    A non-multipart message returns its payload unchanged.  For a
    multipart message every ``text/*`` leaf part is considered and the one
    with the longest payload is returned; an empty string is returned
    when there is no text part at all.  No transfer decoding is done.
    """
    if not msg.is_multipart():
        return msg.get_payload()
    # msg.walk() visits the message and all nested parts; keeping the
    # parts whose main type is "text" is what
    # email.Iterators.typed_subpart_iterator does by default.
    text_payloads = [part.get_payload() for part in msg.walk()
                     if part.get_content_maintype() == 'text'
                     and not part.is_multipart()]
    if not text_payloads:
        return ''
    # len() of a Message counts its headers, so rank the candidates by the
    # size of their body text instead.
    return max(text_payloads, key=len)
# A "---" line followed later by a "diff --git" line: the layout of a
# mail body produced by git format-patch.
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A scissors line ("-- 8< --" or "-- >8 --") followed by text containing a
# "diff --git" line; group 1 is everything after the scissors line.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return ``msg`` rendered as a patch mail, or None if it has no patch.

    If the text payload looks like format-patch output, the result is the
    message flattened with that text as its payload.  If the patch sits
    below a scissors line, only the part after that line is kept as the
    payload.  ``msg`` itself is left untouched: the payload is replaced on
    a copy, so calling this repeatedly on one message gives the same
    answer each time.
    """
    import copy

    p = _get_text_payload(msg)
    if _format_patch_regex.match(p):
        body = p
    else:
        m = _snip_patch_regex.match(p)
        if not m:
            # no patch found
            return None
        body = m.group(1)
    patch_msg = copy.deepcopy(msg)
    patch_msg.set_payload(body)
    return patch_msg.as_string()
def later_unapplied_patches(session, msg):
    """Return all stored mails carrying a patch that reference ``msg``.

    The query keeps mails whose ``has_patch`` flag is set and whose
    references contain ``msg.message_id``.
    """
    # NOTE(review): despite the name, nothing here filters on whether the
    # patch was applied or on posting order -- confirm the intent.
    # NOTE(review): sqlalchemy.in_ is kept as written; verify that this
    # helper exists in the SQLAlchemy version in use and that
    # db.Mail.references is the right operand.
    return (session.query(db.Mail)
            .filter(db.Mail.has_patch == True)  # SQL expression, not a Python test
            .filter(sqlalchemy.in_(msg.message_id, db.Mail.references))
            .all())
def try_patch(session, m, pp, commit):
    """Try to apply patch ``pp`` on top of ``commit`` and record the result.

    Checks out ``commit.sha1`` and applies ``pp``.  On success a db.Patch
    linking mail ``m`` to the resulting commit is added to ``session`` and
    returned; a db.Commit row is created first if that commit is not in
    the database yet.  Returns None when the patch does not apply.
    Nothing is committed to the session here.
    """
    git('checkout', commit.sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    # "git show | git patch-id" prints "<patch-id> <commit-id>".
    # NOTE(review): presumably pp.apply() leaves the new commit at HEAD and
    # git() returns a sequence whose first item is stdout -- confirm.
    pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=pipe)[0]
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        # The format prints the committer date (%ct) first and the author
        # date (%at) second, so unpack in that order.
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        cdate, adate = [int(s) for s in output.split()]
        # NOTE(review): the trailing False is presumably the "upstream"
        # flag -- confirm against db.Commit.
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Try to apply the patch contained in ``msg`` on a series of commits.

    Candidates are tried in this order, stopping at the first success:

    1. the "canonical" commit derived from the pre-image blobs named in
       the patch (if every blob is known, this is the only attempt);
    2. the newest commit matching the patch of the mail that ``m``
       replies to;
    3. origin/master, then origin/next.

    Returns whatever try_patch() returned for the successful attempt, or
    None if the mail holds no patch or the patch applied nowhere.
    """
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)

    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix + '%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        if commits:
            # all blobs found
            # NOTE(review): min() relies on how db.Commit objects compare;
            # confirm this selects the intended commit.
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            # If this fails it is just hopeless: it doesn't apply to the
            # commit it should, so no other candidate is tried.
            return try_patch(session, m, pp, cmt) or None
        print("no canonical commit found")

    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to).first())
    parent_cmt = None
    if parent and parent.has_patch and parent.patch_id:
        parent_cmt = (session.query(db.Commit)
                      .filter(db.Commit.patch_id == parent.patch_id)
                      .order_by(db.Commit.cdate.desc()).first())
    if parent_cmt is not None:
        print('trying to apply on parent %s' % parent_cmt.sha1)
        applied = try_patch(session, m, pp, parent_cmt)
        if applied:
            return applied
    else:
        # Also reached when the parent's patch id matches no stored commit.
        print("no parent commit found")

    # try on origin/master, then the same for origin/next
    for branch in ('origin/master', 'origin/next'):
        print('trying on %s' % branch)
        tip_sha1 = git('rev-parse', branch)[0].strip()
        cmt = session.query(db.Commit).filter(db.Commit.sha1 == tip_sha1).one()
        applied = try_patch(session, m, pp, cmt)
        if applied:
            return applied

    # all out of ideas!
    return None
def parse_mail(session, msg):
    """Store ``msg`` as a db.Mail and try to apply any patch it carries.

    Returns a list of ``(mail, referenced_message_id)`` tuples, one per id
    found in the References headers, for the caller to turn into
    reference rows.  Returns an empty list, and stores nothing, when a
    mail with the same Message-Id already exists.  The session is
    committed before returning.
    """
    message_id = _parse_msg_id(msg['Message-Id'])
    already_stored = (session.query(db.Mail.message_id)
                      .filter(db.Mail.message_id == message_id)
                      .first())
    if already_stored:
        return []  # already exists

    m = db.Mail()
    m.message_id = message_id
    m.author = msg['From']
    m.in_reply_to = _detect_reply_id(msg)
    # Honour the timezone given in the Date header: parsedate() + mktime()
    # would read the wall-clock time as local time of this machine.
    date_header = msg['Date']
    date_tuple = email.utils.parsedate_tz(date_header) if date_header else None
    # NOTE(review): a missing or unparsable Date now yields None instead
    # of an exception -- confirm the column accepts that.
    m.post_date = email.utils.mktime_tz(date_tuple) if date_tuple else None
    # Capture the raw mail before any patch detection looks at it.
    m.payload = msg.as_string()
    m.has_patch = bool(_guess_patch_contents(msg))
    session.add(m)

    references = []
    if msg['References']:
        all_refs = ' '.join(msg.get_all('References'))
        references = [(m, im.group(1))
                      for im in _msg_id_regex.finditer(all_refs)]

    # Named so that it does not shadow the imported ``patch`` module.
    applied_patch = try_patch_anywhere(session, msg, m)
    if applied_patch:
        m.patch_id = applied_patch.commit.patch_id
    session.commit()
    return references
def parse_mbox(fname):
    """Import every mail of the mbox file ``fname`` into the database.

    All messages are parsed up front, then stored one by one with
    parse_mail().  The reference rows are added only after every mail has
    been stored and committed.
    """
    session = db.Session()
    mbox = mailbox.mbox(fname, parser.parse)
    try:
        # Materialise all messages so the mailbox can be closed right away.
        mbox_parsed = list(mbox)
    finally:
        mbox.close()

    references = []
    for msg in mbox_parsed:
        references.extend(parse_mail(session, msg))
    session.commit()

    for m, r in references:
        session.add(db.Reference(m.id, r))
    session.commit()
if __name__ == '__main__':
    # Every command-line argument is passed to parse_mbox() in turn.
    for mbox_path in sys.argv[1:]:
        parse_mbox(mbox_path)