Drop foreign key from 'References:' relation
[trackgit.git] / mail.py
blob6efba2b09a387644e580ceabe76ac7f387020be2
1 import sys
2 import re
3 import time
4 import mailbox
5 import email.Iterators
6 import email.Parser
7 import email.utils
8 import sqlalchemy
10 import db
11 import patch
12 from git import git
# Matches one angle-bracketed token and captures the text between the brackets.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Return the first ``<...>`` token found in *str*, without the brackets.

    Raises AttributeError when *str* contains no such token and TypeError
    when *str* is not a string (e.g. None for a missing header).
    """
    # The parameter keeps its original name (which shadows the builtin) so
    # the signature stays unchanged.
    first_match = _msg_id_regex.search(str)
    return first_match.group(1)
# Module-wide parser instance; parse_mbox() hands its bound ``parse`` method
# to mailbox.mbox as the message factory.
parser = email.Parser.Parser()
def _detect_reply_id(msg):
    """Return the message id that *msg* is a reply to, or None.

    The first ``<id>`` in the 'In-Reply-To' header is preferred.  If that
    header is missing, empty, or contains no ``<id>`` at all, the last id
    listed across all 'References' headers is used instead.  None is
    returned when neither header yields an id.
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        found = _msg_id_regex.search(in_reply_to)
        if found:
            return found.group(1)
        # Header present but without an <id> (free-form text): fall through
        # to 'References' instead of failing on a None match.
    reference_headers = msg.get_all('References')
    if reference_headers:
        # findall() yields the captured group, i.e. the ids without brackets.
        ref_ids = _msg_id_regex.findall(' '.join(reference_headers))
        if ref_ids:
            return ref_ids[-1]
    return None
def _get_text_payload(msg):
    """Return the payload of *msg* that should be searched for a patch.

    For a non-multipart message this is simply its payload.  For a multipart
    message one of its ``text/*`` subparts is selected and that part's
    payload is returned.  A multipart message without any ``text/*`` subpart
    yields the empty string.
    """
    # Function-scope import of the lowercase module name; the CamelCase
    # ``email.Iterators`` alias is only a Python 2 compatibility name.
    from email.iterators import typed_subpart_iterator

    if not msg.is_multipart():
        return msg.get_payload()
    text_parts = list(typed_subpart_iterator(msg))
    if not text_parts:
        # e.g. a mail consisting only of binary attachments; max() on an
        # empty sequence would raise ValueError.
        return ''
    # NOTE(review): len() of a Message is its number of header fields, so
    # this picks the text part with the most headers, not the longest body.
    # Kept as-is to preserve which part is chosen -- confirm the intent.
    textpart = max(text_parts, key=len)
    if textpart.is_multipart():
        return textpart.get_payload(0)
    return textpart.get_payload()
# A line consisting of exactly '---' that is later followed by a line
# starting with 'diff --git'.
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# A scissors line ('-- 8< --' / '-- >8 --' and variants); group 1 is the text
# after it, which must contain a line starting with 'diff --git'.
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* rendered as a string with only the patch as body, or None.

    The text payload of *msg* is checked first against the '---' / 'diff --git'
    layout, in which case the whole text is kept, and then for a scissors
    line, in which case only the text after that line is kept.  None is
    returned when neither layout is found.

    *msg* itself is left unmodified: the payload is replaced on a copy, so
    calling this function repeatedly on the same message gives the same
    answer each time.
    """
    import copy

    text = _get_text_payload(msg)
    if not text:
        return None
    if _format_patch_regex.match(text):
        body = text
    else:
        snipped = _snip_patch_regex.match(text)
        if not snipped:
            # no patch found
            return None
        body = snipped.group(1)
    # Overwriting the payload of the caller's message would make a second
    # call see only the snipped text, where the scissors line is gone.
    rendered = copy.deepcopy(msg)
    rendered.set_payload(body)
    return rendered.as_string()
def later_unapplied_patches(session, msg):
    """Query for mails flagged ``has_patch`` whose references include *msg*.

    NOTE(review): as written this cannot run.  The bare name ``Mail`` is not
    bound by any import visible in this module (the rest of the code uses
    ``db.Mail``), and ``sqlalchemy.in_`` does not look like a module-level
    SQLAlchemy function -- ``in_`` is normally a method of a column
    expression.  The correct filter depends on how ``db.Mail`` relates to
    ``db.Reference``, which is not visible here; confirm against the schema
    before fixing.
    NOTE(review): despite the name, nothing here restricts the result to
    mails whose patch has not been applied -- confirm the intent.
    """
    return (session.query(Mail)
            .filter(Mail.has_patch==True)
            .filter(sqlalchemy.in_(msg.message_id, Mail.references))
            .all())
def try_patch(session, m, pp, commit):
    """Try to apply patch *pp* on top of *commit* and record the result.

    Checks out ``commit.sha1`` and applies *pp*.  On success the output of
    ``git show`` is piped through ``git patch-id`` to obtain the patch id and
    commit id of the result; a db.Commit row for that commit is created if
    none exists yet, and a db.Patch linking it to mail *m* is added to
    *session* (nothing is committed here).

    Returns the new db.Patch, or None if the patch did not apply or the
    resulting commit has no diff.
    """
    git('checkout', commit.sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed
    show_pipe = git('show', ret_pipe=True)
    output = git('patch-id', input_pipe=show_pipe)[0]
    if not output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer timestamp and %at the author timestamp, so the
        # first number is cdate.  Unpacking them as "adate, cdate" stored the
        # two dates swapped.
        # NOTE(review): assumes db.Commit takes (sha1, cdate, adate, patch_id,
        # upstream) positionally, as the argument names suggest -- confirm.
        cdate, adate = [int(s) for s in output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Try to apply the patch contained in *msg* on a series of candidates.

    *m* is the db.Mail row belonging to *msg*.  The candidates, in order:

    1. the "canonical" commit derived from the pre-image blobs named in the
       patch -- if every blob is known, this is the only attempt made;
    2. the newest commit carrying the patch id of the mail that *m* replies
       to, if that mail had a patch that was applied;
    3. origin/master;
    4. origin/next.

    Returns the db.Patch produced by try_patch() for the first candidate the
    patch applies to, or None if *msg* has no patch or nothing worked.
    """
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)
    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix+'%'))
               .filter(db.Commit.upstream == True)
               .order_by(db.Commit.cdate.desc()).first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        if commits:
            # all blobs found
            # NOTE(review): min() relies on whatever ordering db.Commit
            # instances define; which commit counts as "smallest" cannot be
            # seen from here -- confirm.
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            applied = try_patch(session, m, pp, cmt)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        print("no canonical commit found")
    # if we have a parent, try on the parent
    parent_commit = None
    if m.in_reply_to:
        parent = (session.query(db.Mail)
                  .filter(db.Mail.message_id == m.in_reply_to).first())
        if parent and parent.has_patch and parent.patch_id:
            # May legitimately come back empty, so it must be checked before
            # its sha1 is used.
            parent_commit = (session.query(db.Commit)
                             .filter(db.Commit.patch_id == parent.patch_id)
                             .order_by(db.Commit.cdate.desc()).first())
    if parent_commit is not None:
        print('trying to apply on parent %s' % parent_commit.sha1)
        applied = try_patch(session, m, pp, parent_commit)
        if applied:
            return applied
    else:
        print("no parent commit found")
    # try on origin/master, then the same for origin/next
    for ref in ('origin/master', 'origin/next'):
        print('trying on %s' % ref)
        applied = _try_patch_on_ref(session, m, pp, ref)
        if applied:
            return applied
    # all out of ideas!
    return None


def _try_patch_on_ref(session, m, pp, ref):
    """Try patch *pp* on the commit that git ref *ref* currently points at.

    The commit must already be present in the database; ``.one()`` raises
    otherwise.  Returns whatever try_patch() returns.
    """
    sha1 = git('rev-parse', ref)[0].strip()
    cmt = session.query(db.Commit).filter(db.Commit.sha1 == sha1).one()
    return try_patch(session, m, pp, cmt)
def parse_mail(session, msg):
    """Store *msg* as a db.Mail row and try to apply the patch it contains.

    Mails whose Message-Id is already in the database are skipped.  For a new
    mail the row is added to *session*, try_patch_anywhere() is run on it, and
    the session is committed.

    Returns a list of ``(mail_row, referenced_message_id)`` tuples, one per id
    found in the 'References' headers; the list is empty for a mail that
    already exists or has no references.
    """
    message_id = _parse_msg_id(msg['Message-Id'])
    already_known = (session.query(db.Mail.message_id)
                     .filter(db.Mail.message_id == message_id)
                     .first())
    if already_known:
        return []  # already exists
    m = db.Mail()
    m.message_id = message_id
    m.author = msg['From']
    m.in_reply_to = _detect_reply_id(msg)
    # parsedate_tz()/mktime_tz() honour the UTC offset given in the Date
    # header.  parsedate() + time.mktime() dropped it and read the sender's
    # wall-clock time as if it were in this machine's local timezone.
    m.post_date = email.utils.mktime_tz(email.utils.parsedate_tz(msg['Date']))
    # Store the raw mail first, before any patch detection runs on msg.
    m.payload = msg.as_string()
    m.has_patch = bool(_guess_patch_contents(msg))
    session.add(m)
    reference_headers = msg.get_all('References') or []
    references = [(m, ref_id)
                  for ref_id in _msg_id_regex.findall(' '.join(reference_headers))]
    # Named "applied" so the imported ``patch`` module is not shadowed.
    applied = try_patch_anywhere(session, msg, m)
    if applied:
        m.patch_id = applied.commit.patch_id
    session.commit()
    return references
def parse_mbox(fname):
    """Import every mail of the mbox file *fname* into the database.

    Each message is handed to parse_mail().  The reference pairs it returns
    are collected and stored as db.Reference rows only after all mails have
    been processed and committed.
    """
    session = db.Session()
    mbox = mailbox.mbox(fname, parser.parse)
    references = []
    try:
        # Iterate the mailbox directly rather than materialising it with
        # list(): mailing-list archives can be large and each message is only
        # needed while it is being processed.
        for msg in mbox:
            references.extend(parse_mail(session, msg))
    finally:
        # Release the underlying file even if a mail fails to import.
        mbox.close()
    session.commit()
    for m, r in references:
        session.add(db.Reference(m.id, r))
    session.commit()
if __name__ == '__main__':
    # Import every mbox file named on the command line, in the order given.
    mbox_paths = sys.argv[1:]
    for mbox_path in mbox_paths:
        parse_mbox(mbox_path)