Change Message data format somewhat
[trackgit.git] / mail.py
blob98fbcd134ea59e1380c8372109c0ff809955a8b4
import copy
import email.Iterators
import email.Parser
import email.iterators
import email.parser
import email.utils
import mailbox
import re
import sys
import time

import sqlalchemy

import db
import patch
from git import git
# Matches one angle-bracketed message id; group 1 is the id without brackets.
_msg_id_regex = re.compile(r'<([^<>]+)>')


def _parse_msg_id(str):
    """Return the first ``<...>`` message id found in *str*, without brackets.

    Raises AttributeError when *str* contains no bracketed id, because the
    failed search yields None.
    """
    found = _msg_id_regex.search(str)
    return found.group(1)
# Module-wide message parser; parse_mbox passes its ``parse`` method to
# mailbox.mbox as the message factory.  Uses the lowercase ``email.parser``
# module name: the capitalised ``email.Parser`` alias is deprecated and no
# longer exists in Python 3.
parser = email.parser.Parser()
def _detect_reply_id(msg):
    """Return the message id that *msg* replies to, or None.

    The In-Reply-To header is preferred.  When it is absent, or present but
    does not contain a ``<...>`` id, the last id listed in the References
    header(s) is used instead.  None is returned when neither header yields
    an id (previously such malformed headers raised AttributeError or
    IndexError).
    """
    in_reply_to = msg['In-Reply-To']
    if in_reply_to:
        found = _msg_id_regex.search(in_reply_to)
        if found:
            return found.group(1)
    # get_all returns None when the header is missing entirely.
    ref_headers = msg.get_all('References') or []
    ref_ids = _msg_id_regex.findall(' '.join(ref_headers))
    if ref_ids:
        # The last reference is the closest ancestor in the thread.
        return ref_ids[-1]
    return None
def _get_text_payload(msg):
    """Return the text payload of *msg*.

    For a non-multipart message this is simply its payload.  For a multipart
    message one ``text/*`` subpart is selected and its payload is returned;
    an empty string is returned when the message has no ``text/*`` subpart at
    all (previously this raised ValueError from ``max()`` on an empty
    sequence).
    """
    if not msg.is_multipart():
        return msg.get_payload()
    text_parts = list(email.iterators.typed_subpart_iterator(msg))
    if not text_parts:
        return ''
    # NOTE(review): len() of a Message is its number of header fields, so
    # this picks the text part with the most headers (first one on ties),
    # not the one with the longest body -- confirm that this is intended.
    textpart = max(text_parts, key=len)
    if textpart.is_multipart():
        return textpart.get_payload(0)
    return textpart.get_payload()
# A format-patch style mail: a line consisting of "---" followed, somewhere
# later, by a line starting with "diff --git".
_format_patch_regex = re.compile('.*^---$.*^diff --git', re.MULTILINE|re.DOTALL)
# An inline patch after a scissors line ("-- >8 --" / "-- 8< --"); group 1 is
# everything following the scissors line, which must contain "diff --git".
_snip_patch_regex = re.compile('.*^-+ ?(?:8<|>8) ?-+\n(.*^diff --git.*)',
                               re.MULTILINE|re.DOTALL)


def _guess_patch_contents(msg):
    """Return *msg* serialized as a patch mail, or None if it holds no patch.

    The text payload is checked first for format-patch layout and then for a
    scissors line followed by a diff.  On a match the result is the message's
    headers combined with the detected patch text as its payload.

    The caller's *msg* is left untouched: the payload is replaced on a copy.
    Replacing it in place made the outcome of a second call on the same
    message depend on the first one (the scissors line was already cut away).
    """
    text = _get_text_payload(msg)
    if _format_patch_regex.match(text):
        patch_text = text
    else:
        snipped = _snip_patch_regex.match(text)
        if not snipped:
            # no patch found
            return None
        patch_text = snipped.group(1)
    patch_msg = copy.deepcopy(msg)
    patch_msg.set_payload(patch_text)
    return patch_msg.as_string()
def later_unapplied_patches(session, msg):
    """Query the mails that carry a patch and reference *msg*.

    Returns the list produced by ``.all()`` on the query.

    The mail class is reached through the ``db`` module, as everywhere else
    in this file; the bare name ``Mail`` was never imported and raised
    NameError on the first call.
    """
    # NOTE(review): ``sqlalchemy.in_`` is kept from the original; verify that
    # this callable exists in the SQLAlchemy version in use and that it
    # expresses "msg.message_id is among the mail's references".
    return (session.query(db.Mail)
            .filter(db.Mail.has_patch == True)  # SQL expression, not a Python test
            .filter(sqlalchemy.in_(msg.message_id, db.Mail.references))
            .all())
def try_patch(session, m, pp, commit):
    """Try to apply patch *pp* on top of *commit* and record the outcome.

    Checks out ``commit.sha1`` and calls ``pp.apply()``.  If that raises
    ``patch.PatchError``, or if ``git patch-id`` produces no output for the
    result, None is returned.  Otherwise a ``db.Commit`` row is looked up by
    the resulting commit id (and created if missing), a ``db.Patch`` linking
    it to mail *m* is added to *session*, and that patch object is returned.

    Nothing is committed to the session here.
    """
    git('checkout', commit.sha1)
    try:
        pp.apply()
    except patch.PatchError:
        return None  # failed to apply
    # Feed `git show` into `git patch-id`; the first element of the helper's
    # result is used as the command's text output.
    pipe = git('show', ret_pipe=True)
    id_output = git('patch-id', input_pipe=pipe)[0]
    if not id_output:
        # this means the patch had no diff; e.g., a mode change
        return None
    patch_id, commit_id = id_output.split()
    c = session.query(db.Commit).filter(db.Commit.sha1 == commit_id).first()
    if not c:
        date_output = git('log', '-1', '--pretty=format:%ct %at')[0]
        # %ct is the committer timestamp and %at the author timestamp, so the
        # committer date comes first.  The original unpacked them as
        # (adate, cdate), which stored the two dates swapped.
        cdate, adate = [int(s) for s in date_output.split()]
        c = db.Commit(commit_id, cdate, adate, patch_id, False)
        session.add(c)
    p = db.Patch(c, m.id, pp.notes)
    session.add(p)
    return p
def try_patch_anywhere(session, msg, m):
    """Try to apply the patch in *msg* on a series of candidate commits.

    Candidates are tried in this order, stopping at the first success:

    1. the commit derived from the pre-image blobs listed by the patch
       (if every blob is known; a failure here is final),
    2. the newest commit recorded for the patch of the mail that *m*
       replies to,
    3. ``origin/master``,
    4. ``origin/next``.

    Returns whatever ``try_patch`` returned for the successful attempt, or
    None when *msg* contains no patch or nothing applied.
    """
    pdata = _guess_patch_contents(msg)
    if not pdata:
        return None
    pp = patch.Patch(pdata)

    # first try on the commit given by the blobs
    commits = []
    for prefix in pp.blobs_pre:
        ret = (session.query(db.Blob, db.Commit)
               .filter(db.Blob.newest_commit_sha1 == db.Commit.sha1)
               .filter(db.Blob.sha1.like(prefix + '%'))
               .filter(db.Commit.upstream == True)  # SQL expression
               .order_by(db.Commit.cdate.desc())
               .first())
        if not ret:
            print('blob %s not found?!' % prefix)
            break
        commits.append(ret[1])
    else:
        # Only reached when no blob lookup failed.
        if commits:
            # all blobs found
            # NOTE(review): min() relies on db.Commit defining an ordering;
            # presumably the oldest commit is wanted -- confirm.
            cmt = min(commits)
            print('trying canonical commit %s' % cmt.sha1)
            applied = try_patch(session, m, pp, cmt)
            if applied:
                return applied
            # this is just hopeless: it doesn't apply to the commit it should!
            return None
        print("no canonical commit found")

    # if we have a parent, try on the parent
    parent = (session.query(db.Mail)
              .filter(db.Mail.message_id == m.in_reply_to)
              .first())
    parent_commit = None
    if parent and parent.has_patch and parent.patch_id:
        parent_commit = (session.query(db.Commit)
                         .filter(db.Commit.patch_id == parent.patch_id)
                         .order_by(db.Commit.cdate.desc())
                         .first())
    # .first() yields None when no commit carries the parent's patch id;
    # the original dereferenced it unconditionally and crashed in that case.
    if parent_commit is not None:
        print('trying to apply on parent %s' % parent_commit.sha1)
        applied = try_patch(session, m, pp, parent_commit)
        if applied:
            return applied
    else:
        print("no parent commit found")

    # finally fall back to the integration branches
    for ref in ('origin/master', 'origin/next'):
        print('trying on %s' % ref)
        sha1 = git('rev-parse', ref)[0].strip()
        cmt = session.query(db.Commit).filter(db.Commit.sha1 == sha1).one()
        applied = try_patch(session, m, pp, cmt)
        if applied:
            return applied

    # all out of ideas!
    return None
# Subject prefix identifying a "What's cooking" status mail.
_whats_cooking_subject = re.compile(r"^What's cooking in git\.git")
# A category line such as "[New Topics]"; group 1 is the category name.
_whats_cooking_category = re.compile(r"^\[(.*)\]$")
# A topic header such as "* xx/topic-name (date) 3 commits"; group 1 is the
# topic branch name.
_whats_cooking_header = re.compile(r"\* (../[a-zA-Z0-9-]+) \([^)]*\) \d+ commits?")
# A line of five or more dashes, or the "-- " signature marker.
_whats_cooking_separator = re.compile(r"^(-{5,}|-- )$")


def parse_whats_cooking(session, msg, mail):
    """Store per-topic notes from a "What's cooking" mail in ``db.Topic`` rows.

    Does nothing unless the Subject of *msg* matches the "What's cooking"
    prefix.  Otherwise the text payload is scanned line by line: text before
    the first separator is attributed to the topic 'pu', every topic header
    starts a new topic, and the accumulated lines are written to the topic's
    ``cooking_notes`` (with ``mail_id`` set to ``mail.id``) whenever a
    separator, category or header line closes the current run.  Topics that
    do not exist yet are created and added to *session*.
    """
    subject = msg["Subject"]
    if not (subject and _whats_cooking_subject.match(subject)):
        return
    category = None
    branch = 'pu' # initial part goes on 'pu'
    notes = []

    def _rotate_notes(category, branch, notes):
        # Save the collected notes on the current topic (if any) and start a
        # fresh list, seeded with the current category tag.
        if branch:
            t = session.query(db.Topic).filter(db.Topic.name == branch).first()
            if not t:
                t = db.Topic()
                t.name = branch
                session.add(t)
            t.mail_id = mail.id
            t.cooking_notes = '\n'.join(notes)
        notes = []
        if category:
            notes.append("[%s]" % category)
        return notes

    for line in _get_text_payload(msg).splitlines():
        if _whats_cooking_separator.match(line):
            category = None
            notes = _rotate_notes(category, branch, notes)
            branch = None
            continue
        m = _whats_cooking_category.match(line)
        if m:
            category = m.group(1)
            notes = _rotate_notes(category, branch, notes)
            continue
        m = _whats_cooking_header.match(line)
        if m:
            notes = _rotate_notes(category, branch, notes)
            notes.append(line)
            branch = m.group(1)
            continue
        notes.append(line)

    # Flush the last topic: without this its notes were dropped whenever the
    # mail did not end with a separator line.  When the mail does end with
    # one, branch is None here and this call stores nothing.
    _rotate_notes(category, branch, notes)
def parse_mail(session, msg):
    """Import one parsed mail into the database.

    Creates a ``db.Mail`` row for *msg* unless a mail with the same
    Message-Id already exists, tries to apply a contained patch, processes
    "What's cooking" content and commits *session*.

    Returns a list of ``(mail, referenced_message_id)`` tuples, one per id in
    the References header(s), for the caller to turn into reference rows; the
    list is empty for an already known mail.

    A missing or unparseable Message-Id or Date header raises, as before.
    """
    message_id = _parse_msg_id(msg['Message-Id'])
    already_known = (session.query(db.Mail.message_id)
                     .filter(db.Mail.message_id == message_id)
                     .first())
    if already_known:
        return [] # already exists

    m = db.Mail()
    m.message_id = message_id
    m.author = msg['From']
    m.in_reply_to = _detect_reply_id(msg)
    # parsedate() drops the UTC offset and time.mktime() then interprets the
    # fields in this machine's local zone, so the stored time depended on
    # where the import ran.  mktime_tz() honours the offset in the header.
    m.post_date = email.utils.mktime_tz(email.utils.parsedate_tz(msg['Date']))
    # Store the raw message first, before any later step touches msg.
    m.data = msg.as_string()
    m.has_patch = bool(_guess_patch_contents(msg))
    session.add(m)

    references = []
    if msg['References']:
        joined = ' '.join(msg.get_all('References'))
        references = [(m, found.group(1))
                      for found in _msg_id_regex.finditer(joined)]

    # try patching (local name chosen so the ``patch`` module is not shadowed)
    applied = try_patch_anywhere(session, msg, m)
    if applied:
        m.patch_id = applied.commit.patch_id
    # try reading a what's cooking message
    parse_whats_cooking(session, msg, m)
    session.commit()
    return references
def parse_mbox(fname):
    """Import every message of the mbox file *fname* into the database.

    All messages are read up front, then handed to parse_mail one by one.
    The reference pairs it returns are collected and stored as db.Reference
    rows only after every mail has been processed and committed.
    """
    session = db.Session()
    messages = list(mailbox.mbox(fname, parser.parse))

    pending_refs = []
    for message in messages:
        pending_refs += parse_mail(session, message)
    session.commit()

    for mail, ref_id in pending_refs:
        session.add(db.Reference(mail.id, ref_id))
    session.commit()
if __name__ == '__main__':
    # Import each mbox file named on the command line, in the order given.
    for mbox_path in sys.argv[1:]:
        parse_mbox(mbox_path)