hg2git.py: Add support for extracting authorship from Signed-off-by lines
[fast-export/rorcz.git] / hg2git.py
blobe328a423351c77de40213a8faee99707891d9ead
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: GPLv2
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <max revision> <marks file> <heads file> <tip file>
8 """
10 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
11 from tempfile import mkstemp
12 import re
13 import sys
14 import os
# A whole line of the form "Signed-off-by: <who>" (the capitalisation of
# "off" and "by" may vary); group 1 is the text after the colon and space.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')

# A user field ending in "Name <mail>": group 1 is the name, group 2 the
# address including its angle brackets.
user_re = re.compile(r'([^<]+) (<[^>]+>)$')

# A name wrapped entirely in double quotes; group 1 is the unquoted name.
user_clean_re = re.compile(r'^["]([^"]+)["]$')

# Git branch name substituted when a changeset's branch is named 'HEAD'.
cfg_master = 'master'

# Emit a 'checkpoint' command every this many commands; 0 disables it.
cfg_checkpoint_count = 0
def usage(ret):
    """Print the module docstring to stderr and return ``ret`` unchanged.

    Returning the argument lets callers write ``sys.exit(usage(code))``.
    """
    help_text = __doc__
    sys.stderr.write(help_text)
    return ret
def setup_repo(url):
    """Open the Mercurial repository at ``url``.

    Returns a ``(ui, repository)`` tuple: a freshly created ``ui.ui()``
    object and the repository opened with it via ``hg.repository``.
    """
    interface = ui.ui()
    repository = hg.repository(interface, url)
    return interface, repository
def fixup_user(user, authors):
    """Turn a user string into the form ``Name <mail>``.

    user    -- raw user string
    authors -- optional mapping from raw user strings to replacements, or
               None; a user without an entry is kept as is

    If the (possibly mapped) user already ends in ``Name <mail>`` it is
    split into those two parts.  Otherwise the whole string becomes the
    name, and the mail part is ``<user>`` when the string contains an '@'
    and ``<devnull@localhost>`` when it does not.  A name that is entirely
    wrapped in double quotes has the quotes removed.
    """
    if authors is not None:
        # fall back to the unmapped value when the table has no entry
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        # already 'Name <mail>'
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        # looks like a bare address: use it as both name and mail
        name, mail = user, '<%s>' % user
    else:
        name, mail = user, '<devnull@localhost>'

    # strip quoting around the whole name, if any
    unquoted = user_clean_re.match(name)
    if unquoted is not None:
        name = unquoted.group(1)

    return '%s %s' % (name, mail)
def get_changeset(ui, repo, revision, authors):
    """Read one changeset and return its data prepared for export.

    ui       -- unused here; kept for a uniform call signature
    repo     -- repository object providing ``lookup`` and ``changelog.read``
    revision -- revision to read
    authors  -- author mapping handed on to fixup_user()

    Returns ``(manifest, user, (time, tz), files, desc, branch, extra)``
    where ``user`` has been passed through fixup_user() and ``tz`` is a
    string of the form ``+HHMM`` / ``-HHMM``.
    """
    def get_branch(name):
        # a branch literally named 'HEAD' is mapped to the configured name
        if name == 'HEAD':
            name = cfg_master
        return name

    node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(node)

    # The stored offset has the opposite sign of the one written out
    # (presumably seconds west of UTC -- confirm against Mercurial docs).
    # Format sign and magnitude separately: floor division and modulo on a
    # negative number round away from zero, which turned e.g. -03:30 into
    # "-0430" and -00:30 into "-0130".
    offset = -timezone
    if offset < 0:
        sign = '-'
    else:
        sign = '+'
    hours, rest = divmod(abs(offset), 3600)
    tz = '%s%02d%02d' % (sign, hours, rest // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user, authors), (time, tz), files, desc,
            branch, extra)
def gitmode(x):
    """Return the git file mode string for an executable flag.

    A true ``x`` yields '100755', anything false yields '100644'.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write ``msg`` to stdout as one line of the git-fast-import stream.

    ``print`` appends a newline after ``msg``; calling wr() without an
    argument therefore emits an empty line.
    """
    print msg
    # debugging aid: echo every emitted line to stderr as well
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Advance the command counter and emit a checkpoint when one is due.

    The counter is incremented by one.  If cfg_checkpoint_count is positive
    and the new counter value is a multiple of it, a notice goes to stderr
    and a 'checkpoint' command followed by an empty line is written out.

    Returns the incremented counter.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def get_parent_mark(parent, marks):
    """Return the reference to use for the parent revision ``parent``.

    parent -- revision number of the parent
    marks  -- marks cache: mark number (as a string) -> SHA1 for entries
              loaded from the marks file

    A revision is exported under mark ``revision+1``.  If the cache holds a
    SHA1 for that mark it is returned, otherwise the ``:<mark>`` syntax.

    Entries added during the running session are stored by export_commit()
    under ``str(revision)`` with the value ``':<revision+1>'``; such an entry
    found under key ``str(parent+1)`` therefore belongs to revision
    ``parent+1``, not to ``parent``.  Those values (recognisable by their
    leading ':') are ignored so the wrong commit is never referenced.

    NOTE(review): a cached SHA1 that a session entry has overwritten cannot
    be recovered here; in that case the plain ``:<mark>`` is returned.
    """
    mark = ':%d' % (parent + 1)
    cached = marks.get(str(parent + 1))
    if cached is None or cached.startswith(':'):
        return mark
    return cached
def mismatch(f1, f2):
    """Tell whether two file revisions differ.

    Both arguments are converted with ``node.hex`` and the results are
    compared; returns True when they are not equal.
    """
    first_hex = node.hex(f1)
    second_hex = node.hex(f2)
    return first_hex != second_hex
def outer_set(dleft, dright, l, c, r):
    """Sort the file names of two manifests into three result lists.

    dleft  -- mapping of file name -> file revision for our side
    dright -- the same for the parent side
    l      -- receives names only present in ``dleft``
    c      -- receives names present in both whose revisions mismatch()
    r      -- receives names only present in ``dright``

    The lists are extended in place and also returned as ``(l, c, r)``.
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # parent does not have the file
            l.append(name)
            continue
        if mismatch(dleft[name], theirs):
            # both have it, but the revisions differ
            c.append(name)

    # Names the parent has but we lack; differing revisions were already
    # recorded by the first pass.
    r.extend([name for name in dright.keys()
              if dleft.get(name, None) is None])

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Find the files added, changed and removed by a revision.

    repo     -- repository object; ``repo.changectx(p).manifest()`` is read
                for every existing parent
    revision -- revision being exported (not used in the comparison)
    parents  -- parent revision numbers; negative numbers mean "no parent"
    mleft    -- manifest of the revision itself

    Returns ``(added, changed, removed)`` lists of file names, accumulated
    over all existing parents by outer_set().

    A revision without any existing parent has nothing to be compared
    against, so every file of its manifest is reported as added (sorted by
    name).  Without this the content of a root revision is never exported.
    """
    existing = [p for p in parents if p >= 0]
    if not existing:
        return sorted(mleft.keys()), [], []

    l, c, r = [], [], []
    for p in existing:
        mright = repo.changectx(p).manifest()
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from its Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch.
    Blank lines at the end of the log message are skipped; then, starting
    with the last non-blank line, the uninterrupted run of Signed-off-by
    lines is followed upwards.  The topmost line of that run names the
    author, which is cleaned up the usual way via fixup_user().

    Only the run at the very end is considered because a line somewhere in
    the middle of a message may happen to start with "Signed-off-by:" too.

    If the last non-blank line is not a Signed-off-by line (or there is no
    non-blank line at all), ``committer`` is returned unchanged.
    """
    lines = logmessage.split('\n')

    # cut off trailing blank lines
    end = len(lines)
    while end > 0 and len(lines[end - 1].strip()) == 0:
        end -= 1

    # climb the trailing block of sob lines, remembering the topmost match
    topmost = None
    for line in reversed(lines[:end]):
        found = sob_re.match(line)
        if found is None:
            break
        topmost = found

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors):
    """Write one revision as a 'commit' command and return the new count.

    ui, repo -- passed to get_changeset(); repo is also used for parent and
                manifest lookups
    revision -- revision number to export; it is written with mark
                revision+1
    marks    -- marks cache; read through get_parent_mark() and updated
                with an entry for this revision
    heads    -- cached branch heads: branch name -> reference; a non-empty
                value is consumed as the 'from' of the first commit on that
                branch and then blanked
    last     -- branch name -> revision most recently exported on it
    max      -- not used in this function
    count    -- running command counter handed to checkpoint()
    authors  -- author mapping passed to get_changeset() and get_author()

    Progress messages go to stderr, the stream itself is written with wr().
    """
    (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)
    parents=repo.changelog.parentrevs(revision)

    # commit header; the author may differ from the committer if the log
    # message ends in Signed-off-by lines
    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1: wr() emits a newline after desc which is counted as message data
    wr('data %d' % (len(desc)+1)) # wtf?
    wr(desc)
    wr()

    src=heads.get(branch,'')
    link=''
    if src!='':
        # if we have a cached head, this is an incremental import: initialize it
        # and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch]=''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch,src))
        link=src # avoid making a merge commit for incremental import
    # link is always '' at this point, so only the other two tests matter
    elif link=='' and not heads.has_key(branch) and revision>0:
        # newly created branch and not the first one: connect to parent
        tmp=get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch,tmp))
        link=tmp # avoid making a merge commit for branch fork

    if parents:
        # most recent revision exported on this branch; defaults to the
        # current revision if there is none yet
        l=last.get(branch,revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p==l or p==revision or p<0:
                continue
            tmp=get_parent_mark(p,marks)
            # if we fork off a branch, don't merge with our parent via 'merge'
            # as we have 'from' already above
            if tmp==link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch,tmp,p))
            wr('merge %s' % tmp)

    last[branch]=revision
    heads[branch]=''
    # we need this later to write out tags
    # NOTE(review): the key is the revision number, while the value carries
    # the mark number revision+1
    marks[str(revision)]=':%d'%(revision+1)

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed=get_filechanges(repo,revision,parents,man)

    sys.stderr.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' %
                     (revision,len(added),len(changed),len(removed)))

    # added and changed files are both written with their full content inline
    for a in added+changed:
        fctx=ctx.filectx(a)
        d=fctx.data()
        wr('M %s inline %s' % (gitmode(man.execf(a)),a))
        wr('data %d' % len(d)) # had some trouble with size()
        wr(d)

    for r in removed:
        wr('D %s' % r)

    wr()
    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'tag' command for every tag in the exported revision range.

    ui, repo    -- passed to get_changeset(); repo also supplies the tag
                   list and the revision number of each tagged node
    marks_cache -- revision number (as a string) -> reference to tag
    start, end  -- only tags on revisions with start <= rev < end are
                   written
    count       -- running command counter, advanced once per written tag
    authors     -- author mapping passed to get_changeset()

    The 'tip' tag is skipped, as is any tag whose revision has no entry in
    ``marks_cache`` (a message is written to stderr for those).

    Returns the updated counter.
    """
    for name, tagged in repo.tagslist():
        # ignore latest revision
        if name == 'tip':
            continue

        rev = repo.changelog.rev(tagged)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = marks_cache.get(str(rev), None)
        if ref is None:
            sys.stderr.write(
                'Failed to find reference for creating tag %s at r%d\n'
                % (name, rev))
            continue

        (_, user, (time, timezone), _, desc, branch, _) = \
            get_changeset(ui, repo, rev, authors)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n'
                         % (name, rev, ref))

        summary = desc.split('\n')[0]
        msg = ('hg2git created tag %s for hg revision %d on branch %s'
               ' on (summary):\n\t%s') % (name, rev, branch, summary)

        wr('tag %s' % name)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user, time, timezone))
        # +1 for the newline wr() emits after the message
        wr('data %d' % (len(msg) + 1))
        wr(msg)
        wr()
        count = checkpoint(count)

    return count
def load_cache(filename):
    """Read a cache file into a dictionary.

    Every valid line has the form ``:<key> <value>``: exactly two fields
    separated by a single space, the first one starting with a colon.  The
    key is stored without the colon, the value without its line ending.

    Lines not matching this format are reported on stderr together with
    their line number and skipped.  A file that does not exist yields an
    empty dictionary.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    f = open(filename, 'r')
    try:
        for index, line in enumerate(f):
            fields = line.split(' ')
            # startswith() also copes with an empty first field, which
            # indexing its first character does not
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n'
                                 % (filename, index + 1))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails half-way
        f.close()
    return cache
def save_cache(filename, cache):
    """Write a cache dictionary to ``filename``, replacing its content.

    Each entry becomes one line ``:<key> <value>``, with key and value
    converted by str().  Entries are written in the dictionary's own
    iteration order.
    """
    f = open(filename, 'w')
    try:
        # a plain loop rather than map(): map() must not be relied upon for
        # side effects, as it is lazy on Python 3 and would write nothing
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        # close (and thereby flush) the file even if a write fails
        f.close()
def verify_heads(ui, repo, cache):
    """Compare the cached branch heads with the git repository.

    ui, repo -- not used
    cache    -- branch name -> SHA1 as recorded by the previous run

    For every cached branch the first line of
    ``$GIT_DIR/refs/heads/<branch>`` is read ('/dev/null' stands in for an
    unset GIT_DIR) and compared with the cached value.  A difference only
    produces a warning on stderr.  A ref file that cannot be read counts as
    a difference as well instead of aborting the run.

    Always returns True.
    """
    def getsha1(branch):
        # first line of the branch's ref file, or None if it is unreadable
        path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        try:
            f = open(path)
        except (IOError, OSError):
            return None
        try:
            # readline() gives '' for an empty file instead of failing
            return f.readline().split('\n')[0]
        finally:
            f.close()

    for b in cache.keys():
        sys.stderr.write('Verifying branch [%s]\n' % b)
        sha1 = getsha1(b)
        c = cache.get(b)
        if sha1 != c:
            if sha1 is None:
                shown = '<None>'
            else:
                shown = sha1
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b, shown, c))
    return True
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors={}):
    """Export revisions and tags of a Mercurial repository.

    repourl   -- location of the Mercurial repository
    m         -- exclusive upper revision bound (anything int() accepts);
                 a negative value means "up to the repository's tip"
    marksfile -- cache file with the marks of earlier runs
    headsfile -- cache file with the branch heads of earlier runs
    tipfile   -- state file; its 'tip' entry is the first revision to
                 export and is updated at the end, together with 'repo'
    authors   -- author mapping handed through to the export functions; the
                 shared default dictionary is never modified here

    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    myui, repo = setup_repo(repourl)

    if not verify_heads(myui, repo, heads_cache):
        return 1

    tip = repo.changelog.count()

    # resume where the previous run stopped
    first = int(state_cache.get('tip', 0))
    # never go beyond what the repository actually contains
    stop = _max
    if _max < 0 or stop > tip:
        stop = tip

    # number of commands issued so far; must start out defined
    c = 0
    last = {}
    for rev in range(first, stop):
        c = export_commit(myui, repo, rev, marks_cache, heads_cache, last,
                          tip, c, authors)

    c = export_tags(myui, repo, marks_cache, first, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__ == '__main__':
    # expects exactly five arguments after the script name
    arguments = sys.argv[1:]
    if len(arguments) != 5:
        sys.exit(usage(1))
    repourl, m, marksfile, headsfile, tipfile = arguments
    sys.exit(hg2git(repourl, m, marksfile, headsfile, tipfile))