hg2git.py: Create only lightweight tags
[fast-export/barak.git] / hg2git.py
blob966ee63f59da6892bf4689c8d849a707082c6005
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: GPLv2
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
8 """
10 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
11 from tempfile import mkstemp
12 from optparse import OptionParser
13 import re
14 import sys
15 import os
# Matches a complete "Signed-off-by: <who>" line; group 1 captures <who>.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Matches a user field ending in "Name <mail>"; groups are the name and the
# address including its angle brackets.
user_re = re.compile(r'([^<]+) (<[^>]+>)$')
# Matches a user name that is wrapped entirely in double quotes; group 1 is
# the name without the quotes.
user_clean_re = re.compile(r'^["]([^"]+)["]$')
# Git branch name substituted for hg's default 'HEAD' branch.
cfg_master = 'master'
# A 'checkpoint' command is emitted whenever the command counter reaches a
# multiple of this value; 0 disables checkpoints altogether.
cfg_checkpoint_count = 0
# A progress message is written each time this many file contents were exported.
cfg_export_boundary = 1000
def usage(ret):
    """Write this module's docstring to stderr and return *ret* unchanged."""
    help_text = __doc__
    sys.stderr.write(help_text)
    return ret
def setup_repo(url):
    """Open the Mercurial repository at *url*.

    Returns a ``(ui, repository)`` pair, where the ui object is a freshly
    created ``ui.ui()`` that was also used to open the repository.
    """
    hg_ui = ui.ui()
    repository = hg.repository(hg_ui, url)
    return hg_ui, repository
def fixup_user(user,authors):
    """Turn an hg user string into a ``Name <mail>`` string.

    user    -- the user field as recorded in the changeset
    authors -- optional mapping of user strings to replacements (may be None);
               users missing from the mapping are kept as they are

    A user already in 'Name <mail>' form is split into its two parts.
    Otherwise the whole string becomes the name, and the mail part is
    '<user>' when the string contains an '@' and '<devnull@localhost>'
    when it does not. Finally, double quotes enclosing the whole name are
    dropped.
    """
    if authors is not None:
        # fall back to the unmapped value when there is no entry
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        name, mail = user, '<%s>' % user
    else:
        name, mail = user, '<devnull@localhost>'

    # strip quoting around the name, if any
    quoted = user_clean_re.match(name)
    if quoted is not None:
        name = quoted.group(1)
    return '%s %s' % (name, mail)
def get_branch(name):
    """Return the git branch name for hg branch *name*.

    hg's 'HEAD' is replaced by the configured cfg_master; every other name
    is returned untouched.
    """
    if name != 'HEAD':
        return name
    return cfg_master
def get_changeset(ui,repo,revision,authors={}):
    """Read one changeset and return its fields prepared for export.

    ui       -- unused here, kept for a uniform call signature
    repo     -- repository object providing lookup() and changelog.read()
    revision -- anything repo.lookup() accepts
    authors  -- mapping handed to fixup_user(); it is only read, never
                modified, so the shared default dict is harmless

    Returns (manifest, user, (time, tz), files, desc, branch, extra) where
    user went through fixup_user(), branch through get_branch() and tz is
    a '+HHMM' / '-HHMM' string.
    """
    changeset_node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(changeset_node)

    # The stored offset has the opposite sign of the +HHMM notation (the
    # original code negated it as well). Split the *absolute* value into
    # hours and minutes: flooring a negative offset turned e.g. 12600
    # seconds into '-0430' instead of '-0330', and produced a '+' sign for
    # offsets of less than an hour.
    offset = -timezone
    sign = '+'
    if offset < 0:
        sign = '-'
    hours, minutes = divmod(abs(offset) // 60, 60)
    tz = '%s%02d%02d' % (sign, hours, minutes)

    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user, authors), (time, tz), files, desc, branch, extra)
def gitmode(x):
    """Return the file mode string '100755' for a truthy *x*, else '100644'."""
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write *msg* followed by a single newline to stdout.

    Called without an argument this emits just an empty line.
    """
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Advance the command counter by one and return the new value.

    When cfg_checkpoint_count is positive and the new value is a multiple
    of it, a 'checkpoint' command followed by an empty line is written and
    a note goes to stderr.
    """
    issued = count + 1
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def get_parent_mark(parent,marks):
    """Return the reference to use for revision *parent*.

    The mark number of a revision is its number plus one. If *marks* has an
    entry under that number (as a string), the entry is returned; otherwise
    the ':<mark>' form is returned.
    """
    mark_id = parent + 1
    key = str(mark_id)
    if key in marks:
        return marks[key]
    return ':%d' % mark_id
def mismatch(f1,f2):
    """Return True when the hex forms of the two file revision ids differ."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def outer_set(dleft,dright,l,c,r):
    """Compare two file mappings and record the differences.

    dleft  -- mapping of file name to revision id for the current revision
    dright -- the same kind of mapping for a parent revision
    l      -- list receiving files present only in dleft
    c      -- list receiving files present in both whose ids mismatch()
    r      -- list receiving files present only in dright

    The three lists are extended in place and also returned as a tuple.
    """
    for path in dleft.keys():
        theirs = dright.get(path, None)
        if theirs is None:
            # parent lacks the file
            l.append(path)
            continue
        if mismatch(dleft[path], theirs):
            # both have it, but the contents differ
            c.append(path)

    # Changed files were already caught above, so the second pass only has
    # to look for files that exist in the parent alone.
    r.extend([path for path in dright.keys() if dleft.get(path, None) is None])
    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added, changed and removed files relative to the parents.

    repo     -- repository object providing changectx()
    revision -- unused here, kept for the existing call signature
    parents  -- parent revision numbers; negative entries are skipped
    mleft    -- manifest of the revision being exported

    Returns the tuple (added, changed, removed) accumulated by outer_set()
    over all parents that were not skipped. With no such parent all three
    lists are empty.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        # outer_set() works on the manifests directly; the sorted key
        # lists that used to be built here were never used.
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage,committer,authors):
    """Derive the author of a commit from trailing Signed-off-by lines.

    Git distinguishes author and committer. The log message is scanned
    from its end: trailing empty lines are skipped, then the block of
    consecutive Signed-off-by lines ending at the last non-empty line is
    walked upwards. The topmost line of that block names the author, which
    is passed through fixup_user() together with *authors*.

    Only that trailing block is considered, so a line in the middle of the
    message that merely happens to start with "Signed-off-by:" is not
    mistaken for authorship information.

    Returns *committer* when the last non-empty line is not a
    Signed-off-by line (or the message has no non-empty line at all).
    """
    lines = logmessage.split('\n')

    # find the last non-empty line
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the Signed-off-by block, remembering its topmost match
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files):
    """Write an inline 'M' (modify) entry with full contents for each file.

    ctx      -- change context providing filectx()
    manifest -- manifest providing execf() to decide the file mode
    files    -- names of the files to write

    Progress is reported on stderr every cfg_export_boundary files, and
    once more at the end when there were more files than that.
    """
    total = len(files)
    written = 0
    for path in files:
        contents = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # the length is taken from the data itself; per the original
        # author, size() gave trouble
        wr('data %d' % len(contents))
        wr(contents)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob):
    """Write one hg revision as a 'commit' command and return the new count.

    ui       -- passed through to get_changeset()
    repo     -- repository object
    revision -- number of the revision to export; its mark is revision+1
    marks    -- dict: mark number (as string) -> reference, see get_parent_mark()
    heads    -- dict: branch -> cached head from an earlier run; an entry is
                emptied once its branch has been initialized
    last     -- dict: branch -> revision most recently exported on it
    max      -- upper bound of the export, used for progress messages only
    count    -- command counter, advanced via checkpoint()
    authors  -- author mapping for fixup_user()
    sob      -- when true, emit an 'author' line derived from
                Signed-off-by lines
    """
    (_, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, so the announced length is one larger
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # A cached head means this is an incremental import: start the
        # branch from it and clear the entry so this happens only once.
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, src))
        link = src  # avoid making a merge commit for incremental import
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        l = last.get(branch, revision)
        for p in parents:
            # 1) this commit implicitly is the child of the most recent
            #    commit of its branch, so that parent is skipped
            # 2) nonexistent parents are skipped
            # 3) everything else is merged
            if p == l or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # a branch fork already got its 'from' above; don't repeat the
            # same parent as a 'merge'
            if tmp == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # Remember the mark for later lookups (tags, merges). get_parent_mark()
    # looks a revision up under its mark number, i.e. revision+1; storing
    # under str(revision) put the entry on the previous revision's key and
    # made that revision resolve to this commit.
    marks[str(revision + 1)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()

    if revision == 0:
        # first revision: feed in full manifest
        sys.stderr.write('Exporting full revision %d/%d with %d added files\n' %
                         (revision + 1, max, len(man.keys())))
        export_file_contents(ctx, man, man.keys())
    else:
        # later revision: feed in changed manifest
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        sys.stderr.write('Exporting delta revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                         (revision + 1, max, len(added), len(changed), len(removed)))
        export_file_contents(ctx, man, added + changed)
        for r in removed:
            wr('D %s' % r)

    wr()
    return checkpoint(count)
def export_tags(ui,repo,marks_cache,start,end,count,authors):
    """Write a 'reset refs/tags/<tag>' command for each tag in range.

    ui, authors -- unused here
    repo        -- repository object providing tagslist() and changelog.rev()
    marks_cache -- mark mapping handed to get_parent_mark()
    start, end  -- only tags on revisions with start <= rev < end are written
    count       -- command counter, advanced via checkpoint() per tag

    The 'tip' tag is always skipped. Returns the updated counter.
    """
    for tag, tagged_node in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagged_node)
        # ignore tags outside the import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map from *filename*.

    Each line has the form 'key = value'; key and value are stripped of
    surrounding whitespace. Lines not matching that form are reported on
    stderr with their line number and skipped.

    Returns a dict of key -> value; it is empty when the file does not
    exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        # the line counter has to start somewhere: it was incremented
        # without ever being initialized
        lineno = 0
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails
        f.close()
    # report the entries actually loaded rather than the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Read a ':key value' cache file into a dict.

    Each valid line consists of exactly two space-separated fields, the
    first of which starts with ':'. The key is stored without the colon,
    the value without its trailing newline. Other lines are reported on
    stderr with their line number and skipped.

    Returns a dict of key -> value (both strings); it is empty when the
    file does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    f = open(filename, 'r')
    try:
        # the line counter has to start somewhere: it was incremented
        # without ever being initialized
        lineno = 0
        for line in f:
            lineno += 1
            fields = line.split(' ')
            # startswith() also copes with an empty first field (a line
            # beginning with a space), where indexing [0] would raise
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails
        f.close()
    return cache
def save_cache(filename,cache):
    """Write *cache* to *filename*, one ':key value' line per entry.

    Keys and values are converted with str(). An existing file is
    overwritten; an empty cache yields an empty file.
    """
    f = open(filename, 'w+')
    try:
        # a plain loop instead of map(): map() used only for its side
        # effects writes nothing where it is evaluated lazily
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        # close the file even if a write fails
        f.close()
def verify_heads(ui,repo,cache,force):
    """Check the cached heads against git and the repository's heads.

    ui    -- passed through to get_changeset()
    repo  -- repository object providing branchtags(), heads(), changelog
    cache -- dict: git branch name -> SHA1 recorded by an earlier run
    force -- when true, problems are reported but do not abort

    Two checks are made: for every hg branch the first line of
    $GIT_DIR/refs/heads/<branch> must equal the cached value, and no
    branch may appear on more than one repository head. Returns False at
    the first problem unless *force* is set, True otherwise.
    """
    def getsha1(branch):
        # first line of the branch's ref file, or None if it can't be read
        ref_path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        try:
            ref_file = open(ref_path)
            first_line = ref_file.readlines()[0]
            ref_file.close()
        except IOError:
            return None
        return first_line.split('\n')[0]

    # Only hg's branches are verified, not everything git has; they are
    # visited ordered by descending revision number of their entry.
    by_rev = sorted([(-repo.changelog.rev(n), n, t)
                     for t, n in repo.branchtags().items()])
    for entry in by_rev:
        b = get_branch(entry[2])
        sys.stderr.write('Verifying branch [%s]\n' % b)
        sha1 = getsha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = {}
    for h in repo.heads():
        branch = get_changeset(ui, repo, h)[5]
        if seen.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen[branch] = True

    return True
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors={},sob=False,force=False):
    """Export revisions and tags of an hg repository; return an exit code.

    repourl    -- URL of the repository to export
    m          -- upper bound (exclusive) of revisions to export; a negative
                  value means "up to the tip"
    marksfile  -- cache file with marks
    headsfile  -- cache file with the heads of the last run
    tipfile    -- state file; its 'tip' entry is where the export resumes,
                  and it is rewritten at the end
    authors    -- author mapping (only read, so the shared default is harmless)
    sob        -- derive authors from Signed-off-by lines
    force      -- continue despite verification errors

    Returns 1 when verify_heads() fails, 0 otherwise.
    """
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    myui, repo = setup_repo(repourl)

    if not verify_heads(myui, repo, heads_cache, force):
        return 1

    # presumably the number of changesets, i.e. one past the last revision
    tip = repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    stop = _max
    # no limit given, or a limit beyond the repository: stop at the tip
    # instead of asking for revisions that do not exist
    if _max < 0 or _max > tip:
        stop = tip

    # the command counter must start at zero before the first export
    c = 0
    last = {}
    for rev in range(first, stop):
        c = export_commit(myui, repo, rev, marks_cache, heads_cache, last,
                          stop, c, authors, sob)

    c = export_tags(myui, repo, marks_cache, first, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing required option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # a negative maximum means "no limit" to hg2git()
    m = -1
    if options.max is not None:
        m = options.max

    # Each required option is checked against its own value; all four
    # checks used to test the marks file, so a missing --heads, --status
    # or --repo went unnoticed.
    if options.marksfile is None:
        bail(parser, '--marks')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob,
                    force=options.force))