Allow branches and tags to be remapped
[fast-export.git] / hg-fast-export.py
blob fa8229249b2b62cbe519376b91256e213a03d04d
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so each LF written would be
    # turned into CRLF. That makes git blow up, so switch the descriptor
    # to binary mode with the platform-specific msvcrt module.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# Pattern for a Signed-off-by line of a log message; group 1 is the signer.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 disables it.
cfg_checkpoint_count = 0
# Write a progress message every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Map hg manifest flags to a git file mode string.

    Returns '120000' when 'l' is in flags, otherwise '100755' when 'x' is
    in flags, otherwise '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline.

    Nothing is written when msg is empty.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg (if any) to stdout, followed by a single newline."""
    wr_no_nl(msg)
    # The newline is written even when msg is empty, so wr() emits a
    # blank line.
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the command counter and emit a checkpoint when one is due.

    The counter is incremented by one and returned. When
    cfg_checkpoint_count is positive and the new counter is a multiple of
    it, a 'checkpoint' command plus a blank line is written to stdout and
    a notice goes to stderr.
    """
    count += 1
    # Testing the <= 0 case first also avoids a modulo by zero.
    if cfg_checkpoint_count <= 0 or count % cfg_checkpoint_count != 0:
        return count
    sys.stderr.write("Checkpoint after %d commits\n" % count)
    wr('checkpoint')
    wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    If old_marks holds a truthy entry for rev, that entry is returned
    (an SHA1, per the marks file). Otherwise the mark ':<rev+1>' is
    returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions differ.

    The comparison is done on the hex form of both nodes.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Find files added, changed and missing between two manifests.

    dleft is our manifest and dright the parent's. Both must support
    keys(), get(), item access and flags(name).

    l, c and r are optional accumulator lists. Results are appended to
    them in place, so a caller can chain several calls. When omitted, a
    fresh list is created for each call. (The previous list defaults were
    shared between calls and kept growing.)

    match(left_node, right_node) returns True when two file revisions
    differ.

    Returns the tuple (l, c, r):
      l -- names present in dleft but not in dright
      c -- names present in both whose content (per match) or git mode
           differs
      r -- names present in dright but not in dleft
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif (match(dleft[name], theirs)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # both have it but checksum or mode mismatch: add to center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a change is already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added/changed/removed files of a manifest against its parents.

    mleft is compared with the manifest of every non-negative revision in
    parents via split_dict, accumulating into the same three lists. The
    revision argument is not used here.

    Returns (added, changed, removed), each sorted.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    the author is guessed from the log message: trailing blank lines are
    skipped, then the contiguous run of Signed-off-by lines at the end of
    the message is walked upwards. The topmost line of that run wins and
    its signer is passed through fixup_user together with authors.

    Only the trailing run is considered, because a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle.

    Returns committer when the last non-empty line is not a Signed-off-by
    line, or when the message has no non-empty line at all.
    """
    lines = logmessage.split('\n')

    # Position on the last non-blank line (-1 if every line is blank).
    pos = len(lines) - 1
    while pos >= 0 and not lines[pos].strip():
        pos -= 1

    # Climb the run of Signed-off-by lines, remembering the topmost match.
    topmost = None
    while pos >= 0:
        found = sob_re.match(lines[pos])
        if found is None:
            break
        topmost = found
        pos -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Write inline 'M' (modify) commands for the given files to stdout.

    For every name in files, the content is read from ctx and emitted as
    'M <mode> inline <path>' followed by 'data <length>' and the content.
    The mode comes from the manifest flags. The path has a leading slash
    stripped and, when encoding is given, is re-encoded from that encoding
    to UTF-8.

    '.hgtags' is skipped unless hgtags is true. Progress goes to stderr
    every cfg_export_boundary files, plus a final line when the number of
    files exceeds that boundary.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        data = ctx.filectx(path).data()
        git_path = path
        if encoding:
            git_path = path.decode(encoding).encode('utf8')
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        # Use the length of the data actually read for the byte count.
        wr('data %d' % len(data))
        wr(data)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    The following rewrites are applied, in order:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..'
        becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    An empty name and empty path components (from a leading or doubled
    '/') are passed through unchanged instead of raising IndexError.

    what is only used in the warning written to stderr when the result
    differs from name. Returns the sanitized name.
    """
    def dot(component):
        # startswith() is safe on an empty component, unlike component[0].
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    # Slicing keeps this safe for the empty string.
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only one slash is removed. An empty filename is returned unchanged
    instead of raising IndexError.
    """
    # Slicing instead of indexing keeps the empty string safe.
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui, repo, revision, old_marks, max, count, authors,
                  branchesmap, sob, brmap, hgtags, notes, encoding=''):
    """Write one hg revision to stdout as a git-fast-import commit.

    Parameters:
      ui, repo    -- passed on to get_changeset; repo also supplies the
                     parents, manifest and status of the revision
      revision    -- hg revision number; the commit gets mark
                     ':<revision+1>'
      old_marks   -- mapping consulted by revnum_to_revref for parents
      max         -- only used as the total in the progress message
      count       -- running command counter, advanced via checkpoint()
      authors     -- author mapping, passed to get_changeset/get_author
      branchesmap -- hg branch name -> replacement name, applied before
                     sanitizing
      sob         -- when true, emit an 'author' line derived from
                     Signed-off-by lines
      brmap       -- cache of already translated branch names; updated
                     in place
      hgtags      -- passed to export_file_contents
      notes       -- passed to generate_note
      encoding    -- when set, removed file names are re-encoded from it
                     to UTF-8

    Returns the updated command counter.
    """
    def get_branchname(name):
        # Translate each hg branch name once and remember the result.
        if name in brmap:
            return brmap[name]
        n = sanitize_name(branchesmap.get(name, name))
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), _, desc, branch, _) = get_changeset(
        ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # A parentless revision other than r0 starts a new root: reset the ref.
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, so that newline is counted as part
    # of the data payload.
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            # the status triple is read as (changed, added, removed)
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it
            # for merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    if encoding:
        removed = [r.decode(encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # Plain loop instead of map(): the calls are made for their side effect.
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags, encoding)
    export_file_contents(ctx, man, changed, hgtags, encoding)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count
def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Optionally attach the hg hash of a revision as a git note.

    When notes is false, nothing is written and count is returned as is.
    Otherwise a commit on refs/notes/hg is written to stdout that
    annotates mark ':<revision+1>' with ctx.hex(), and the counter is
    advanced through checkpoint().
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        hg_hash = ctx.hex()
        wr('data %d' % (len(hg_hash)))
        # The hash is written without a newline of its own, so the byte
        # count above matches exactly; wr() then ends the line.
        wr_no_nl(hg_hash)
        wr()
        count = checkpoint(count)
    return count
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Each tag from repo.tagslist() is remapped through tagsmap and
    sanitized. A tag is skipped when its resulting name is 'tip', when
    its node (hex) is missing from mapping_cache, or when no reference
    can be found for it. The ui and authors arguments are not used here.

    Returns the command counter, advanced once per exported tag.
    """
    for tag, tagnode in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(tagsmap.get(tag, tag), "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename):
    """Load a 'key = value' mapping file into a dict.

    Blank lines and lines starting with '#' are ignored. Every other line
    must contain '=' with a non-empty key and value. Both are stripped of
    surrounding whitespace, and the value may itself contain '='. Lines
    that do not fit are reported on stderr with their line number and
    skipped.

    name is only used in the 'Loaded N <name>' summary written to stderr.
    Returns the mapping, which is empty when filename does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # Both counters start at zero before the loop increments them.
    lineno = 0
    loaded = 0
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store the stripped key and value
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # close the file even if reading or parsing raised
        f.close()
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    Heads are scanned from last to first. The first one whose changelog
    entry has no 'close' key at index 5 (presumably the extra dict -
    confirm against the hg API) is returned. If every head is closed, the
    last head is returned.
    '''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force):
    """Check that the git side still matches what was exported last time.

    Two checks are made:
      * for the tip of every hg branch (see branchtip), the SHA1 reported
        by get_git_sha1 must equal the entry stored in cache
      * no branch may show up more than once among repo.heads()

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true, and True otherwise.
    """
    tips = {}
    for bn, heads in repo.branchmap().iteritems():
        tips[bn] = branchtip(repo, heads)
    # Sort by negated revision number, i.e. newest tip first.
    ordered = [(-repo.changelog.rev(n), n, t) for t, n in tips.items()]
    ordered.sort()

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors={}, branchesmap={}, tagsmap={},
           sob=False, force=False, hgtags=False, notes=False, encoding=''):
    """Export an hg repository to stdout as a git-fast-import stream.

    Parameters:
      repourl     -- repository location handed to setup_repo
      m           -- maximum hg revision to export; a negative value (or
                     one past the tip) means "up to the tip"
      marksfile, mappingfile, headsfile, tipfile
                  -- cache files read via load_cache; tipfile and
                     mappingfile are rewritten via save_cache
      authors, branchesmap, tagsmap
                  -- name mappings passed to the export helpers
      sob, hgtags, notes, encoding
                  -- passed through to export_commit
      force       -- passed to verify_heads to ignore validation errors

    Export resumes at the 'tip' value stored in tipfile (0 if absent).

    Returns 1 when head verification fails, 0 on success.
    """
    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # Fall back to len(repo) when the changelog has no count() method.
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    start = int(state_cache.get('tip', 0))
    stop = _max
    if _max < 0 or stop > tip:
        stop = tip

    # Record the node -> revision number mapping for every revision below
    # stop, not only the ones exported in this run.
    for rev in range(0, stop):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # Running count of issued commands. It must start at zero before the
    # first export_commit call reads it.
    c = 0
    brmap = {}
    for rev in range(start, stop):
        c = export_commit(ui, repo, rev, old_marks, stop, c, authors, branchesmap,
                          sob, brmap, hgtags, notes, encoding)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        # Report a missing mandatory option, show usage and exit with 2.
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    def load_optional(label, path):
        # An option that was not given yields an empty mapping.
        if path is None:
            return {}
        return load_mapping(label, path)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")

    (options, args) = parser.parse_args()

    # -1 tells hg2git to export up to the tip.
    m = -1
    if options.max is not None:
        m = options.max

    # Mandatory options, checked in this order; bail() exits on the first
    # one that is missing.
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for attr, flag in required:
        if getattr(options, attr) is None:
            bail(parser, flag)

    a = load_optional('authors', options.authorfile)
    b = load_optional('branches', options.branchesfile)
    t = load_optional('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = options.encoding
    if encoding is None:
        encoding = ''

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, branchesmap=b, tagsmap=t,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding))