hg-fast-export.sh should fail if git-fast-import fails
[fast-export/barak.git] / hg-fast-export.py
blob519b556c32165d06cee27c04d6a3033295d27ace
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from tempfile import mkstemp
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
# Regex matching a complete "Signed-off-by: <who>" line (the "o" and "b" may
# be upper or lower case); group 1 captures everything after the colon and
# space. get_author() applies it to individual log message lines.
sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count=0
# write a progress message to stderr every this many file contents written
cfg_export_boundary=1000
def gitmode(flags):
    """Translate Mercurial manifest flags into a git file mode string.

    The symlink flag ('l') takes precedence over the executable flag ('x');
    when neither is present the mode of a regular file is returned.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Emit one line of output on stdout.

    msg is printed followed by a newline; None is treated like the empty
    string, so wr() and wr(None) both print a blank line.
    """
    text = '' if msg is None else msg
    print(text)
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' when one is due.

    A checkpoint command (followed by a blank line) is written whenever
    cfg_checkpoint_count is positive and the new counter value is a
    multiple of it. Returns the incremented counter.
    """
    issued = count + 1
    interval = cfg_checkpoint_count
    if interval > 0 and issued % interval == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    If old_marks holds a non-empty entry for rev, that entry is returned;
    otherwise the mark ':<rev+1>' is used.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when the hex forms of the two file nodes differ."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and classify the file names they contain.

    dleft  -- mapping of file name to file node for "our" revision
    dright -- the same mapping for the parent revision
    l      -- list receiving names present only in dleft
    c      -- list receiving names present in both whose nodes mismatch
    r      -- list receiving names present only in dright
    match  -- predicate called as match(left_node, right_node); a true
              result marks the file as changed

    Lists passed in for l, c and r are appended to in place, so a caller
    can accumulate results over several parents. Any list that is omitted
    starts out fresh and empty on every call (the previous shared mutable
    defaults leaked entries from one call into the next).

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        right = dright.get(name, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif match(dleft[name], right):
            # we have it but checksums mismatch: add to center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a changed file was already handled in the first loop

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files relative to all parents.

    mleft is the manifest of the revision being exported. It is compared
    with the manifest of every parent in parents (negative parent numbers
    are skipped) via split_dict, accumulating into the same three lists.
    The revision argument is not used here.

    Returns (added, changed, removed), each sorted.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from its Signed-off-by trailer.

    Git distinguishes between the author and the committer of a patch.
    Trailing blank lines of the log message are ignored. Starting at the
    last non-blank line, consecutive lines matching sob_re are walked
    upwards; the topmost line of that run names the author, which is then
    passed through fixup_user together with the authors table.

    Only the run at the very end of the message counts: a line in the
    middle of the text that merely starts with "Signed-off-by: foo" is
    deliberately not taken as authorship information.

    Returns committer when the message does not end in such a run.
    """
    lines = logmessage.split('\n')

    # find the end of the message, ignoring trailing blank lines
    end = len(lines)
    while end > 0 and not lines[end - 1].strip():
        end -= 1

    # climb through the block of Signed-off-by lines, remembering the top one
    topmost = None
    for line in reversed(lines[:end]):
        found = sob_re.match(line)
        if found is None:
            break
        topmost = found

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write a fast-import 'M ... inline' entry with data for each file.

    ctx      -- change context supplying file contents via filectx(name)
    manifest -- manifest supplying per-file flags (fed to gitmode)
    files    -- names of the files to export

    Files named ".hgtags" are skipped with a note on stderr. Progress is
    reported on stderr every cfg_export_boundary files, plus one final
    line when the list is longer than cfg_export_boundary.
    """
    total = len(files)
    written = 0
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        data = ctx.filectx(name).data()
        mode = gitmode(manifest.flags(name))
        wr('M %s inline %s' % (mode, name))
        # length is taken from the data itself (size() caused trouble)
        wr('data %d' % len(data))
        wr(data)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- label used in the warning message ("branch", "tag", ...)

    The following rewrites are applied, in this order:
      * each of the characters '[', ' ', '~', '^', ':', '?', '*' and each
        '..' sequence becomes '_'
      * a trailing '/' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    Empty names and empty path components (as in 'a//b' or '/x') are
    passed through unchanged rather than raising IndexError.

    A warning is written to stderr when the result differs from the
    input. Returns the sanitized name.
    """

    def dot(component):
        # startswith() is safe for an empty component, unlike component[0]
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ~^:?*]|\.\.)', '_', name)
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap):
    """Write one hg revision to stdout as a git-fast-import commit.

    ui, repo  -- handed to get_changeset to read the changeset
    revision  -- hg revision number to export; its mark is revision+1
    old_marks -- known references of earlier revisions (see revnum_to_revref)
    max       -- total number of revisions, used only in the progress line
    count     -- running command counter, passed on to checkpoint()
    authors   -- author table handed to get_changeset and get_author
    sob       -- when true, also emit an 'author' line taken from
                 Signed-off-by lines of the log message
    brmap     -- cache of hg branch name -> sanitized git branch name;
                 updated in place

    Returns the counter value returned by checkpoint(count).
    """

    def get_branchname(name):
        # sanitize each branch name once and remember the result
        if name not in brmap:
            brmap[name] = sanitize_name(name)
        return brmap[name]

    changeset = get_changeset(ui, repo, revision, authors)
    (revnode, _, user, (time, timezone), files, desc, branch, _) = changeset

    branch = get_branchname(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # one more than len(desc): presumably covers the newline wr() appends
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # Order parents by node rather than by revision number so the
    # resulting git repo is the same no matter how the revisions were
    # numbered.
    parents.sort(key=repo.changelog.node, reverse=True)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if not parents:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # non-merge revision: with exactly one parent the changes are
            # taken from repo.status() instead of expensively comparing
            # checksums
            status = repo.status(repo.lookup(parents[0]), revnode)[:3]
            changed, added, removed = status[0], status[1], status[2]
            kind = 'simple delta'
        else:
            # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # comparing checksums is expensive for many files, so it is
            # only done for merges, where hg's revlog logic requires it
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors):
    """Emit a fast-import 'reset refs/tags/...' for every exportable hg tag.

    repo          -- repository whose tagslist() is walked
    old_marks     -- known references of revisions (see revnum_to_revref)
    mapping_cache -- maps hex node ids to hg revision numbers
    count         -- running command counter, bumped via checkpoint()

    The 'tip' tag is ignored, and so are tags whose node is missing from
    mapping_cache. ui and authors are not used here.

    Returns the updated command counter.
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue

        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dict.

    Every line is expected to look like "key = value"; the text before
    the first '=' becomes the key and the remainder the value, both
    stripped of surrounding whitespace. Lines that do not fit are
    reported on stderr with their line number and skipped.

    Returns an empty dict when filename does not exist. The number of
    entries successfully read is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0  # current line number, for error messages
    loaded = 0  # number of valid mapping lines seen
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # release the handle even if reading or parsing fails
        f.close()

    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def verify_heads(ui, repo, cache, force):
    """Check that the git side and the hg heads are safe to export onto.

    Two checks are made:
      1. For every hg branch, the SHA1 reported by get_git_sha1 must equal
         the value stored in cache; a difference means the git branch was
         modified outside hg-fast-export.
      2. No branch name may occur on more than one of repo.heads().

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    branches = repo.branchtags()
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in branches.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, name in ordered:
        name = get_branch(name)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False):
    """Export an hg repository to stdout as a git-fast-import stream.

    repourl     -- location of the hg repository, passed to setup_repo
    m           -- maximum hg revision to export; negative means up to tip
    marksfile   -- file holding git-fast-import's marks from earlier runs
    mappingfile -- file holding the hex node -> hg revision mapping
    headsfile   -- file holding the git heads recorded by the last run
    tipfile     -- state file; its 'tip' entry is the first revision to
                   export and is rewritten after the commits are exported
    authors     -- author table (defaults to an empty one)
    sob         -- enable Signed-off-by parsing for author lines
    force       -- continue despite verify_heads errors

    Returns 1 when verify_heads rejects the repository, 0 otherwise.
    """
    if authors is None:
        # fresh table per call instead of a shared mutable default
        authors = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # record the node -> revision mapping for every revision up to 'last'
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # command counter; it must start at zero before the first export_commit
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, sob, brmap)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    # Command line entry point: parse options, check that the required
    # ones are present, then run the export and exit with its status.

    def bail(parser, opt):
        """Report a missing required option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # -1 means "no limit": export everything up to tip
    limit = -1
    if options.max is not None:
        limit = options.max

    # required options, checked in this order
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for attr, flag in required:
        if getattr(options, attr) is None:
            bail(parser, flag)

    authormap = {}
    if options.authorfile is not None:
        authormap = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, limit, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile, authors=authormap,
                    sob=options.sob, force=options.force))