hg-fast-export: Catch up with mercurial crew API changes
[fast-export/benizi.git] / hg-fast-export.py
blob ca3ab6950f8e3c1d21cb255f56462bb1e67a04ab
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset,load_cache,save_cache,get_git_sha1
8 from tempfile import mkstemp
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Pattern for a "Signed-off-by:" trailer line in a log message; group 1
# captures everything after the colon and the following space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every this many commits; 0 disables checkpoints.
cfg_checkpoint_count = 0

# Print a progress message each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Map a file's flags to the git file mode string used in 'M' commands.

    An 'l' flag yields '120000', otherwise an 'x' flag yields '100755';
    everything else is exported as '100644'.  'l' takes precedence when both
    flags are present.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg followed by a newline to standard output.

    None is treated like the empty string, so wr(None) and wr() both emit a
    bare newline.
    """
    if msg is None:
        msg = ''
    # sys.stdout.write behaves the same on Python 2 and 3, unlike the
    # 'print' statement
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Increment the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the
    incremented counter is a multiple of it.  Returns the incremented counter.
    """
    issued = count + 1
    interval = cfg_checkpoint_count
    due = interval > 0 and issued % interval == 0
    if not due:
        return issued
    sys.stderr.write("Checkpoint after %d commits\n" % issued)
    wr('checkpoint')
    wr()
    return issued
def get_parent_mark(parent,marks):
    """Return the reference to use for a parent revision.

    Looks up str(parent) in marks and returns the stored entry; when there is
    none, falls back to the ':<parent+1>' mark syntax.
    """
    fallback = ':%d' % (parent + 1)
    key = str(parent)
    return marks.get(key, fallback)
def file_mismatch(f1,f2):
    """Return True if two file revision ids differ (compared via node.hex)."""
    return node.hex(f1) != node.hex(f2)

def split_dict(dleft,dright,l=None,c=None,r=None,match=file_mismatch):
    """Sort the keys of two mappings into left-only, changed and right-only.

    dleft  -- mapping for the revision being exported
    dright -- mapping for one of its parents
    l,c,r  -- optional lists to append to (left-only / changed / right-only);
              fresh lists are created when omitted, so results never leak
              from one call into the next
    match  -- callable(left_value, right_value) returning True when the two
              values differ

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif match(dleft[name], other):
            # both have it but the values mismatch: add to center set
            c.append(name)
    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a changed file was already handled in the first loop
    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added, changed and removed files of a revision.

    mleft is compared against the manifest of every parent whose revision
    number is not negative; the results for all parents are accumulated by
    split_dict.  Returns three sorted lists: (added, changed, removed).
    The revision argument is not used.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            mright = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, mright,
                                                 added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the git author of a commit from its Signed-off-by lines.

    git distinguishes between the author and the committer of a patch.
    Starting at the end of the log message, trailing blank lines are skipped;
    from the last non-blank line the contiguous run of Signed-off-by lines is
    followed upwards and the topmost one is used.  Its value is passed through
    fixup_user() together with the authors table.

    Only a trailing run counts: a "Signed-off-by: foo" line that happens to
    sit in the middle of a message is ignored on purpose.

    Returns the committer unchanged when no such trailing line exists.
    """
    lines = logmessage.split('\n')

    # skip trailing blank lines; the bounds check comes first so an all-blank
    # message never relies on negative-index wrap-around
    i = len(lines) - 1
    while i >= 0 and len(lines[i].strip()) == 0:
        i -= 1

    # walk further upwards over the run of sob lines, remembering the topmost
    first = None
    while i >= 0:
        m = sob_re.match(lines[i])
        if m is None:
            break
        first = m
        i -= 1

    if first is not None:
        return fixup_user(first.group(1), authors)
    return committer
def export_file_contents(ctx,manifest,files):
    """Write an inline 'M' command plus data for every file in files.

    File contents come from ctx.filectx(name).data() and the mode from the
    manifest flags.  Progress goes to stderr every cfg_export_boundary files,
    and once more at the end when there were more files than that.
    """
    total = len(files)
    done = 0
    for path in files:
        data = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, path))
        # length is taken from the data itself (size() caused trouble)
        wr('data %d' % len(data))
        wr(data)
        done += 1
        if done % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (done, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (done, total))
def is_merge(parents):
    """Return True if more than one entry of parents is a real revision.

    Negative revision numbers stand for "no parent" and are not counted.
    """
    real = 0
    for parent in parents:
        if parent >= 0:
            real += 1
    return real > 1
def sanitize_name(name,what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    - '[', space, '~', '^', ':', '?', '*' and '..' are replaced by '_'
    - a trailing '/' becomes '_'
    - a leading '.' of any '/'-separated component becomes '_'
    - runs of '_' are collapsed into one

    A warning naming `what` is written to stderr when the result differs from
    the input.  Empty names and empty path components (e.g. 'a//b') pass
    through instead of raising IndexError.  Returns the sanitized name.
    """
    def dot(component):
        # startswith() is safe for the empty string, unlike component[0]
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'[\[ ~^:?*]|\.\.', '_', name)
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob,brmap):
    """Write one revision to stdout as a git-fast-import 'commit' command.

    ui, repo -- passed to get_changeset(); repo is also asked for the
                parents, the manifest and the status of the revision
    revision -- revision number; its mark is revision+1
    marks    -- dict str(revision) -> reference, updated for this revision
    heads    -- dict branch -> cached head; an entry is used once for the
                initial 'from' and then blanked
    last     -- dict branch -> last revision exported on that branch
    max      -- only used in the progress message
    count    -- command counter, handed to checkpoint()
    authors  -- author table forwarded to get_changeset()/get_author()
    sob      -- when true, emit an 'author' line derived from Signed-off-by
    brmap    -- cache of raw branch name -> sanitized branch name

    Returns the counter as returned by checkpoint(count).
    """
    def get_branchname(name):
        # sanitize every branch name only once, so its warning is not repeated
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)
    parents = repo.changelog.parentrevs(revision)

    branch = get_branchname(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1 accounts for the newline wr() appends after desc
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    # pidx1 indexes the parent with the higher revision number, pidx2 the other
    pidx1, pidx2 = 0, 1
    if parents[0] < parents[1]:
        pidx1, pidx2 = 1, 0

    src = heads.get(branch,'')
    link = ''
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch,src))
        link = src
    elif link == '' and branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch,tmp))
        link = tmp
    elif (last.get(branch,revision) != parents[pidx1]
          and parents[pidx1] >= 0 and revision > 0):
        # the parent is not the commit we exported last on this branch.
        # ">= 0" rather than "> 0": revision 0 is a valid parent, only
        # negative numbers mean "no parent" (cf. is_merge/get_filechanges)
        pm = get_parent_mark(parents[pidx1],marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch,revision,branch,parents[pidx1]))
        wr('from %s' % pm)

    if parents[pidx2] >= 0:
        # second real parent: record it as a merge (revision 0 included)
        pm = get_parent_mark(parents[pidx2],marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch,pm,parents[pidx2]))
        wr('merge %s' % pm)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d'%(revision+1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo,revision,parents,man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]),revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    # a plain loop instead of map(): map() is lazy on Python 3 and would
    # silently emit nothing
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx,man,added)
    export_file_contents(ctx,man,changed)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,marks_cache,start,end,count,authors):
    """Write a 'reset refs/tags/<tag>' command for every tag in range.

    The 'tip' tag is skipped, as are tags whose revision lies outside
    start <= rev < end.  Each exported tag goes through checkpoint(); the
    resulting counter is returned.  ui and authors are not used.
    """
    for rawtag, tagnode in repo.tagslist():
        tag = sanitize_name(rawtag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file with one 'key = value' entry per line.

    Returns a dict mapping the stripped key to the stripped value.  A missing
    file yields an empty dict.  Lines that do not match the expected format
    are reported on stderr with their line number and skipped.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # put key:value in cache, both sides stripped of whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading or matching raises
        f.close()
    # reports the number of lines read, including skipped ones
    sys.stderr.write('Loaded %d authors\n' % lineno)
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the repository state before exporting.

    For every entry of repo.branchtags(), ordered by descending revision, the
    SHA1 reported by get_git_sha1() is compared with the cached value; a
    difference is reported as an error.  Afterwards every repository head is
    checked so that no branch name occurs twice.

    Returns False on the first error unless force is set, True otherwise.
    """
    by_rev = sorted((-repo.changelog.rev(n), n, t)
                    for t, n in repo.branchtags().items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, tagname in by_rev:
        b = get_branch(tagname)
        sha1 = get_git_sha1(b)
        cached = cache.get(b)
        if sha1 is not None and cached is not None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b,sha1,cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_) = get_changeset(ui,repo,h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def mangle_mark(mark):
    """Return the number in mark decremented by one, as a string."""
    shifted = int(mark) - 1
    return '%d' % shifted
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export revisions of the repository at repourl as a fast-import stream.

    repourl   -- repository location handed to setup_repo()
    m         -- upper revision bound; negative means "up to the tip"
    marksfile -- cache file with marks, loaded via mangle_mark
    headsfile -- cache file with the heads of the previous run
    tipfile   -- state cache; read for the start revision ('tip') and
                 rewritten with the new 'tip' and 'repo' at the end
    authors   -- optional author table (a fresh dict when omitted)
    sob       -- enable Signed-off-by parsing for the author line
    force     -- keep going when verify_heads() finds errors

    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    if authors is None:
        # avoid sharing one mutable default dict between calls
        authors = {}
    _max = int(m)

    marks_cache = load_cache(marksfile,mangle_mark)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog has no count() here; presumably an API difference
        # between Mercurial versions -- use len(repo) instead
        tip = len(repo)

    start = int(state_cache.get('tip',0))
    end = _max
    if _max < 0 or end > tip:
        end = tip

    # number of commands issued so far; must exist before the first
    # export_commit()/export_tags() call
    c = 0
    last = {}
    brmap = {}
    for rev in range(start,end):
        c = export_commit(ui,repo,rev,marks_cache,heads_cache,last,end,c,authors,sob,brmap)

    c = export_tags(ui,repo,marks_cache,start,end,c,authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = end
    state_cache['repo'] = repourl
    save_cache(tipfile,state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser,opt):
        """Report a missing option on stderr, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # -1 means "no upper bound" for hg2git
    limit = options.max
    if limit is None:
        limit = -1

    # mandatory options, checked in this order; bail() exits on the first miss
    required = (
        (options.marksfile, '--marks'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    authormap = {}
    if options.authorfile is not None:
        authormap = load_authors(options.authorfile)

    status = hg2git(options.repourl, limit, options.marksfile,
                    options.headsfile, options.statusfile,
                    authors=authormap, sob=options.sob, force=options.force)
    sys.exit(status)