hg export: Support tag movement
[fast-export/fast-export-unix-compliant.git] / hg-fast-export.py
blobdd9f1795347d9cc12ad9a6ba138213343bf2654a
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch
9 from tempfile import mkstemp
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
# Pattern for a "Signed-off-by: <who>" line of a log message; group 1
# captures everything after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after every this many commands; 0 disables
# checkpoints entirely.
cfg_checkpoint_count = 0

# Write a progress message to stderr every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate manifest flags into the file mode string written to git.

    Returns '120000' when 'l' is in flags, otherwise '100755' when 'x' is
    in flags, otherwise '100644'.  'l' takes precedence over 'x'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg plus a trailing newline to stdout.

    None is treated like the empty string, so wr() and wr(None) both emit
    a single empty line.
    """
    if msg is None:
        msg = ''
    # A single write keeps the line and its newline together and, unlike the
    # print statement, is valid syntax on both Python 2 and Python 3.
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    Increments count; if cfg_checkpoint_count is positive and the new count
    is a multiple of it, reports that on stderr and writes a 'checkpoint'
    command followed by an empty line.  Returns the incremented count.
    """
    issued = count + 1
    # the "> 0" test comes first so a disabled (0) setting never reaches the modulo
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def get_parent_mark(parent, marks):
    """Return the reference to use for the given parent revision.

    When marks has an entry under str(parent), that entry is returned.
    Otherwise the ':<n>' mark syntax is used, with n being parent+1.
    """
    fallback = ':%d' % (parent + 1)
    key = str(parent)
    return marks.get(key, fallback)
def file_mismatch(f1, f2):
    """See if two revisions of a file are not equal."""
    return node.hex(f1) != node.hex(f2)


def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two name->value mappings and classify their keys.

    dleft  -- our mapping (e.g. the manifest of the revision being exported)
    dright -- the mapping to compare against (e.g. a parent's manifest)
    l,c,r  -- optional lists to append to; fresh lists are created when
              omitted.  Lists that are passed in are extended in place.
    match  -- callable(left_value, right_value) returning true when the two
              values differ

    Returns (l, c, r):
      l -- keys only present in dleft
      c -- keys present in both whose values differ according to match
      r -- keys only present in dright
    """
    # Never use shared list objects as defaults: results of one call would
    # otherwise leak into every later call that relies on the defaults.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        other = dright.get(name)
        if other is None:
            # we have the file but the other side hasn't: left set
            l.append(name)
        elif match(dleft[name], other):
            # both have it but the values mismatch: center set
            c.append(name)

    for name in dright.keys():
        # changed files were already handled by the loop above
        if dleft.get(name) is None:
            # the other side has the file but we don't: right set
            r.append(name)

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files relative to all parents.

    For every parent revision number that is not negative, the parent's
    manifest is compared against mleft via split_dict() and the results are
    accumulated.  Returns three sorted lists: (added, changed, removed).
    The revision argument is not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    the author is looked up in the log message:

    Trailing empty lines are skipped.  Starting at the last non-empty line,
    the contiguous run of Signed-off-by lines is walked upwards and the
    topmost one of that run wins; its name is passed through fixup_user()
    together with the authors table.

    If the last non-empty line is not a Signed-off-by line, the committer
    is returned unchanged.  Only the trailing run is considered on purpose:
    a line in the middle of a message may accidentally start with
    "Signed-off-by: foo" and must not be mistaken for authorship.
    """
    lines = logmessage.split('\n')

    # locate the last line that is not blank
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the run of sob lines, remembering the topmost match
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command plus data for every file in files.

    ctx      -- change context the file data is read from
    manifest -- manifest queried for each file's flags (to pick the mode)
    files    -- names of the files to write

    '.hgtags' is skipped with a note on stderr.  Progress is reported on
    stderr every cfg_export_boundary files, plus a final line when the list
    is longer than that boundary.
    """
    total = len(files)
    written = 0
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if name == ".hgtags":
            sys.stderr.write('Skip %s\n' % name)
            continue
        contents = ctx.filectx(name).data()
        mode = gitmode(manifest.flags(name))
        wr('M %s inline %s' % (mode, name))
        # length is taken from the data itself (original note: size() caused trouble)
        wr('data %d' % len(contents))
        wr(contents)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def is_merge(parents):
    """Return True when more than one entry of parents is a real revision.

    Negative revision numbers stand for "no parent" and are not counted.
    """
    real_parents = [p for p in parents if p >= 0]
    return len(real_parents) > 1
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- word used in the warning message only ("branch", "tag", ...)

    Applied in this order:
      * '[', ' ', '~', '^', ':', '?', '*' and '..' become '_'
      * a trailing '/' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    A warning is written to stderr when the result differs from the input.
    Empty names and empty path components are passed through untouched
    instead of raising IndexError.
    """
    def dot(component):
        # startswith() is safe for empty components (e.g. from 'a//b')
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ~^:?*]|\.\.)', '_', name)
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui,repo,revision,marks,mapping,heads,last,max,count,authors,sob,brmap):
    """Write one hg revision to stdout as a 'commit' command.

    ui, repo -- handed to get_changeset(); repo is also queried for parent
                revisions, change contexts and status
    revision -- hg revision number to export; its mark is revision+1
    marks    -- str(revision) -> reference; updated with this revision
    mapping  -- not used by this function
    heads    -- branch -> cached head reference; a non-empty entry is
                used once for an initial 'from' and then blanked
    last     -- branch -> last revision exported on it; updated in place
    max      -- total shown in the progress message
    count    -- running command count, passed on to checkpoint()
    authors  -- author table for get_changeset()/get_author()
    sob      -- when true, also write an 'author' line derived from
                Signed-off-by lines
    brmap    -- cache of raw branch name -> sanitized name; updated

    Returns the value of checkpoint(count).
    """
    def get_branchname(name):
        # sanitize every branch name only once (and warn only once)
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    branch = get_branchname(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 accounts for the newline wr() appends after the message
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # pidx1 indexes the parent with the higher revision number, pidx2 the other
    pidx1, pidx2 = 0, 1
    if parents[0] < parents[1]:
        pidx1, pidx2 = 1, 0

    src = heads.get(branch, '')
    if src != '':
        # if we have a cached head, this is an incremental import: initialize it
        # and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch, src))
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch, tmp))
    elif (last.get(branch, revision) != parents[pidx1]
          and parents[pidx1] >= 0 and revision > 0):
        # Negative numbers mean "no parent" (see is_merge/get_filechanges);
        # revision 0 is a real parent and must not be skipped here.
        pm = get_parent_mark(parents[pidx1], marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch, revision, branch, parents[pidx1]))
        wr('from %s' % pm)

    if parents[pidx2] >= 0:
        pm = get_parent_mark(parents[pidx2], marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch, pm, parents[pidx2]))
        wr('merge %s' % pm)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # explicit loop: map() must not be relied upon for side effects
    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,marks_cache,mapping_cache,count,authors):
    """Write a 'reset refs/tags/<name>' command for every exportable hg tag.

    repo          -- queried via tagslist() for (tag, node) pairs
    marks_cache   -- str(revision) -> reference to use in 'from'
    mapping_cache -- hex node -> revision number (as string)
    count         -- running command count; advanced through checkpoint()
                     once per exported tag
    ui, authors   -- not used by this function

    The 'tip' tag and tags whose node is not in mapping_cache are skipped.
    Returns the updated count.
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        # marks are revision+1 everywhere else in this file, so the
        # fallback has to follow the same convention
        ref = marks_cache.get(str(rev), ':%d' % (rev + 1))
        if ref is None:
            # only possible when the cache itself stores None for this revision
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file into a dictionary.

    Every line is expected to look like 'key = value'; surrounding
    whitespace of both parts is stripped.  Lines that do not match are
    reported on stderr together with their line number and skipped.

    Returns the mapping; it is empty when filename does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        lineno = 0
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even when reading fails half-way
        f.close()

    # report what was actually loaded, not the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the repository's branches and heads before exporting.

    First, for every branch reported by repo.branchtags(), the SHA1 that
    get_git_sha1() returns is compared with the value stored in cache; a
    difference is reported as an error.  Second, every branch may only
    show up once among repo.heads().

    Returns False on the first error unless force is true; True otherwise.
    """
    branches = repo.branchtags()
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in branches.items()])

    # get list of hg's branches to verify, don't take all git has
    for _, _, name in ordered:
        name = get_branch(name)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 is not None and cached is not None:
            sys.stderr.write('Verifying branch [%s]\n' % name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = {}
    for head in repo.heads():
        branch = get_changeset(ui, repo, head)[6]
        if seen.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen[branch] = True

    return True
def mangle_mark(mark):
    """Turn a mark number into the cache key used for it: str(mark - 1)."""
    number = int(mark)
    return str(number - 1)
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export revisions and tags of the hg repository at repourl to stdout.

    repourl     -- repository to open via setup_repo()
    m           -- highest revision count to export; negative means "up to tip"
    marksfile   -- cache file loaded as the marks table
    mappingfile -- cache file for the hex node -> revision mapping (rewritten)
    headsfile   -- cache file with the heads of the previous run
    tipfile     -- state cache file; its 'tip' entry is where export
                   resumes (rewritten)
    authors     -- optional author table; defaults to an empty one
    sob         -- passed on to export_commit()
    force       -- passed on to verify_heads()

    Returns 1 when verify_heads() fails, 0 otherwise.
    """
    if authors is None:
        authors = {}
    limit = int(m)

    marks_cache = load_cache(marksfile, mangle_mark)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog has no count(): ask the repository for its length instead
        tip = len(repo)

    start = int(state_cache.get('tip', 0))
    stop = limit
    if limit < 0 or stop > tip:
        stop = tip

    # record the node of every revision up to the limit so that tags can be
    # resolved later, including revisions exported by earlier runs
    for rev in range(0, stop):
        revnode = get_changeset(ui, repo, rev, authors)[0]
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0  # number of commands issued so far
    last = {}
    brmap = {}
    for rev in range(start, stop):
        c = export_commit(ui, repo, rev, marks_cache, mapping_cache, heads_cache,
                          last, stop, c, authors, sob, brmap)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, marks_cache, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Complain about a missing option, show the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")

    (options, args) = parser.parse_args()

    # -1 means "no limit"
    m = -1
    if options.max is not None:
        m = options.max

    # all of these are mandatory; the first missing one aborts the run
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, sob=options.sob, force=options.force))