Maintain backwards compatibility for ui setup
[fast-export/benizi.git] / hg-fast-export.py
blobcdb838b1c4d9d9882d533fdfabe47f66ad0ef792
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from tempfile import mkstemp
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
# Matches a complete "Signed-off-by: <who>" line of a log message; group 1
# captures <who>.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after every this many commits (0 disables it).
cfg_checkpoint_count = 0

# Write a progress message to stderr after every this many file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Map a manifest flag string to the git file mode used in 'M' lines.

    The 'l' flag takes precedence over 'x'; when neither is present the
    plain-file mode is returned.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg followed by a newline to standard output.

    None is treated like the empty string, so wr(None) and wr() both emit
    a single blank line.
    """
    if msg is None:
        msg = ''
    # With a single parenthesised argument this is valid as both the
    # Python 2 print statement and the Python 3 print function, and it
    # produces the same output in either.
    print(msg)
    # Debug aid: uncomment to mirror every emitted line on stderr.
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Advance the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the
    incremented counter is a multiple of it. In that case a notice goes to
    stderr and a 'checkpoint' command plus a blank line go to stdout.

    Returns the incremented counter.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def get_parent_mark(parent, marks):
    """Return the reference to use for the given parent revision number.

    marks is keyed by the revision number as a string. When it has an entry
    for the parent, that entry is returned; otherwise the ':<n>' mark syntax
    is produced, where n is the revision number plus one.
    """
    fallback = ':%d' % (parent + 1)
    return marks.get(str(parent), fallback)
def file_mismatch(f1, f2):
    """Return True when two file revision ids differ, compared in hex form."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=None):
    """Classify the keys of two dicts into left-only, changed and right-only.

    dleft  -- our mapping (file name -> file revision id)
    dright -- the parent's mapping
    l,c,r  -- optional lists to append to; a fresh list is created for each
              one that is omitted (the former shared [] defaults leaked
              results from one call into the next)
    match  -- callable(left_value, right_value) returning True when the two
              values differ; defaults to file_mismatch

    Returns the tuple (l, c, r):
      l -- keys present in dleft but missing (or None) in dright
      c -- keys present in both whose values mismatch according to match
      r -- keys present in dright but missing (or None) in dleft
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        # Resolved at call time so the default follows the module's
        # file_mismatch.
        match = file_mismatch

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right):
            # we have it but checksums mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # a change is already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Compare a manifest against every real parent's manifest.

    repo     -- repository providing changectx(rev).manifest()
    revision -- not used by the comparison itself
    parents  -- parent revision numbers; negative entries are skipped
    mleft    -- manifest of the revision being exported

    Results from all parents are accumulated into the same three lists.
    Returns (added, changed, removed), each sorted.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch. The
    log message is scanned from its end: trailing blank lines are skipped,
    then the contiguous run of Signed-off-by lines directly above them is
    walked upwards. The topmost line of that run names the author, which is
    passed through fixup_user together with the authors table.

    Only the trailing run is considered on purpose: a line in the middle of
    a message may accidentally start with "Signed-off-by: foo" and must not
    be mistaken for authorship information.

    Returns the committer unchanged when no such line is found.
    """
    lines = logmessage.split('\n')

    # Skip blank lines at the tail of the message.
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # Walk the Signed-off-by run upwards, remembering the topmost match.
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Emit an inline 'M' entry with the file data for each listed file.

    ctx      -- change context providing filectx(name).data()
    manifest -- manifest providing flags(name) for the file mode
    files    -- names of the files to export

    '.hgtags' is skipped with a notice on stderr. A progress line is written
    to stderr every cfg_export_boundary exported files, plus a final one when
    the list is longer than cfg_export_boundary.
    """
    total = len(files)
    exported = 0
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        payload = ctx.filectx(name).data()
        mode = gitmode(manifest.flags(name))
        wr('M %s inline %s' % (mode, name))
        # The length is taken from the data itself rather than from size().
        wr('data %d' % len(payload))
        wr(payload)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def is_merge(parents):
    """Return True when more than one entry of parents is a real revision.

    Negative revision numbers denote a missing parent and are not counted.
    """
    # The counter must start at zero; without that initialisation the
    # increment below raises UnboundLocalError.
    c = 0
    for parent in parents:
        if parent >= 0:
            c += 1
    return c > 1
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the ref name to clean up
    what -- kind of name, used only in the warning text

    The following replacements are applied, in order:
      * each of '[', ' ', '~', '^', ':', '?', '*' and each '..' becomes '_'
      * a trailing '/' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    A warning is written to stderr when the result differs from the input.
    Empty names and empty path components (as in 'a//b') are passed through
    instead of raising IndexError.

    Returns the sanitized name.
    """
    def dot(component):
        # startswith() is safe on an empty component, unlike component[0]
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = name
    # Raw string, with '[' escaped inside the class, so the pattern carries
    # no ambiguous escapes or nested-set warnings.
    p = re.compile(r'([\[ ~^:?*]|\.\.)')
    n = p.sub('_', n)
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, marks, mapping, heads, last, max, count, authors, sob, brmap):
    """Write one revision to stdout as a fast-import 'commit' command.

    ui, repo -- handles passed on to get_changeset / used for repo queries
    revision -- revision number to export; its mark is revision+1
    marks    -- dict str(revision) -> reference, updated for this revision
    mapping  -- accepted for interface compatibility; not used here
    heads    -- dict branch -> cached head; a non-empty entry is used once
                as the 'from' of the first commit on that branch and then
                blanked
    last     -- dict branch -> last revision exported on that branch
    max      -- total number of revisions, used in the progress message only
    count    -- running command counter, advanced through checkpoint()
    authors  -- author table handed to get_changeset and get_author
    sob      -- when true, emit an 'author' line derived from Signed-off-by
    brmap    -- dict caching raw branch name -> sanitized branch name

    Returns the updated command counter.
    """
    def get_branchname(name):
        # Sanitize every branch name only once; brmap remembers the result.
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    branch = get_branchname(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 accounts for the newline wr() appends after the message
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # Choose which parent is used for 'from' (pidx1) and which for 'merge'
    # (pidx2); they are swapped when there is a second parent and the first
    # is missing or has the smaller node id.
    pidx1, pidx2 = 0, 1
    if parents[1] > 0:
        if parents[0] <= 0 or \
                repo.changelog.node(parents[0]) < repo.changelog.node(parents[1]):
            pidx1, pidx2 = 1, 0

    full_rev = False
    if revision == 0:
        full_rev = True

    src = heads.get(branch, '')
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch, src))
    elif branch not in heads and revision > 0:
        if parents[0] >= 0:
            # newly created branch with parent: connect to parent
            tmp = get_parent_mark(parents[0], marks)
            wr('from %s' % tmp)
            sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                             (branch, tmp))
        else:
            # newly created branch without parent: feed full revision
            full_rev = True
    elif last.get(branch, revision) != parents[pidx1] and parents[pidx1] > 0 and revision > 0:
        pm = get_parent_mark(parents[pidx1], marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch, revision, branch, parents[pidx1]))
        wr('from %s' % pm)

    if parents[pidx2] > 0:
        pm = get_parent_mark(parents[pidx2], marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch, pm, parents[pidx2]))
        wr('merge %s' % pm)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if full_rev:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        # the first three status entries are unpacked as
        # (changed, added, removed)
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # An explicit loop: map() must not be relied on for side effects.
    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, mapping_cache, count, authors):
    """Emit a 'reset refs/tags/<tag>' command for every exportable tag.

    ui, authors   -- accepted for interface compatibility; not used here
    repo          -- repository providing tagslist()
    marks_cache   -- dict str(revision) -> reference
    mapping_cache -- dict hex node id -> revision number (as a string)
    count         -- running command counter, advanced through checkpoint()

    The 'tip' tag is ignored, and so are tags whose node is not in
    mapping_cache. Returns the updated command counter.
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        # Marks are numbered revision+1 everywhere else in this file (the
        # 'mark' line of a commit and get_parent_mark), so the fallback must
        # use rev+1 as well; ':%d' % rev pointed at the previous revision.
        ref = marks_cache.get(str(rev), ':%d' % (rev + 1))
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Load an author map from a file of 'key = value' lines.

    filename -- path of the author map; a missing file yields an empty map

    Keys and values are stripped of surrounding whitespace. Lines that do
    not match the 'key = value' form are reported on stderr with their line
    number and skipped. The number of mappings loaded is reported on stderr.

    Returns a dict mapping each key to its value.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    f = open(filename, 'r')
    try:
        for index, line in enumerate(f):
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' %
                                 (filename, index + 1))
                continue
            # store key -> value, both without surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # close the file even when reading or parsing raises
        f.close()
    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def verify_heads(ui, repo, cache, force):
    """Check the cached git heads and the hg heads before exporting.

    ui, repo -- handles used to query branches, heads and changesets
    cache    -- dict branch name -> git SHA1 recorded by the previous run
    force    -- when true, problems are reported but do not abort

    Two checks are made:
      * for every hg branch, the SHA1 reported by get_git_sha1 must equal
        the cached one
      * no branch name may occur on more than one repository head

    Returns False on the first problem unless force is set, True otherwise.
    """
    # Only hg's own branches are verified, not everything git has.
    by_rev = sorted((-repo.changelog.rev(n), n, t)
                    for t, n in repo.branchtags().items())

    for _, _, name in by_rev:
        name = get_branch(name)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 is not None and cached is not None:
            sys.stderr.write('Verifying branch [%s]\n' % name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def mangle_mark(mark):
    """Return the given mark number minus one, as a string.

    Marks are written as revision+1, so this turns a mark number back into
    the revision key used by the marks cache.
    """
    revision = int(mark) - 1
    return str(revision)
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False):
    """Export a Mercurial repository to stdout as a fast-import stream.

    repourl     -- location of the hg repository, handed to setup_repo
    m           -- maximum revision to export; negative means up to the tip
    marksfile   -- marks cache file (keys converted with mangle_mark)
    mappingfile -- cache file for the hex node id -> revision mapping;
                   rewritten at the end
    headsfile   -- cache file with the git heads of the previous run
    tipfile     -- state cache file; its 'tip' entry is the first revision
                   to export and it is rewritten at the end
    authors     -- optional author table; None means an empty one
    sob         -- enable Signed-off-by based author detection
    force       -- continue despite head verification errors

    Returns 0 on success and 1 when head verification fails.
    """
    if authors is None:
        # a fresh dict per call instead of a shared mutable default
        authors = {}
    limit = int(m)

    marks_cache = load_cache(marksfile, mangle_mark)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # Fall back to len(repo) when the changelog object has no count().
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    start = int(state_cache.get('tip', 0))
    stop = limit
    if limit < 0 or stop > tip:
        stop = tip

    # The mapping is rebuilt for every revision below the limit, not only
    # for the ones exported in this run.
    for rev in range(0, stop):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # The command counter must exist even when no revision is exported,
    # because export_tags and the final summary use it as well.
    c = 0
    last = {}
    brmap = {}
    for rev in range(start, stop):
        c = export_commit(ui, repo, rev, marks_cache, mapping_cache, heads_cache,
                          last, stop, c, authors, sob, brmap)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, marks_cache, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print usage, exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # -1 means "no limit": export up to the repository tip
    if options.max is None:
        m = -1
    else:
        m = options.max

    # Mandatory options, checked in this order; the first missing one aborts.
    required = (
        ('--marks', options.marksfile),
        ('--mapping', options.mappingfile),
        ('--heads', options.headsfile),
        ('--status', options.statusfile),
        ('--repo', options.repourl),
    )
    for flag, value in required:
        if value is None:
            bail(parser, flag)

    if options.authorfile is None:
        a = {}
    else:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    status = hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, sob=options.sob, force=options.force)
    sys.exit(status)