Remove obsolete and unsupported SVN tools
[fast-export.git] / hg-fast-export.py
blobbf5f5b254e59184ebb46be8380ad44ea76328c1e
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so each LF (\n) written to it
    # gets expanded to CRLF (\r\n). git chokes on that, hence the
    # platform-specific call below that flips stdout to binary mode.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# Simple pattern recognising a "Signed-off-by: <who>" line of a log message;
# group 1 captures everything after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every N commits; 0 disables checkpoints.
cfg_checkpoint_count = 0

# Report progress on stderr every N exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate a file's flags into a git file mode string.

    Returns '120000' when 'l' is present in flags, otherwise '100755'
    when 'x' is present, otherwise '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout as-is, without appending a newline.

    An empty msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by exactly one newline.

    With an empty msg only the newline is written.
    """
    if msg:
        sys.stdout.write(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' command when due.

    A checkpoint is written (and announced on stderr) whenever
    cfg_checkpoint_count is positive and the incremented counter is a
    multiple of it.

    Returns the incremented counter.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Turn an hg revision number into a git-fast-import revision reference.

    If old_marks holds a non-empty entry for rev that entry is returned;
    otherwise the result is the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1,f2):
    """Return True when the hex forms of the two file nodes differ."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft,dright,l=None,c=None,r=None,match=None):
    """Classify files of two manifests as left-only, changed or right-only.

    dleft, dright -- mapping-like manifests; they must support keys(),
                     get() and item access, and flags() for files present
                     on both sides.
    l, c, r       -- optional lists the results are appended to (files only
                     in dleft, files that differ, files only in dright).
                     A fresh list is created for each one that is omitted.
    match         -- callable(left_value, right_value) returning True when
                     the two entries differ; defaults to file_mismatch.

    Returns the tuple (l, c, r).
    """
    # The accumulators used to be mutable default arguments, which made
    # results leak from one call into the next; allocate them per call.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        match = file_mismatch

    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif (match(dleft[name], other)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # present on both sides but content or git mode differs
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # changes were already handled when walking dleft above

    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added, changed and removed files relative to all parents.

    mleft is the manifest of the revision being exported. Each non-negative
    entry of parents is looked up in repo and its manifest is compared
    against mleft via split_dict, accumulating into the same three lists.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, parent_manifest,
                                                 added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the git author of a commit from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a change.
    Starting at the end of the log message, blank lines are skipped; from
    the last non-blank line the function then walks upwards for as long as
    lines match the Signed-off-by pattern and remembers the topmost match.
    That match, if any, is passed through fixup_user together with the
    authors table and the result is returned.

    Only the trailing run of Signed-off-by lines is considered on purpose:
    a line in the middle of a message that happens to start with
    "Signed-off-by: foo" must not be mistaken for authorship information.

    When the message does not end in a Signed-off-by line the committer is
    returned unchanged.
    """
    lines = logmessage.split('\n')

    # locate the last non-blank line; idx ends up at -1 if there is none
    idx = len(lines) - 1
    while idx >= 0 and len(lines[idx].strip()) == 0:
        idx -= 1

    # climb through the consecutive sob lines, keeping the topmost match
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
    """Write an inline 'M' (modify) command plus data for each listed file.

    ctx      -- change context the file data is read from
    manifest -- manifest queried for each file's flags
    files    -- names of the files to export
    hgtags   -- when false, a file named ".hgtags" is skipped
    encoding -- if non-empty, file names are decoded from this encoding
                and re-encoded as UTF-8 before being written

    Progress is reported on stderr every cfg_export_boundary files, and
    once more at the end when the list is longer than that boundary.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and path == ".hgtags":
            sys.stderr.write('Skip %s\n' % path)
            continue
        contents = ctx.filectx(path).data()
        git_path = path
        if encoding:
            git_path = path.decode(encoding).encode('utf8')
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        # the length is taken from the data itself rather than from size()
        wr('data %d' % len(contents))
        wr(contents)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name,what="branch"):
    """Sanitize a ref name roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- noun used in the warning message ("branch" by default)

    The characters '[', ' ', '~', '^', ':', '?', '\\' and '*' as well as
    the sequence '..' are replaced by '_'; a trailing '/' or '.' becomes
    '_'; a leading '.' of any '/'-separated component becomes '_'; finally
    runs of '_' are collapsed into one. If the result differs from the
    input, a warning is written to stderr.

    Empty names and empty path components (as in "a//b") are passed
    through instead of raising IndexError.

    Returns the sanitized name.
    """
    def dot(component):
        # slicing (rather than indexing) keeps empty components safe
        if component[:1] == '.':
            return '_' + component[1:]
        return component

    # raw string: same character set as before, with '[' escaped so the
    # class is unambiguous
    forbidden = re.compile(r'([\[ ~^:?\\*]|\.\.)')
    n = forbidden.sub('_', name)
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.compile('_+').sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged instead of raising IndexError.
    """
    # slice instead of index so that '' does not blow up
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding=''):
    """Write one hg revision to stdout as a git-fast-import 'commit'.

    ui, repo    -- handed to get_changeset; repo is also queried for
                   parents, status and the change context
    revision    -- hg revision number; the commit gets mark ':<revision+1>'
    old_marks   -- mapping consulted by revnum_to_revref for parent refs
    max         -- only used in the progress message on stderr
    count       -- running command counter, passed on to checkpoint()
    authors     -- author table for get_changeset / get_author
    branchesmap -- optional renames applied to hg branch names
    sob         -- when true, emit an 'author' line derived from
                   Signed-off-by lines of the log message
    brmap       -- dict caching hg branch name -> sanitized git name;
                   updated in place
    hgtags      -- passed to export_file_contents (controls .hgtags export)
    encoding    -- passed to get_changeset
    fn_encoding -- if non-empty, file names are converted from this
                   encoding to UTF-8

    Returns the counter as updated by checkpoint().
    """
    def get_branchname(name):
        # sanitize each hg branch name only once and remember the result
        if name in brmap:
            return brmap[name]
        n = sanitize_name(branchesmap.get(name, name))
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts a fresh history
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 accounts for the newline that wr() appends after desc
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        delta_kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            delta_kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            delta_kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, delta_kind, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # explicit loop: map() must not be relied upon for side effects
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags, fn_encoding)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding)
    wr()

    return checkpoint(count)
def export_note(ui,repo,revision,count,authors,encoding,is_first):
    """Write a commit on refs/notes/hg annotating a revision with its hg hash.

    ui, repo, authors, encoding -- handed to get_changeset to obtain the
                                   committer and timestamp
    revision -- hg revision number; the note is attached to mark
                ':<revision+1>'
    count    -- running command counter, passed on to checkpoint()
    is_first -- when true, a 'from refs/notes/hg^0' line is emitted so the
                commit builds on the existing notes ref

    Returns the counter as updated by checkpoint().
    """
    (_,_,user,(time,timezone),_,_,_,_)=get_changeset(ui,repo,revision,authors,encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    # the notes commit itself carries an empty message
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = repo.changectx(str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    # the hash must be written without a newline to match the declared
    # length; the following wr() supplies the line terminator
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    repo          -- provides the tag list via tagslist()
    old_marks     -- mapping consulted by revnum_to_revref
    mapping_cache -- maps hex node ids to hg revision numbers; tags whose
                     node is absent are reported on stderr and skipped
    count         -- running command counter, passed on to checkpoint()
    tagsmap       -- optional renames applied to tag names before they are
                     sanitized

    ui and authors are accepted but not used here.

    Returns the updated counter.
    """
    for tag, tag_node in repo.tagslist():
        # apply the user's tag mapping, then clean the name up
        tag = sanitize_name(tagsmap.get(tag, tag), "tag")
        # 'tip' just names the latest revision: nothing to export
        if tag == 'tip':
            continue

        hex_node = tag_node.encode('hex_codec')
        # tags pointing at nodes we have not seen ('in the future') are skipped
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename):
    """Read a 'key = value' mapping file into a dict.

    name     -- label used in the summary message on stderr
    filename -- path of the mapping file; a missing file yields {}

    Blank lines and lines starting with '#' are ignored. Every other line
    must look like 'key = value'; keys and values are stripped of
    surrounding whitespace. Malformed lines are reported on stderr with
    their line number and skipped.

    Returns the dict of loaded entries.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0   # current line number, for error messages
    loaded = 0   # number of entries accepted
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store key -> value, both without surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # release the handle even if reading fails
        f.close()
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Pick the head that represents the tip of a branch.

    Walking heads from last to first, the first head whose changelog entry
    has no 'close' in field 5 is returned. If every head has one, the last
    element of heads is returned.
    '''
    fallback = heads[-1]
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return fallback
def verify_heads(ui,repo,cache,force):
    """Check that the git side still matches what the last run recorded.

    Two checks are performed:
      1. for every hg branch, the git SHA1 reported by get_git_sha1 must
         equal the value stored in cache;
      2. no branch name may occur for more than one repository head.

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in tips.items()])

    # only hg's branches are verified, not everything git knows about
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = get_git_sha1(b)
        recorded = cache.get(b)
        if sha1 != recorded:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, recorded))
            if not force:
                return False

    # every branch must have exactly one head
    seen = set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors={},branchesmap={},tagsmap={},
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
    """Export an hg repository to stdout as a git-fast-import stream.

    repourl     -- location of the hg repository, passed to setup_repo
    m           -- upper bound of revisions to export; a negative value or
                   one beyond the repository tip means "up to tip"
    marksfile, mappingfile, headsfile, tipfile
                -- state files of the previous run, read via load_cache;
                   tipfile and mappingfile are rewritten via save_cache
    authors, branchesmap, tagsmap
                -- optional author table and branch/tag rename maps
    sob         -- derive authors from Signed-off-by lines
    force       -- continue despite verify_heads errors
    hgtags      -- export .hgtags files as well
    notes       -- additionally write the hg hashes as notes (export_note)
    encoding    -- encoding passed on for commit metadata
    fn_encoding -- encoding passed on for file names

    Returns 1 when verify_heads fails, 0 otherwise.
    """
    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog offers no count()
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    # export the half-open range [first, last)
    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # record node -> revision number for everything up to 'last'
    for rev in range(0, last):
        (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # command counter threaded through the export_* helpers; it must be
    # initialised before the first export_commit call
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, branchesmap,
                          sob, brmap, hgtags, encoding, fn_encoding)
    if notes:
        for rev in range(first, last):
            c = export_note(ui, repo, rev, c, authors, encoding,
                            rev == first and first != 0)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Complain about a missing option, print the usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe", dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")

    (options, args) = parser.parse_args()

    # -1 means "no limit"
    limit = -1
    if options.max is not None:
        limit = options.max

    # mandatory options, verified in this order; bail() exits on the first
    # one that is missing
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    # optional mapping files default to empty tables
    author_map = {}
    if options.authorfile is not None:
        author_map = load_mapping('authors', options.authorfile)

    branch_map = {}
    if options.branchesfile is not None:
        branch_map = load_mapping('branches', options.branchesfile)

    tag_map = {}
    if options.tagsfile is not None:
        tag_map = load_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # file names use the metadata encoding unless --fe overrides it
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    sys.exit(hg2git(options.repourl, limit, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=author_map, branchesmap=branch_map, tagsmap=tag_map,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding, fn_encoding=fn_encoding))