Fix broken support for bare repositories
[fast-export.git] / hg-fast-export.py
blob6a184c17439ccf8bd5433feaaa99c4280a118ea5
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, where every LF written is
    # expanded to CRLF. git cannot cope with that, so switch the stdout
    # file descriptor to binary mode using the platform-specific API.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a whole "Signed-off-by: <who>" line (case of O/B is flexible);
# group 1 captures <who>.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after this many commits; 0 disables it.
cfg_checkpoint_count = 0

# Report progress on stderr every this many file contents written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate a file's flag string into a git file mode string.

    An 'l' flag wins and yields '120000'; otherwise an 'x' flag yields
    '100755'; anything else yields '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline.

    Nothing is written when msg is empty (or otherwise falsy).
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg (when non-empty) to stdout, followed by one newline.

    Calling wr() with no argument therefore emits just a blank line.
    """
    if msg:
        sys.stdout.write(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the command counter and emit a checkpoint when one is due.

    A 'checkpoint' command (plus a blank line) is written to stdout when
    cfg_checkpoint_count is positive and the incremented counter is a
    multiple of it. Returns the incremented counter.
    """
    issued = count + 1
    enabled = cfg_checkpoint_count > 0
    if enabled and issued % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    If old_marks holds a truthy entry for rev, that entry (an SHA1) is
    returned; otherwise the mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revision ids differ.

    Both ids are converted with node.hex() and the hex forms compared.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Classify files of two manifests into left-only, changed and right-only.

    dleft/dright are manifest-like mappings (they must provide keys(),
    get(), item access and flags()). Names are appended to:

    l -- present in dleft but missing from dright
    c -- present in both, but match() reports a difference or the git
         file modes derived from the flags differ
    r -- present in dright but missing from dleft

    Callers may pass existing lists to accumulate into; when omitted, fresh
    lists are created for every call (the defaults used to be shared
    mutable lists, which leaked results between calls).

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but the other side has not: left set
            l.append(name)
        elif (match(dleft[name], other)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # both sides have it but content or mode differs: center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # the other side has the file but we do not: right set
            r.append(name)
        # changed files were already handled in the first loop

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added/changed/removed files of mleft against each parent.

    For every non-negative parent revision, its manifest is compared to
    mleft with split_dict(), accumulating into the same three lists.
    Returns the three lists, each sorted. The revision argument is not
    used by this function.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a patch.
    The log message is scanned from its end: trailing blank lines are
    skipped, then the contiguous run of Signed-off-by lines ending at the
    last non-blank line is walked upwards. The topmost line of that run
    names the author, which is passed through fixup_user() together with
    the authors table.

    Only the trailing run is considered on purpose: a line in the middle
    of a message that happens to start with "Signed-off-by: foo" must not
    be mistaken for authorship information.

    Returns the committer unchanged when no such trailing line exists.
    """
    lines = logmessage.split('\n')

    # locate the last non-blank line (idx becomes -1 if there is none)
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through consecutive Signed-off-by lines, remembering the
    # topmost match
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Write an inline 'M' (file modify) command for each file in files.

    ctx supplies the file data, manifest the flags used to pick the git
    mode. '.hgtags' is skipped unless hgtags is true. When encoding is
    given, file names are decoded with it and re-encoded as UTF-8.
    Progress is reported on stderr every cfg_export_boundary files.
    """
    total = len(files)
    exported = 0
    for path in files:
        # .hgtags files only cause trouble unless explicitly requested
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        content = ctx.filectx(path).data()
        if encoding:
            git_path = path.decode(encoding).encode('utf8')
        else:
            git_path = path
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        # the length is taken from the data itself rather than size()
        wr('data %d' % len(content))
        wr(content)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    # final progress line, only for file lists larger than the boundary
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name, what="branch", mapping=None):
    """Sanitize input roughly according to git-check-ref-format(1).

    name    -- the branch or tag name to sanitize
    what    -- label used in the warning message ("branch", "tag", ...)
    mapping -- optional dict of explicit renames, looked up before the
               transform is applied (the mapped value is sanitized too)

    Returns the sanitized name; a warning is written to stderr when the
    result differs from the input.

    The name (after mapping) and each of its '/'-separated components are
    indexed directly, so an empty name or an empty path component raises
    IndexError, exactly as before.
    """

    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.

    # Use the -B and -T options to mangle branch and tag names
    # instead. If you have a source repository where this is too much
    # work to do manually, write a tool that does it for you.

    def dot(component):
        # a path component may not start with a dot
        if component[0] == '.':
            return '_' + component[1:]
        return component

    if mapping is None:
        mapping = {}

    n = mapping.get(name, name)
    # Same pattern as before, written as a raw string with the '[' inside
    # the character class escaped: replaces '[', ' ', '~', '^', ':', '?',
    # '\', '*' and the sequence '..' with '_'.
    p = re.compile(r'([\[ ~^:?\\*]|\.\.)')
    n = p.sub('_', n)
    # the name may not end in '/' or '.'
    if n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    # collapse runs of underscores into a single one
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged (a slice is compared instead of indexing, so no IndexError
    is raised for '').
    """
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding=''):
    """Write one hg revision to stdout as a git-fast-import commit.

    The commit header (optional reset, commit, mark, optional author,
    committer, message) is followed by parent references and the file
    deletions/modifications. brmap caches sanitized branch names across
    calls; max is only used for the progress message. Returns the command
    counter as updated by checkpoint().
    """
    def get_branchname(name):
        # sanitize each distinct branch name once, then reuse the result
        if name not in brmap:
            brmap[name] = sanitize_name(name, "branch", branchesmap)
        return brmap[name]

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]
    parent_count = len(parents)

    # a parentless revision other than r0 starts the branch ref afresh
    if parent_count == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline after desc; the announced length includes it
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if parent_count == 0:
        # first revision: feed in the full manifest
        added = sorted(man.keys())
        kind = 'full'
    elif parent_count == 1:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        # non-merge revision: take the changes from repo.status() instead
        # of expensively comparing checksums of both manifests
        status = repo.status(repo.lookup(parents[0]), revnode)[:3]
        changed, added, removed = status[0], status[1], status[2]
        kind = 'simple delta'
    else:
        # a merge with two parents
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        wr('merge %s' % revnum_to_revref(parents[1], old_marks))
        # comparing checksums is expensive for many files, so it is only
        # done for merges, where hg's revlog logic makes it necessary
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags, fn_encoding)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding)
    wr()

    return checkpoint(count)
def export_note(ui, repo, revision, count, authors, encoding, is_first):
    """Write a git note carrying the hg hash of revision to refs/notes/hg.

    A commit on refs/notes/hg with an empty message is emitted; it attaches
    the hex hash of the hg changeset as a note to the mark ':<revision+1>'.
    When is_first is true, the commit is explicitly based on the existing
    refs/notes/hg ref. Returns the command counter as updated by
    checkpoint().
    """
    (_, _, user, (time, timezone), _, _, _, _) = get_changeset(ui, repo, revision, authors, encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = repo.changectx(str(revision)).hex()
    # the hash is written without a trailing newline so that the announced
    # length matches exactly; the following wr() ends the line
    wr('data %d' % (len(hg_hash)))
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags are taken from repo.tagslist(). A tag is skipped when its
    sanitized name is 'tip' or when its node is not in mapping_cache
    (keyed by hex node, value is the hg revision number). Returns the
    command counter as updated by checkpoint() for each exported tag.
    """
    for tag, tagnode in repo.tagslist():
        # remap/sanitize the tag name first; the 'tip' test below is
        # therefore applied to the sanitized name
        tag = sanitize_name(tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename):
    """Load a 'key = value' mapping file into a dict.

    name     -- label for the kind of mapping, used in the summary message
    filename -- path of the mapping file

    Blank lines and lines starting with '#' are ignored. Lines that do not
    have the form 'key=value' are reported on stderr with their line
    number and skipped. Keys and values are stripped of surrounding
    whitespace; the key ends at the first '='.

    Returns the dict; it is empty when the file does not exist (a message
    is written to stderr in that case).
    """
    cache = {}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    # the with-block closes the file even if reading fails midway
    with open(filename, 'r') as f:
        # lineno counts every physical line, including skipped ones
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads whose changelog entry is not closed.

    heads is scanned from last to first; the first head whose changelog
    field 5 does not contain 'close' is returned. If every head contains
    'close', the last head is returned.
    '''
    for head in reversed(heads):
        extra = repo.changelog.read(head)[5]
        if 'close' not in extra:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force, branchesmap):
    """Check that git's branch heads still match the cached state.

    For each hg branch, the SHA1 git reports for the (sanitized) branch
    name is compared with the entry in cache; a mismatch means the branch
    was modified outside hg-fast-export. Afterwards the repository heads
    are checked for more than one head on the same branch.

    Every problem is reported on stderr. Returns False at the first
    problem unless force is set; returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    # order by descending revision number of each branch tip
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, b in ordered:
        b = get_branch(b)
        sanitized_name = sanitize_name(b, "branch", branchesmap)
        sha1 = get_git_sha1(sanitized_name)
        cached = cache.get(sanitized_name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors=None, branchesmap=None, tagsmap=None,
           sob=False, force=False, hgtags=False, notes=False, encoding='', fn_encoding=''):
    """Export revisions of the hg repository at repourl as a fast-import stream.

    m is the maximum hg revision to export (negative: up to the tip).
    marksfile, mappingfile, headsfile and tipfile name the cache files of
    a previous run; tipfile and mappingfile are rewritten at the end.
    authors, branchesmap and tagsmap are optional rename tables. sob
    enables Signed-off-by author detection, force ignores verification
    errors, hgtags exports .hgtags files, notes adds hg hashes as git
    notes. encoding/fn_encoding are passed on for commit text and file
    names respectively.

    Returns 0 on success and 1 when head verification fails.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    # fresh dicts per call instead of shared mutable defaults
    if authors is None:
        authors = {}
    if branchesmap is None:
        branchesmap = {}
    if tagsmap is None:
        tagsmap = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    # A non-empty state cache indicates an incremental run, for which the
    # other caches are expected to hold data too. The heads file is checked
    # against heads_cache (it used to be paired with state_cache, which is
    # never empty inside this branch).
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force, branchesmap):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # record hex node -> revision number for every revision below 'last'
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # number of commands issued so far; threaded through checkpoint()
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, branchesmap,
                          sob, brmap, hgtags, encoding, fn_encoding)
    if notes:
        for rev in range(first, last):
            # only the first note of an incremental run (first != 0) is
            # flagged as is_first
            c = export_note(ui, repo, rev, c, authors, encoding, rev == first and first != 0)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print the help, exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
        help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
        default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
        help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
        help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
        default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
        help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
        help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
        default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
        help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
        help="Assume file names from Mercurial are encoded in <filename_encoding>")

    (options,args)=parser.parse_args()

    # -1 means "no limit": export up to the repository tip
    max_rev = -1
    if options.max is not None:
        max_rev = options.max

    # mandatory options, checked in this order; bail() exits on the first
    # one that is missing
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    # optional rename tables default to empty dicts
    authors_map = {}
    if options.authorfile is not None:
        authors_map = load_mapping('authors', options.authorfile)

    branches_map = {}
    if options.branchesfile is not None:
        branches_map = load_mapping('branches', options.branchesfile)

    tags_map = {}
    if options.tagsfile is not None:
        tags_map = load_mapping('tags', options.tagsfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # file names use the commit text encoding unless --fe overrides it
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    sys.exit(hg2git(options.repourl, max_rev, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=authors_map, branchesmap=branches_map, tagsmap=tags_map,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding, fn_encoding=fn_encoding))