Merge branch 'atykhyy-as-binary'
[fast-export.git] / hg-fast-export.py
blob2394b2ec0be1e78f5dbdebb2c90902cc52a10e35
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from mercurial.scmutil import revsymbol
8 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
9 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so every LF (\n) written to it
    # would be expanded to CRLF (\r\n).  git cannot cope with that, so stdout
    # is switched to binary mode before anything is written.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a complete "Signed-off-by: <identity>" log line; group 1 holds
# everything after the colon and the single space that follows it.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 disables them.
cfg_checkpoint_count = 0
# Write a progress message each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate Mercurial file flags into a git file mode string.

    An 'l' flag wins and yields the symlink mode, otherwise an 'x' flag
    yields the executable mode; everything else is a regular file.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    A falsy msg (the default empty string included) writes nothing.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg (if any) to stdout, always followed by one newline."""
    out = sys.stdout
    wr_no_nl(msg)
    out.write('\n')
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    The counter is incremented and returned.  When cfg_checkpoint_count is
    positive and the new counter is a multiple of it, a 'checkpoint'
    command followed by an empty line is written to the output stream and
    a notice goes to stderr.
    """
    issued = count + 1
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    A truthy entry for rev in old_marks is used as is; otherwise the
    reference is the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revision ids differ.

    Both ids are converted with node.hex() and the hex forms are compared.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and sort their files into three lists.

    dleft is our manifest, dright the parent's.  Both must offer keys(),
    get(), item access and flags(name).

    l     -- receives names present in dleft but missing from dright
    c     -- receives names present in both whose entries differ according
             to match(), or whose git file mode differs
    r     -- receives names present in dright but missing from dleft
    match -- callable(left_entry, right_entry) returning True on mismatch

    Names are appended to the lists passed in, so a caller can accumulate
    over several parents.  Omitted lists start out empty on every call
    (they used to be shared mutable defaults that grew across calls).

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file but the parent has not
            l.append(name)
        elif (match(dleft[name], theirs)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # both have it, but content id or file mode differs
            c.append(name)

    for name in dright.keys():
        # changed files were already handled above; only look for files
        # the parent has and we do not
        if dleft.get(name, None) is None:
            r.append(name)

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files relative to all parents.

    mleft is the manifest of the revision being exported.  It is compared
    with split_dict() against the manifest of every parent revision number
    in parents (negative numbers are skipped), accumulating into the same
    three lists.  The revision argument is not used here.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = revsymbol(repo, str(parent)).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from its Signed-off-by trailer.

    git distinguishes between the author and the committer of a patch.
    Blank lines at the end of the log message are skipped; then the
    contiguous run of Signed-off-by lines that ends the message is walked
    upwards and the topmost one of that run is taken.  Its identity is
    passed through fixup_user() together with the authors table.

    Only that trailing run is considered, so a line in the middle of the
    message that merely happens to start with "Signed-off-by: " is not
    mistaken for authorship information.

    Returns committer unchanged when the last non-empty line is not a
    Signed-off-by line (or the message has no non-empty line at all).
    """
    lines = logmessage.split('\n')

    # skip blank lines at the tail of the message
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb while lines keep matching; remember the highest match
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags, encoding='', filter_contents=None):
    """Write an inline 'M' (file modify) command for each file in files.

    ctx             -- change context providing filectx(name)
    manifest        -- manifest providing flags(name) for the file mode
    files           -- file names to export
    hgtags          -- when false, a file named ".hgtags" is skipped
    encoding        -- if set, file names are decoded with it and
                       re-encoded as UTF-8 before being written
    filter_contents -- optional command (list of arguments); each file's
                       data is piped through
                       "<command> <file name> <hg file hash> <1|0 binary>"
                       and replaced by the command's stdout

    Raises subprocess.CalledProcessError when the filter exits non-zero.
    Progress goes to stderr every cfg_export_boundary written files.
    """
    if filter_contents:
        import subprocess

    total = len(files)
    written = 0
    for path in files:
        # .hgtags files only cause trouble unless explicitly requested
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue

        filename = path.decode(encoding).encode('utf8') if encoding else path
        file_ctx = ctx.filectx(path)
        data = file_ctx.data()

        if filter_contents:
            file_hash = node.hex(file_ctx.filenode())
            binary_flag = '1' if file_ctx.isbinary() else '0'
            filter_cmd = filter_contents + [filename, file_hash, binary_flag]
            try:
                filter_proc = subprocess.Popen(filter_cmd, stdin=subprocess.PIPE,
                                               stdout=subprocess.PIPE)
                data, _ = filter_proc.communicate(data)
            except:
                # say which command failed, then let the error propagate
                sys.stderr.write('Running filter-contents %s:\n' % filter_cmd)
                raise
            filter_ret = filter_proc.poll()
            if filter_ret:
                raise subprocess.CalledProcessError(filter_ret, filter_cmd)

        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(filename)))
        # the length is taken from the data actually written
        wr('data %d' % len(data))
        wr(data)

        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))

    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch", mapping=None):
    """Sanitize input roughly according to git-check-ref-format(1).

    name    -- branch or tag name as found in the source repository
    what    -- kind of name, only used in the warning text
    mapping -- optional dict of explicit renames, applied before the
               character clean-up

    Returns the sanitized name.  A warning is written to stderr whenever
    the result differs from name (including when only the mapping changed
    it).  An empty name is returned unchanged instead of raising
    IndexError.
    """

    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.
    #
    # Fast-export tries to not inflict arbitrary naming policy on the
    # user, instead it aims to provide mechanisms allowing the user to
    # apply their own policy. Therefore do not add a transform which can
    # already be implemented with the -B and -T options to mangle branch
    # and tag names. If you have a source repository where this is too
    # much work to do manually, write a tool that does it for you.

    def dot(component):
        # a path component must not start with a dot
        if not component:
            return component
        if component[0] == '.':
            return '_' + component[1:]
        return component

    if mapping is None:
        mapping = {}

    n = mapping.get(name, name)
    # same character set as before, spelled as a raw string with the '['
    # escaped: '[', ' ', '~', '^', ':', '?', '\', '*' and the sequence '..'
    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', n)
    # slicing instead of indexing keeps an empty name from raising
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    # collapse runs of underscores
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only one leading slash is removed.  An empty filename is returned
    unchanged (indexing its first character used to raise IndexError).
    """
    # slicing, unlike indexing, is safe on an empty string
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui, repo, revision, old_marks, max, count, authors,
                  branchesmap, sob, brmap, hgtags, encoding='', fn_encoding='', filter_contents=None):
    """Write one hg revision as a fast-import 'commit' command.

    ui, repo        -- Mercurial ui and repository objects
    revision        -- hg revision number to export; its mark is revision+1
    old_marks       -- marks known from earlier runs (see revnum_to_revref)
    max             -- upper revision bound, only used in the progress line
    count           -- running command counter
    authors         -- author mapping handed to get_changeset/get_author
    branchesmap     -- explicit branch renames for sanitize_name
    sob             -- when true, also emit an 'author' line derived from
                       Signed-off-by lines in the description
    brmap           -- dict caching hg branch name -> sanitized git name;
                       updated in place
    hgtags          -- whether .hgtags files are exported
    encoding        -- encoding passed to get_changeset
    fn_encoding     -- if set, encoding of file names; they are re-encoded
                       as UTF-8
    filter_contents -- optional filter command for file contents

    Returns the counter as updated by checkpoint().
    """
    def get_branchname(name):
        # sanitize every hg branch name (and warn about it) only once
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, "branch", branchesmap)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), _, desc, branch, _) = get_changeset(
        ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 gets an explicit 'reset' of its
    # branch ref before the commit
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, hence the declared length of len+1
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = revsymbol(repo, str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in the full manifest
        added = sorted(man.keys())
        delta_kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # non-merge revision: take the changes from repo.status()
            # without expensively comparing checksums; its first three
            # fields are used as changed, added, removed (in that order)
            f = repo.status(parents[0], revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            delta_kind = 'simple delta'
        else:
            # merge with two parents: comparing checksums is expensive for
            # many files, so it is only done here, where hg's revlog logic
            # makes it necessary
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            delta_kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, delta_kind, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # an explicit loop: map() is lazy on newer Pythons and would emit nothing
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags, fn_encoding, filter_contents)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding, filter_contents)
    wr()

    return checkpoint(count)
def export_note(ui, repo, revision, count, authors, encoding, is_first):
    """Write a commit on refs/notes/hg recording the hg hash of revision.

    The note is attached to mark :<revision+1> and its content is the hex
    hash of the hg changeset.  Committer and date are taken from the
    changeset itself.

    is_first -- when true, a 'from refs/notes/hg^0' line is emitted
                (presumably so the commit continues an already existing
                notes ref -- confirm against git-fast-import)

    Returns the counter as updated by checkpoint().
    """
    (_, _, user, (time, timezone), _, _, _, _) = get_changeset(
        ui, repo, revision, authors, encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    # the notes commit itself carries an empty message
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = revsymbol(repo, str(revision)).hex()
    # exact length, so the hash is written without a trailing newline;
    # the wr() that follows adds a separating newline
    wr('data %d' % (len(hg_hash)))
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are passed through sanitize_name() with tagsmap first.  A
    tag is skipped when its (sanitized) name is 'tip' or when its node is
    not present in mapping_cache, which is keyed by hex node and holds
    revision numbers.

    Returns the command counter, advanced once per exported tag.
    """
    for tag, tag_node in repo.tagslist():
        # apply the tag renames / clean-up
        tag = sanitize_name(tag, "tag", tagsmap)
        # 'tip' only denotes the latest revision
        if tag == 'tip':
            continue

        hex_node = tag_node.encode('hex_codec')
        # tags may point at nodes that were not exported ('in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename, mapping_is_raw):
    """Read a key=value mapping file into a dict.

    name           -- what is being loaded (e.g. 'authors'); only used in
                      the summary written to stderr
    filename       -- path of the mapping file
    mapping_is_raw -- true: lines are raw '<key>=<value>' with surrounding
                      blanks stripped; false: lines are
                      '"<key>"="<value>"' and both sides are decoded with
                      the 'string_escape' codec

    Blank lines and lines starting with '#' are skipped; that includes a
    leading '# quoted-escaped-strings' marker line.  Lines that do not
    parse are reported on stderr with their line number and skipped.

    Returns the dict, which is empty when the file does not exist.
    """
    raw_regexp = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    string_regexp = '"(((\\.)|(\\")|[^"])*)"'
    quoted_regexp = re.compile('^' + string_regexp + '[ ]*=[ ]*' + string_regexp + '$')

    def parse_raw_line(line):
        m = raw_regexp.match(line)
        if m is None:
            return None
        return (m.group(1).strip(), m.group(2).strip())

    def parse_quoted_line(line):
        m = quoted_regexp.match(line)
        if m is None:
            return None
        # group 1 is the key string, group 5 the value string; groups 2-4
        # belong to the inner alternation of the first string_regexp
        return (m.group(1).decode('string_escape'),
                m.group(5).decode('string_escape'))

    cache = {}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    parse_line = parse_raw_line if mapping_is_raw else parse_quoted_line
    loaded = 0
    # 'with' closes the file even if parsing raises
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            # test for the empty line first so line[0] is never evaluated
            # on an empty string
            if line == '' or line[0] == '#':
                continue
            m = parse_line(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            cache[m[0]] = m[1]
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads that is not closed.

    Heads are examined from last to first; a head counts as closed when
    'close' is found in field 5 of its changelog entry.  If every head is
    closed, the last head is returned.
    '''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force, branchesmap):
    """Check the repository state before exporting.

    Two checks are made:
    1. For every hg branch, the git SHA1 reported by get_git_sha1() for
       the sanitized branch name must equal the one stored in cache.
    2. No branch name may be reported for more than one repository head.

    Each violation is reported on stderr.  With force false the first
    violation makes the function return False; with force true checking
    continues.  Returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    # highest revision number first
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # only verify the branches hg knows about, not everything git has
    for _, _, hg_branch in ordered:
        hg_branch = get_branch(hg_branch)
        git_name = sanitize_name(hg_branch, "branch", branchesmap)
        sha1 = get_git_sha1(git_name)
        cached = cache.get(git_name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (hg_branch, sha1, cached))
            if not force:
                return False

    # every branch must have exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors=None, branchesmap=None, tagsmap=None,
           sob=False, force=False, hgtags=False, notes=False, encoding='', fn_encoding='', filter_contents=None):
    """Export an hg repository as a git-fast-import stream on stdout.

    repourl         -- URL/path of the hg repository
    m               -- maximum hg revision to import; negative means up
                       to tip
    marksfile       -- file with git-fast-import marks from earlier runs
    mappingfile     -- file with the hg node -> revision number mapping;
                       rewritten at the end
    headsfile       -- file with the git heads of the last run
    tipfile         -- state file remembering the last exported revision
                       and the repository URL; rewritten at the end
    authors, branchesmap, tagsmap
                    -- optional mappings (dicts); omitted means empty
    sob             -- derive authors from Signed-off-by lines
    force           -- carry on despite head verification errors
    hgtags          -- export .hgtags files
    notes           -- additionally record hg hashes as git notes
    encoding        -- encoding of commit messages and author strings
    fn_encoding     -- encoding of file names
    filter_contents -- optional filter command for file contents

    Returns 0 on success and 1 when head verification fails.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    # fresh dicts instead of shared mutable defaults
    if authors is None:
        authors = {}
    if branchesmap is None:
        branchesmap = {}
    if tagsmap is None:
        tagsmap = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    # a non-empty state file means this is an incremental run, for which
    # the other caches are expected to hold data as well
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force, branchesmap):
        return 1

    # repo.changelog.count() may not exist; len(repo) is the fallback
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # (re)build the node -> revision number mapping for all revisions up
    # to the bound, not only the ones exported in this run
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # running count of issued commands; must start at zero
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, branchesmap,
                          sob, brmap, hgtags, encoding, fn_encoding, filter_contents)
    if notes:
        for rev in range(first, last):
            c = export_note(ui, repo, rev, c, authors, encoding, rev == first and first != 0)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Complain about a missing mandatory option, show usage, exit 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe", dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")
    # NOTE(review): no action= is given, so optparse treats this option as
    # one that takes a value rather than as a plain flag -- confirm whether
    # action="store_true" was intended before changing it.
    parser.add_option("--mappings-are-raw", dest="raw_mappings", default=False,
                      help="Assume mappings are raw <key>=<value> lines")
    parser.add_option("--filter-contents", dest="filter_contents",
                      help="Pipe contents of each exported file through FILTER_CONTENTS <file-path> <hg-hash> <is-binary>")

    (options, args) = parser.parse_args()

    # -1 means: no upper bound, export up to tip
    m = -1 if options.max is None else options.max

    # mandatory options, reported in this order
    for value, flag in ((options.marksfile, '--marks'),
                        (options.mappingfile, '--mapping'),
                        (options.headsfile, '--heads'),
                        (options.statusfile, '--status'),
                        (options.repourl, '--repo')):
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile, options.raw_mappings)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile, options.raw_mappings)

    # the tags map is always read in raw format
    t = {}
    if options.tagsfile is not None:
        t = load_mapping('tags', options.tagsfile, True)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = options.encoding if options.encoding is not None else ''
    # file names use the general encoding unless --fe overrides it
    fn_encoding = options.fn_encoding if options.fn_encoding is not None else encoding

    filter_contents = None
    if options.filter_contents is not None:
        import shlex
        filter_contents = shlex.split(options.filter_contents)

    exit_code = hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                       options.headsfile, options.statusfile,
                       authors=a, branchesmap=b, tagsmap=t,
                       sob=options.sob, force=options.force, hgtags=options.hgtags,
                       notes=options.notes, encoding=encoding, fn_encoding=fn_encoding,
                       filter_contents=filter_contents)
    sys.exit(exit_code)