Add a section about system requirements to the README
[fast-export.git] / hg-fast-export.py
blobff00de6075386e99bd93a979395c60972311d246
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means
    # that every LF (\n) written to it is converted into CRLF (\r\n). That
    # makes git blow up, so use this platform-specific code to switch the
    # stdout file descriptor to binary mode before anything is written.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a line consisting of "Signed-off-by: <who>" (the 'o' and 'b' may be
# upper or lower case) and captures everything after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command each time this many commits were written;
# 0 disables checkpoints entirely.
cfg_checkpoint_count = 0

# Report progress on stderr each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Return the git file mode string for a set of file flags.

    'l' in flags wins and yields '120000'; otherwise 'x' yields '100755';
    anything else yields '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline; skip empty messages."""
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg (if non-empty) to stdout, always followed by one newline."""
    out = sys.stdout
    if msg:
        out.write(msg)
    out.write('\n')
def checkpoint(count):
    """Count one more issued command and emit a checkpoint when one is due.

    A 'checkpoint' command followed by an empty line is written whenever
    cfg_checkpoint_count is positive and the new count is a multiple of it.
    Returns the incremented count.
    """
    count += 1
    if cfg_checkpoint_count <= 0:
        # checkpoints are disabled
        return count
    if count % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    Returns the entry stored for rev in old_marks when there is a non-empty
    one, otherwise the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when the hex forms of two file revisions differ."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft and dright are manifests (dict-like, with a flags() method).
    Names found are appended to the given lists:
      l -- files present in dleft only
      c -- files present in both whose contents (per match) or git mode differ
      r -- files present in dright only
    Lists that are not passed are created fresh on every call, so results
    never leak from one call into the next. Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right) or
              gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums or modes mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # a change is already handled when comparing child against parent
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision; it is compared against the
    manifest of every parent with a non-negative revision number. Returns
    three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, parent_manifest,
                                                 added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """As git distinguishes between author and committer of a patch, try
    to extract the author by detecting Signed-off-by lines.

    This walks from the end of the log message towards the top, skipping
    empty lines. From the first non-empty line it walks all consecutive
    Signed-off-by lines upwards to find the topmost one. For that (if
    found), it extracts authorship information the usual way (authors
    table, cleaning, etc.) via fixup_user().

    If no Signed-off-by line is found, this defaults to the committer.

    This may sound stupid (and it somehow is), but log messages may
    accidentally contain lines in the middle starting with
    "Signed-off-by: foo" which match our detection regex; only looking at
    the trailing block prevents picking those up.
    """
    lines = logmessage.split('\n')

    # position of the last non-blank line, or -1 if every line is blank
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the trailing run of sob lines, remembering the topmost
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Write an inline 'M' (file modify) command plus data for each file.

    ctx supplies the file contents and manifest the file flags. '.hgtags'
    is skipped unless hgtags is true. When encoding is non-empty, file names
    are decoded from it and re-encoded as UTF-8. Progress is reported on
    stderr every cfg_export_boundary files.
    """
    total = len(files)
    written = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        contents = ctx.filectx(path).data()
        if encoding:
            git_path = path.decode(encoding).encode('utf8')
        else:
            git_path = path
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        wr('data %d' % len(contents))  # had some trouble with size()
        wr(contents)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        # final tally for large file sets
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch", mapping=None):
    """Sanitize input roughly according to git-check-ref-format(1).

    name is first looked up in mapping (falling back to name itself), then:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..'
        becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'
    A warning naming `what` goes to stderr when the result differs from
    name. An empty name (or a mapping to the empty string) is passed through
    instead of raising IndexError. Returns the sanitized name.
    """
    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.
    #
    # Fast-export tries to not inflict arbitrary naming policy on the
    # user, instead it aims to provide mechanisms allowing the user to
    # apply their own policy. Therefore do not add a transform which can
    # already be implemented with the -B and -T options to mangle branch
    # and tag names. If you have a source repository where this is too
    # much work to do manually, write a tool that does it for you.

    def dot(name):
        if not name: return name
        if name[0] == '.': return '_' + name[1:]
        return name

    if mapping is None:
        mapping = {}
    n = mapping.get(name, name)
    # Same character set as before, spelled as a raw string with the '['
    # escaped so the pattern carries no invalid escapes or nested-set
    # ambiguity.
    p = re.compile(r'([\[ ~^:?\\*]|\.\.)')
    n = p.sub('_', n)
    # guard against the empty string before looking at the last character
    if n and n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only one slash is removed. An empty filename is returned unchanged
    (slicing instead of indexing avoids an IndexError there).
    """
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui, repo, revision, old_marks, max, count, authors,
                  branchesmap, sob, brmap, hgtags, encoding='', fn_encoding=''):
    """Write one hg revision as a git-fast-import 'commit' command.

    revision   -- hg revision number; its mark is revision+1
    old_marks  -- marks known from earlier runs (see revnum_to_revref)
    max        -- total number of revisions, only used in progress output
    count      -- number of commands issued so far
    sob        -- when true, derive the author from Signed-off-by lines
    brmap      -- cache of already sanitized branch names; updated in place
    hgtags     -- when true, .hgtags files are exported too
    encoding / fn_encoding -- encodings for commit text / file names

    Returns the updated command count from checkpoint().
    """
    def get_branchname(name):
        # sanitize every branch name only once; later lookups hit brmap
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, "branch", branchesmap)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    if len(parents) == 0 and revision != 0:
        # a parentless revision other than r0 starts its branch from scratch
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 accounts for the newline that wr(desc) appends to the message
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it
            # for merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # explicit loop: map() must not be relied upon for side effects
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags, fn_encoding)
    export_file_contents(ctx, man, changed, hgtags, fn_encoding)
    wr()

    return checkpoint(count)
def export_note(ui, repo, revision, count, authors, encoding, is_first):
    """Write a commit on refs/notes/hg attaching the hg hash to a revision.

    The note is attached to mark revision+1 and its content is the hex hash
    of the hg changeset, written without a trailing newline. When is_first
    is true a 'from refs/notes/hg^0' line is emitted first (presumably so
    the commit continues the existing notes ref -- confirm against
    git-fast-import).

    Returns the updated command count from checkpoint().
    """
    (revnode, _, user, (time, timezone), _, _, _, _) = \
        get_changeset(ui, repo, revision, authors, encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user, time, timezone))
    wr('data 0')  # empty commit message
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision + 1))
    hg_hash = repo.changectx(str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    'tip' is skipped, as are tags whose node is not in mapping_cache (keyed
    by hex node, value is the hg revision number). Returns the updated
    command count.
    """
    for tag, tagged_node in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue
        hex_node = tagged_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename, mapping_is_raw):
    """Read a key=value mapping file and return it as a dict.

    name           -- label used in the 'Loaded N <name>' message
    filename       -- file to read; a missing file yields an empty dict
    mapping_is_raw -- true: lines are raw '<key>=<value>' with surrounding
                      whitespace stripped; false: lines are
                      '"<key>"="<value>"' with string escapes decoded

    Blank lines and lines starting with '#' (including the
    '# quoted-escaped-strings' header) are skipped. Unparsable lines are
    reported on stderr and skipped.
    """
    raw_regexp = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # NOTE(review): '\\.' here is a regex-escaped literal dot, not
    # "backslash + any char"; left untouched to keep parsing unchanged.
    string_regexp = '"(((\\.)|(\\")|[^"])*)"'
    quoted_regexp = re.compile('^' + string_regexp + '[ ]*=[ ]*' + string_regexp + '$')

    def parse_raw_line(line):
        m = raw_regexp.match(line)
        if m is None:
            return None
        return (m.group(1).strip(), m.group(2).strip())

    def parse_quoted_line(line):
        m = quoted_regexp.match(line)
        if m is None:
            return None
        # groups 1 and 5 are the contents of the first and second string
        return (m.group(1).decode('string_escape'),
                m.group(5).decode('string_escape'))

    cache = {}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    lineno = 0  # current line number, for error messages
    loaded = 0  # number of mappings stored
    f = open(filename, 'r')
    try:
        for line in f.readlines():
            lineno += 1
            line = line.strip()
            # Test for emptiness before indexing so a blank line (even the
            # very first one) cannot raise IndexError. The optional
            # '# quoted-escaped-strings' header is just another comment.
            if line == '' or line[0] == '#':
                continue
            m = parse_raw_line(line) if mapping_is_raw else parse_quoted_line(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, key without ^:
            cache[m[0]] = m[1]
            loaded += 1
    finally:
        # close the file even if parsing raised
        f.close()
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Return the tipmost branch head in heads.

    Walking from the last head backwards, the first head whose changelog
    entry has no 'close' in element 5 is returned; if every head has one,
    the last head is returned.
    '''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force, branchesmap):
    """Check the hg branch heads against the cached git heads.

    First, for every hg branch the SHA1 reported by get_git_sha1() must
    equal the one stored in cache; second, no branch may appear more than
    once among repo.heads(). Every violation is reported on stderr. Returns
    False on the first violation unless force is true, True otherwise.
    """
    tips = {}
    for bn, heads in repo.branchmap().iteritems():
        tips[bn] = branchtip(repo, heads)
    # highest revision first
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sanitized_name = sanitize_name(b, "branch", branchesmap)
        sha1 = get_git_sha1(sanitized_name)
        cached = cache.get(sanitized_name)
        if sha1 == cached:
            continue
        sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                         '\n%s (repo) != %s (cache)\n' % (b, sha1, cached))
        if not force:
            return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile,
           authors={}, branchesmap={}, tagsmap={},
           sob=False, force=False, hgtags=False, notes=False, encoding='', fn_encoding=''):
    """Export the hg repository at repourl as a git-fast-import stream.

    m           -- maximum hg revision to export; negative or beyond the
                   repository size means "up to tip"
    marksfile, mappingfile, headsfile, tipfile
                -- state files of previous runs, read via load_cache();
                   tipfile and mappingfile are rewritten at the end
    authors, branchesmap, tagsmap -- name mappings
    sob / force / hgtags / notes  -- see the command line options

    Export resumes at the 'tip' value stored in tipfile (0 if absent).
    Returns 1 if head verification fails, 0 on success.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    limit = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    if len(state_cache) != 0:
        # A previous run left state behind, so the other caches should hold
        # data too. The heads file is checked against heads_cache (checking
        # state_cache here could never warn, as it is known to be non-empty).
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force, branchesmap):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog has no count(); fall back to len(repo)
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = limit
    if limit < 0 or last > tip:
        last = tip

    # (re)build the hex node -> revision number mapping for all revisions
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0  # number of commands issued, threaded through checkpoint()
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, branchesmap,
                          sob, brmap, hgtags, encoding, fn_encoding)
    if notes:
        for rev in range(first, last):
            c = export_note(ui, repo, rev, c, authors, encoding, rev == first and first != 0)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    # Command line entry point: parse the options, load the optional
    # author/branch/tag mapping files and run the export. The process exit
    # status is the return value of hg2git(), or 2 if a required option is
    # missing.
    def bail(parser, opt):
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B", "--branches", dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T", "--tags", dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe", dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")
    # A boolean flag: without action="store_true" optparse would consume the
    # following command line argument as this option's value.
    parser.add_option("--mappings-are-raw", action="store_true", dest="raw_mappings", default=False,
                      help="Assume mappings are raw <key>=<value> lines")

    (options, args) = parser.parse_args()

    m = -1
    if options.max is not None: m = options.max

    if options.marksfile is None: bail(parser, '--marks')
    if options.mappingfile is None: bail(parser, '--mapping')
    if options.headsfile is None: bail(parser, '--heads')
    if options.statusfile is None: bail(parser, '--status')
    if options.repourl is None: bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile, options.raw_mappings)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile, options.raw_mappings)

    t = {}
    if options.tagsfile is not None:
        # NOTE(review): tags are always parsed as raw mappings, unlike the
        # authors and branches files -- confirm this is intended.
        t = load_mapping('tags', options.tagsfile, True)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # file names default to the commit text encoding unless --fe is given
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, branchesmap=b, tagsmap=t,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding, fn_encoding=fn_encoding))