Add dos2unix plugin
[fast-export.git] / hg-fast-export.py
blob253055df3fd13de43ba20852aa59d26ead17099d
1 #!/usr/bin/env python2
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from mercurial.scmutil import revsymbol
8 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
9 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
14 import pluginloader
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so every LF (\n) written to it
    # would be translated into CRLF (\r\n). That makes git blow up, so put
    # the stdout file descriptor into binary mode with the platform-specific
    # msvcrt module.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Simple pattern recognising a "Signed-off-by: <who>" line of a log message;
# group 1 captures everything after the colon and the following space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after this many commits; 0 disables checkpoints.
cfg_checkpoint_count = 0

# Print a progress message each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate Mercurial file flags into a git file mode string.

    An 'l' flag wins over an 'x' flag; without either the regular-file
    mode is returned.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    An empty (falsy) msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by a single newline.

    With the default empty msg only the newline is written.
    """
    wr_no_nl(msg)
    # A lone newline is truthy, so wr_no_nl always writes it.
    wr_no_nl('\n')
def checkpoint(count):
    """Count one more issued command and emit a checkpoint when one is due.

    A 'checkpoint' command (plus a blank line) is written whenever
    cfg_checkpoint_count is positive and the new count is a multiple of it.
    Returns the incremented count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    Returns the entry stored for rev in old_marks when there is a truthy
    one (an SHA1), otherwise the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions differ.

    Both arguments are passed through node.hex() and the resulting hex
    strings are compared.
    """
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft and dright are manifest-like objects offering keys(), get(),
    item access and flags(name).

    l, c and r are the accumulator lists for files only present on the
    left, files present on both sides but differing, and files only
    present on the right. Lists passed in are extended in place, so a
    caller can accumulate over several comparisons. When omitted, a fresh
    list is created for each call; the previous mutable defaults were
    shared between calls and kept growing.

    match(left_entry, right_entry) must return True when the two entries
    differ. A file also counts as changed when its git mode differs.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums (or modes) mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # a change is already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision itself. It is compared against
    the manifest of every non-negative parent revision, and the findings
    are accumulated over all parents. Returns three sorted lists:
    (added, changed, removed). The revision argument itself is not used.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = revsymbol(repo, str(parent)).manifest()
            added, changed, removed = split_dict(mleft, parent_manifest,
                                                 added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Extract the author of a patch from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    try to find the author by looking at Signed-off-by lines.

    Trailing empty lines of the log message are ignored. Starting at the
    last non-empty line, the run of consecutive Signed-off-by lines is
    walked upwards and the topmost one of that run is used; its name is
    cleaned up via fixup_user() with the authors table.

    If the last non-empty line is not a Signed-off-by line (or the message
    is blank), the committer is returned.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle, and that must not be picked up.
    """
    lines = logmessage.split('\n')

    # Find the end of the message, ignoring trailing blank lines.
    end = len(lines)
    while end > 0 and not lines[end - 1].strip():
        end -= 1

    # Walk the trailing Signed-off-by run upwards, remembering the topmost.
    topmost = None
    for line in reversed(lines[:end]):
        found = sob_re.match(line)
        if found is None:
            break
        topmost = found

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files,hgtags,encoding='',filter_contents=None,plugins={}):
    """Write an inline 'M' (file modify) command for every file in files.

    For each file the content is read from ctx, optionally piped through
    the external filter_contents command and through the plugin callables
    in plugins['file_data_filters'], and then written to stdout as a mode
    line, a 'data <length>' line and the content itself.

    '.hgtags' is skipped unless hgtags is true. When encoding is given,
    file names are re-encoded from that encoding to UTF-8. Progress is
    reported on stderr every cfg_export_boundary files.

    Raises subprocess.CalledProcessError when the content filter exits
    with a non-zero status.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue

        filename = path.decode(encoding).encode('utf8') if encoding else path
        file_ctx = ctx.filectx(path)
        data = file_ctx.data()

        if filter_contents:
            import subprocess
            # The filter gets the file name, the hg file node hash and a
            # binary flag as extra arguments, and the content on stdin.
            hex_node = node.hex(file_ctx.filenode())
            binary_flag = '1' if file_ctx.isbinary() else '0'
            filter_cmd = filter_contents + [filename, hex_node, binary_flag]
            try:
                filter_proc = subprocess.Popen(filter_cmd,
                                               stdin=subprocess.PIPE,
                                               stdout=subprocess.PIPE)
                data, _ = filter_proc.communicate(data)
            except:
                # Name the failing command, then let the error propagate.
                sys.stderr.write('Running filter-contents %s:\n' % filter_cmd)
                raise
            filter_ret = filter_proc.poll()
            if filter_ret:
                raise subprocess.CalledProcessError(filter_ret, filter_cmd)

        if plugins and plugins['file_data_filters']:
            # Plugins modify this dict in place; read every field back.
            file_data = {'filename': filename, 'file_ctx': file_ctx, 'data': data}
            for data_filter in plugins['file_data_filters']:
                data_filter(file_data)
            data = file_data['data']
            filename = file_data['filename']
            file_ctx = file_data['file_ctx']

        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(filename)))
        wr('data %d' % len(data))  # had some trouble with size()
        wr(data)

        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))

    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name,what="branch", mapping={}):
    """Sanitize input roughly according to git-check-ref-format(1).

    name is first looked up in mapping (falling back to name itself), then:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..'
        becomes '_';
      * a trailing '/' or '.' becomes '_';
      * a leading '.' of any '/'-separated component becomes '_';
      * runs of '_' are collapsed into a single '_'.

    what is only used in the warning written to stderr when the result
    differs from name. An empty name is returned unchanged instead of
    raising IndexError. Returns the sanitized name.
    """

    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.
    #
    # Fast-export tries to not inflict arbitrary naming policy on the
    # user, instead it aims to provide mechanisms allowing the user to
    # apply their own policy. Therefore do not add a transform which can
    # already be implemented with the -B and -T options to mangle branch
    # and tag names. If you have a source repository where this is too
    # much work to do manually, write a tool that does it for you.

    def dot(name):
        if not name: return name
        if name[0] == '.': return '_'+name[1:]
        return name

    n = mapping.get(name, name)
    # Raw string with an escaped '[': matches exactly what the previous
    # non-raw literal matched, without invalid-escape or nested-set warnings.
    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', n)
    # Guard against an empty string before looking at the last character.
    if n and n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged; indexing its first character used to raise IndexError.
    """
    # Slicing, unlike indexing, is safe on an empty string.
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding='',filter_contents=None,
                  plugins={}):
    """Write the fast-import commands for one hg revision to stdout.

    Emits the commit header (an optional 'reset', then 'commit', 'mark',
    optional 'author', 'committer' and the message), the 'from'/'merge'
    parent references, a 'D' line per removed file, and the contents of
    all added and changed files.

    brmap caches branch-name sanitization results and is updated in
    place. max is only used for the progress message on stderr. Callables
    in plugins['commit_message_filters'] receive a dict with the keys
    'branch', 'parents', 'author' and 'desc' and may modify it in place.

    Returns the command count as updated by checkpoint(count).
    """
    def get_branchname(name):
        # Sanitize each hg branch name only once and remember the result.
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, "branch", branchesmap)
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]
    author = get_author(desc, user, authors)

    if plugins and plugins['commit_message_filters']:
        commit_data = {'branch': branch, 'parents': parents, 'author': author, 'desc': desc}
        for message_filter in plugins['commit_message_filters']:
            message_filter(commit_data)
        branch = commit_data['branch']
        parents = commit_data['parents']
        author = commit_data['author']
        desc = commit_data['desc']

    # A parentless revision other than r0 gets an explicit 'reset' first.
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (author,time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # The announced length covers the newline wr() appends after desc.
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx = revsymbol(repo, str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        delta_kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            status = repo.status(parents[0], revnode)[:3]
            added, changed, removed = status[1], status[0], status[2]
            delta_kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            delta_kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,delta_kind,revision+1,max,len(added),len(changed),len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    removed = [strip_leading_slash(x) for x in removed]

    # A plain loop: map() with a lambda emits nothing where map is lazy.
    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx,man,added,hgtags,fn_encoding,filter_contents,plugins)
    export_file_contents(ctx,man,changed,hgtags,fn_encoding,filter_contents,plugins)
    wr()

    return checkpoint(count)
def export_note(ui,repo,revision,count,authors,encoding,is_first):
    """Write a git note holding the hg hash of one revision.

    Emits a commit on refs/notes/hg with an empty message whose 'N'
    command attaches the revision's hg hash (as inline data) to the mark
    ':<revision+1>'. When is_first is true the commit is based on the
    existing refs/notes/hg via a 'from' line.

    Returns the command count as updated by checkpoint(count).
    """
    # Only the committer and the timestamp of the changeset are needed.
    (_,_,user,(time,timezone),_,_,_,_)=get_changeset(ui,repo,revision,authors,encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user,time,timezone))
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision+1))
    hg_hash = revsymbol(repo, str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    # The announced length is exactly the hash, so write it without a
    # newline and terminate the command with a separate one.
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are sanitized and remapped through tagsmap. The 'tip' tag
    and tags whose node is missing from mapping_cache are skipped.
    Returns the command count, passed through checkpoint() once per
    exported tag.
    """
    for tag, tagnode in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue

        # ignore tags to nodes that are missing (ie, 'in the future')
        hexnode = tagnode.encode('hex_codec')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename, mapping_is_raw):
    """Load a key=value mapping file into a dict.

    name is only used in the summary written to stderr. When
    mapping_is_raw is true, each line is parsed as '<key>=<value>' with
    surrounding whitespace stripped from both parts. Otherwise each line
    must be '"<key>"="<value>"' and both parts are unescaped with the
    Python 2 'string_escape' codec.

    Blank lines and lines starting with '#' (including a leading
    '# quoted-escaped-strings' marker) are skipped. Lines that do not
    parse are reported on stderr and skipped.

    Returns the dict, which is empty if the file does not exist.
    """
    raw_regexp=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    string_regexp='"(((\\.)|(\\")|[^"])*)"'
    quoted_regexp=re.compile('^'+string_regexp+'[ ]*=[ ]*'+string_regexp+'$')

    def parse_raw_line(line):
        m = raw_regexp.match(line)
        if m is None:
            return None
        return (m.group(1).strip(), m.group(2).strip())

    def parse_quoted_line(line):
        m = quoted_regexp.match(line)
        if m is None:
            return None
        # Group 1 is the key's content, group 5 the value's content.
        return (m.group(1).decode('string_escape'),
                m.group(5).decode('string_escape'))

    cache = {}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    parse_line = parse_raw_line if mapping_is_raw else parse_quoted_line
    loaded = 0
    # 'with' closes the file even if parsing raises.
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            # Testing for the empty string first keeps line[0] safe on a
            # blank first line; the marker line is skipped as a comment.
            if line == '' or line[0] == '#':
                continue
            m = parse_line(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # put key:value in cache
            cache[m[0]] = m[1]
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    Walking heads from last to first, the first head whose changelog
    entry has no 'close' key in field 5 is returned. If every head has
    one, the last head is returned.
    '''
    fallback = heads[-1]
    for head in reversed(heads):
        extra = repo.changelog.read(head)[5]
        if 'close' not in extra:
            return head
    return fallback
def verify_heads(ui,repo,cache,force,branchesmap):
    """Check that the git branches still match the state cached last run.

    For the tip of every hg branch, the SHA1 git reports for the
    sanitized branch name is compared with the value stored in cache.
    Afterwards every repository head is checked so that no branch has
    more than one head. Each problem is reported on stderr.

    Returns False at the first problem unless force is true, else True.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    # Sort by negated revision number (then node and name) before checking.
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, hg_branch in ordered:
        hg_branch = get_branch(hg_branch)
        git_name = sanitize_name(hg_branch, "branch", branchesmap)
        sha1 = get_git_sha1(git_name)
        cached = cache.get(git_name)
        if sha1 == cached:
            continue
        sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                         '\n%s (repo) != %s (cache)\n' % (hg_branch,sha1,cached))
        if not force:
            return False

    # verify that branch has exactly one head
    seen_branches = set()
    for head in repo.heads():
        (_,_,_,_,_,_,branch,_) = get_changeset(ui, repo, head)
        if branch in seen_branches:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen_branches.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors={},branchesmap={},tagsmap={},
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding='',filter_contents=None,
           plugins={}):
    """Export the hg repository at repourl as a fast-import stream.

    m is the maximum revision to export; a negative value, or one beyond
    the repository tip, means "up to tip". Export starts at the 'tip'
    value stored in tipfile by the previous run (0 when absent). After
    the commits (and, when notes is true, the hg-hash notes) are written,
    the state and mapping caches are saved and the tags are exported.

    Returns 1 when verify_heads() rejects the repository state, else 0.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s)-1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    # A non-empty state cache means this is an incremental run, so the
    # other caches are expected to hold data too. headsfile is checked
    # against heads_cache; pairing it with state_cache could never warn.
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force, branchesmap):
        return 1

    # Fall back to len(repo) when the changelog has no count() method.
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first_rev = int(state_cache.get('tip', 0))
    last_rev = _max
    if _max < 0 or last_rev > tip:
        last_rev = tip

    # Record the hg node hash -> revision number mapping from r0 onwards.
    for rev in range(0, last_rev):
        (revnode,_,_,_,_,_,_,_) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # Command counter threaded through all export_* calls; it must start
    # at 0 before its first use.
    c = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui,repo,rev,old_marks,last_rev,c,authors,branchesmap,
                          sob,brmap,hgtags,encoding,fn_encoding,filter_contents,
                          plugins)
    if notes:
        for rev in range(first_rev, last_rev):
            c = export_note(ui,repo,rev,c,authors, encoding,
                            rev == first_rev and first_rev != 0)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui,repo,old_marks,mapping_cache,c,authors,tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    # Command-line entry point: parse the options, load the optional
    # mapping files and plugins, then run hg2git() and exit with its result.
    def bail(parser,opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
        help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
        default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
        help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
        help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
        default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
        help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
        help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
        default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
        help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
        help="Assume file names from Mercurial are encoded in <filename_encoding>")
    # A boolean flag: without action="store_true" optparse would treat it
    # as a value-taking option and consume the following argument.
    parser.add_option("--mappings-are-raw",action="store_true",dest="raw_mappings", default=False,
        help="Assume mappings are raw <key>=<value> lines")
    parser.add_option("--filter-contents",dest="filter_contents",
        help="Pipe contents of each exported file through FILTER_CONTENTS <file-path> <hg-hash> <is-binary>")
    parser.add_option("--plugin-path", type="string", dest="pluginpath",
        help="Additional search path for plugins ")
    parser.add_option("--plugin", action="append", type="string", dest="plugins",
        help="Add a plugin with the given init string <name=init>")

    (options,args)=parser.parse_args()

    m = -1
    if options.max is not None: m = options.max

    # These options are mandatory.
    if options.marksfile is None: bail(parser,'--marks')
    if options.mappingfile is None: bail(parser,'--mapping')
    if options.headsfile is None: bail(parser,'--heads')
    if options.statusfile is None: bail(parser,'--status')
    if options.repourl is None: bail(parser,'--repo')

    a = {}
    if options.authorfile is not None:
        a = load_mapping('authors', options.authorfile, options.raw_mappings)

    b = {}
    if options.branchesfile is not None:
        b = load_mapping('branches', options.branchesfile, options.raw_mappings)

    # The tags map is always parsed as raw <key>=<value> lines.
    t = {}
    if options.tagsfile is not None:
        t = load_mapping('tags', options.tagsfile, True)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    # File names use the commit encoding unless --fe overrides it.
    fn_encoding = encoding
    if options.fn_encoding is not None:
        fn_encoding = options.fn_encoding

    plugins = []
    if options.plugins is not None:
        plugins += options.plugins

    filter_contents = None
    if options.filter_contents is not None:
        import shlex
        filter_contents = shlex.split(options.filter_contents)

    plugins_dict = {}
    plugins_dict['commit_message_filters'] = []
    plugins_dict['file_data_filters'] = []

    if plugins and options.pluginpath:
        sys.stderr.write('Using additional plugin path: ' + options.pluginpath + '\n')

    for plugin in plugins:
        # '<name>=<init>': everything after the first '=' is the init string.
        name, _, opts = plugin.partition('=')
        i = pluginloader.get_plugin(name, options.pluginpath)
        sys.stderr.write('Loaded plugin ' + i['name'] + ' from path: ' + i['path'] +' with opts: ' + opts + '\n')
        plugin = pluginloader.load_plugin(i).build_filter(opts)
        # A plugin may provide either or both of the two filter hooks.
        if hasattr(plugin,'file_data_filter') and callable(plugin.file_data_filter):
            plugins_dict['file_data_filters'].append(plugin.file_data_filter)
        if hasattr(plugin, 'commit_message_filter') and callable(plugin.commit_message_filter):
            plugins_dict['commit_message_filters'].append(plugin.commit_message_filter)

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a,branchesmap=b,tagsmap=t,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding,fn_encoding=fn_encoding,filter_contents=filter_contents,
                    plugins=plugins_dict))