Move filter_contents to plugin system
[fast-export.git] / hg-fast-export.py
blob e53b5dd0898c0232cded30decdbdb5d90cf21c4e
1 #!/usr/bin/env python2
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from mercurial.scmutil import revsymbol
8 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
9 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
14 import pluginloader
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so each LF (\n) written to it
    # would be turned into CRLF (\r\n). That makes git blow up, so switch
    # the stream to binary mode using the platform-specific msvcrt module.
    import msvcrt
    stdout_fd = sys.stdout.fileno()
    msvcrt.setmode(stdout_fd, os.O_BINARY)
# Simple pattern for a Signed-off-by line in a log message; group 1 captures
# everything that follows the "Signed-off-by: " tag.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 means never.
cfg_checkpoint_count = 0
# Write a progress message every time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate manifest flags into the mode string used on 'M' lines.

    Returns '120000' when flags contains 'l', else '100755' when it
    contains 'x', else '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    Nothing is written when msg is empty (or otherwise falsy).
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by a single newline.

    With the default (empty) msg only the newline is written.
    """
    if msg:
        sys.stdout.write(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the command counter and emit a checkpoint when one is due.

    A 'checkpoint' command followed by an empty line is written whenever
    cfg_checkpoint_count is positive and the incremented counter is a
    multiple of it. Returns the incremented counter.
    """
    issued = count + 1
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def revnum_to_revref(rev, old_marks):
    """Map an hg revision number to a git-fast-import revision reference.

    Returns the entry stored for rev in old_marks when there is a truthy
    one, otherwise the mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when the two file revisions differ.

    The comparison is made on the node.hex() form of both arguments.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Find files that were added, changed or removed between two manifests.

    dleft and dright are manifest-like mappings (file name -> node) that
    also provide a flags(name) method. Results are appended to the lists
    l, c and r, which are also returned:

      l -- files present in dleft but missing from dright
      c -- files present in both for which match() reports a difference
           or whose git mode (derived from the flags) differs
      r -- files present in dright but missing from dleft

    Lists passed in by the caller are extended in place, so results can be
    accumulated over several calls. When a list is omitted a fresh one is
    created on every call; the former mutable default arguments were
    shared between calls and leaked results from one call into the next.
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums (or modes) mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # a change is already handled when comparing child against parent
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files relative to all parents.

    mleft is the manifest of the revision being exported. It is compared
    with the manifest of every non-negative parent revision via
    split_dict(), accumulating into three lists that are returned sorted
    as (added, changed, removed). The revision argument is not used.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent < 0:
            continue
        parent_manifest = revsymbol(repo, str(parent)).manifest()
        added, changed, removed = split_dict(mleft, parent_manifest,
                                             added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from Signed-off-by lines.

    Git distinguishes the author from the committer of a patch. Trailing
    empty lines of the log message are ignored; starting at the last
    non-empty line, the contiguous run of Signed-off-by lines is walked
    upwards and the topmost one of that run is taken. Its value is passed
    through fixup_user() together with the authors table and returned.

    Only the trailing run is considered on purpose: a line in the middle
    of a message may happen to start with "Signed-off-by: foo" and must
    not be mistaken for authorship information.

    When the last non-empty line is not a Signed-off-by line (or the
    message is empty), committer is returned unchanged.
    """
    lines = logmessage.split('\n')

    # drop the trailing empty lines
    end = len(lines)
    while end > 0 and not lines[end - 1].strip():
        end -= 1

    # climb through the trailing Signed-off-by run, remembering the topmost
    topmost = None
    for line in reversed(lines[:end]):
        found = sob_re.match(line)
        if found is None:
            break
        topmost = found

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files,hgtags,encoding='',plugins={}):
    """Write a filemodify command with inline data for each listed file.

    For every name in files the content is read from ctx.filectx(name) and
    written as an 'M <mode> inline <path>' line, a 'data <length>' line and
    the content itself. '.hgtags' is skipped unless hgtags is true. When
    encoding is set, file names are re-encoded from it to UTF-8. Every
    callable in plugins['file_data_filters'] receives a dict with the keys
    'filename', 'file_ctx' and 'data' and may modify it in place.
    Progress is reported on stderr every cfg_export_boundary files.
    """
    exported = 0
    total = len(files)
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue

        filename = path.decode(encoding).encode('utf8') if encoding else path
        file_ctx = ctx.filectx(path)
        data = file_ctx.data()

        if plugins and plugins['file_data_filters']:
            file_data = {'filename': filename, 'file_ctx': file_ctx, 'data': data}
            for data_filter in plugins['file_data_filters']:
                data_filter(file_data)
            data = file_data['data']
            filename = file_data['filename']
            file_ctx = file_data['file_ctx']

        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(filename)))
        wr('data %d' % len(data))  # had some trouble with size()
        wr(data)

        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))

    # final progress line, only for batches larger than one boundary
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name,what="branch", mapping={}):
    """Sanitize input roughly according to git-check-ref-format(1).

    name is first looked up in mapping (falling back to name itself), then:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..'
        becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse to a single '_'

    A warning naming `what` is written to stderr when the result differs
    from name. Returns the sanitized name. An empty name is returned
    unchanged; it used to raise IndexError on the trailing-character check.
    """

    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.

    # Fast-export tries to not inflict arbitrary naming policy on the
    # user, instead it aims to provide mechanisms allowing the user to
    # apply their own policy. Therefore do not add a transform which can
    # already be implemented with the -B and -T options to mangle branch
    # and tag names. If you have a source repository where this is too
    # much work to do manually, write a tool that does it for you.

    def dot(name):
        if not name: return name
        if name[0] == '.': return '_'+name[1:]
        return name

    n=mapping.get(name,name)
    # Same character set as before; written as a raw string with the '['
    # escaped so it contains no invalid string escape or nested-set syntax.
    p=re.compile(r'([\[ ~^:?\\*]|\.\.)')
    n=p.sub('_', n)
    # guard against the empty string before looking at the last character
    if n and n[-1] in ('/', '.'): n=n[:-1]+'_'
    n='/'.join(map(dot,n.split('/')))
    p=re.compile('_+')
    n=p.sub('_', n)

    if n!=name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only one leading slash is removed. An empty filename is returned
    unchanged; indexing filename[0] used to raise IndexError for it.
    """
    # slicing, unlike indexing, is safe on an empty string
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding='',
                  plugins={}):
    """Write one hg revision as a fast-import 'commit' command.

    Changeset data comes from get_changeset(). The branch name is
    sanitized through branchesmap and memoised in brmap (which is updated
    in place). Every callable in plugins['commit_message_filters'] gets a
    dict with the keys 'branch', 'parents', 'author' and 'desc' and may
    modify it in place.

    Output, in order: an optional 'reset' (parentless revision other than
    revision 0), 'commit', 'mark', optional 'author' (only when sob is
    true), 'committer', the message, 'from'/'merge' lines for the parents,
    'D' lines for removed files and the contents of added and changed
    files. max is only used in the progress message written to stderr.

    Returns the command counter as advanced by checkpoint(count).
    """
    def get_branchname(name):
        # sanitize every hg branch name only once per run
        if name in brmap:
            return brmap[name]
        n=sanitize_name(name, "branch", branchesmap)
        brmap[name]=n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch=get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]
    author = get_author(desc,user,authors)

    if plugins and plugins['commit_message_filters']:
        commit_data = {'branch': branch, 'parents': parents, 'author': author, 'desc': desc}
        for commit_filter in plugins['commit_message_filters']:
            commit_filter(commit_data)
        branch = commit_data['branch']
        parents = commit_data['parents']
        author = commit_data['author']
        desc = commit_data['desc']

    if len(parents)==0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (author,time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # the announced length is one more than len(desc); wr(desc) appends a newline
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx=revsymbol(repo,str(revision))
    man=ctx.manifest()
    added,changed,removed,kind=[],[],[],''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added=sorted(man.keys())
        kind='full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f=repo.status(parents[0],revnode)[:3]
            added,changed,removed=f[1],f[0],f[2]
            kind='simple delta'
        else: # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added,changed,removed=get_filechanges(repo,revision,parents,man)
            kind='thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    if fn_encoding:
        removed=[r.decode(fn_encoding).encode('utf8') for r in removed]

    removed=[strip_leading_slash(x) for x in removed]

    # plain loop instead of map(): the writes are side effects
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx,man,added,hgtags,fn_encoding,plugins)
    export_file_contents(ctx,man,changed,hgtags,fn_encoding,plugins)
    wr()

    return checkpoint(count)
def export_note(ui,repo,revision,count,authors,encoding,is_first):
    """Write a git note that records the hg hash of one revision.

    Emits a commit on refs/notes/hg with an empty message whose 'N inline'
    entry attaches the hex hash of the hg revision to mark :<revision+1>.
    When is_first is true, a 'from refs/notes/hg^0' line is written so the
    commit continues the existing notes ref.

    Returns the command counter as advanced by checkpoint(count).

    The statements that used to follow the return were unreachable and
    referenced an undefined name; they and the unused locals were removed.
    """
    (_,_,user,(time,timezone),_,_,_,_)=get_changeset(ui,repo,revision,authors,encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user,time,timezone))
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision+1))
    hg_hash=revsymbol(repo,str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    # the hash is written without newline; wr() below adds the terminating one
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
    """Write a 'reset refs/tags/<name>' command for every exportable tag.

    Tag names from repo.tagslist() are sanitized through tagsmap. The tag
    'tip' is ignored, and so are tags whose hex node is not present in
    mapping_cache. Returns the command counter advanced by one checkpoint
    per exported tag. The ui and authors arguments are not used.
    """
    for raw_tag, tag_node in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(raw_tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue

        hex_node = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename, mapping_is_raw):
    """Load a key=value mapping file into a dict.

    name is only used in the summary message ('Loaded <n> <name>').
    Empty lines and lines starting with '#' are skipped, which includes a
    leading '# quoted-escaped-strings' marker line. With mapping_is_raw
    true every remaining line is parsed as raw <key>=<value> (split at the
    first '=', both sides stripped); otherwise as "<key>"="<value>" with
    both sides decoded via the 'string_escape' codec. Lines that do not
    parse are reported on stderr with their line number and skipped.

    Returns the mapping; it is empty when the file does not exist.
    """
    raw_regexp=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    string_regexp='"(((\\.)|(\\")|[^"])*)"'
    quoted_regexp=re.compile('^'+string_regexp+'[ ]*=[ ]*'+string_regexp+'$')

    def parse_raw_line(line):
        m=raw_regexp.match(line)
        if m is None:
            return None
        return (m.group(1).strip(), m.group(2).strip())

    def parse_quoted_line(line):
        m=quoted_regexp.match(line)
        if m is None:
            return None
        # groups 1 and 5 are the contents of the two quoted strings
        return (m.group(1).decode('string_escape'),
                m.group(5).decode('string_escape'))

    parse_line = parse_raw_line if mapping_is_raw else parse_quoted_line

    cache={}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    loaded=0
    # the context manager closes the file even if parsing raises
    with open(filename,'r') as f:
        for lineno, line in enumerate(f, 1):
            line=line.strip()
            # Slicing instead of indexing keeps an empty first line from
            # raising IndexError. The '# quoted-escaped-strings' marker
            # starts with '#', so this test skips it as well.
            if line=='' or line[:1]=='#':
                continue
            m=parse_line(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # put key:value in cache, key without ^:
            cache[m[0]]=m[1]
            loaded+=1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads that is not marked closed.

    heads is scanned from last to first; the first head whose changelog
    entry has no 'close' key in element 5 is returned. If every head has
    one, the last head is returned.
    '''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui,repo,cache,force,branchesmap):
    """Check that the branch heads are in the expected state.

    For the tip of every hg branch, the SHA1 reported by get_git_sha1()
    for the sanitized branch name must equal the value stored in cache.
    In addition no branch may occur twice among repo.heads(). Each
    violation is reported on stderr. Returns False at the first violation
    unless force is true; returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, hg_branch in ordered:
        b = get_branch(hg_branch)
        sanitized_name = sanitize_name(b, "branch", branchesmap)
        sha1 = get_git_sha1(sanitized_name)
        cached = cache.get(sanitized_name)
        if sha1 == cached:
            continue
        sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                         '\n%s (repo) != %s (cache)\n' % (b, sha1, cached))
        if not force:
            return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors={},branchesmap={},tagsmap={},
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding='',
           plugins={}):
    """Export the hg repository at repourl as a fast-import stream.

    m is the maximum revision to export; a negative value, or one beyond
    the repository, means everything up to the tip. The marks, mapping,
    heads and status (tipfile) caches of a previous run are loaded, the
    heads are verified, and all revisions from the cached 'tip' up to the
    maximum are exported as commits (and as notes when notes is true).
    The status and mapping caches are then saved and the tags exported.

    Returns 1 when head verification fails, 0 otherwise.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    _max=int(m)

    old_marks=load_cache(marksfile,lambda s: int(s)-1)
    mapping_cache=load_cache(mappingfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    # A non-empty status cache means this is an incremental run, so the
    # other caches are expected to hold data as well. The heads file is
    # checked against heads_cache; pairing it with state_cache, which is
    # known to be non-empty here, could never produce a warning.
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force,branchesmap):
        return 1

    try:
        tip=repo.changelog.count()
    except AttributeError:
        # changelog has no count(); fall back to the repository length
        tip=len(repo)

    first=int(state_cache.get('tip',0))
    last=_max
    if _max<0 or last>tip:
        last=tip

    for rev in range(0,last):
        (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # number of commands issued so far; threaded through every export call
    c=0
    brmap={}
    for rev in range(first,last):
        c=export_commit(ui,repo,rev,old_marks,last,c,authors,branchesmap,
                        sob,brmap,hgtags,encoding,fn_encoding,
                        plugins)
    if notes:
        for rev in range(first,last):
            c=export_note(ui,repo,rev,c,authors, encoding, rev == first and first != 0)

    state_cache['tip']=last
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)
    save_cache(mappingfile,mapping_cache)

    c=export_tags(ui,repo,old_marks,mapping_cache,c,authors,tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing required option, print the help text and exit with status 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
                      default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
                      default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
                      help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
                      help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
                      default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
                      default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
                      help="Assume file names from Mercurial are encoded in <filename_encoding>")
    # NOTE(review): no action is given, so optparse stores the value that
    # follows this option rather than treating it as a flag -- confirm
    # this is what callers expect.
    parser.add_option("--mappings-are-raw",dest="raw_mappings", default=False,
                      help="Assume mappings are raw <key>=<value> lines")
    parser.add_option("--filter-contents",dest="filter_contents",
                      help="Pipe contents of each exported file through FILTER_CONTENTS <file-path> <hg-hash> <is-binary>")
    parser.add_option("--plugin-path", type="string", dest="pluginpath",
                      help="Additional search path for plugins ")
    parser.add_option("--plugin", action="append", type="string", dest="plugins",
                      help="Add a plugin with the given init string <name=init>")

    (options,args)=parser.parse_args()

    # -1 means "no upper limit" for hg2git()
    m = options.max if options.max is not None else -1

    # required options, checked in this order; bail() does not return
    for value, flag in ((options.marksfile, '--marks'),
                        (options.mappingfile, '--mapping'),
                        (options.headsfile, '--heads'),
                        (options.statusfile, '--status'),
                        (options.repourl, '--repo')):
        if value is None:
            bail(parser, flag)

    # optional author / branch / tag mappings
    a={}
    if options.authorfile is not None:
        a=load_mapping('authors', options.authorfile, options.raw_mappings)

    b={}
    if options.branchesfile is not None:
        b=load_mapping('branches', options.branchesfile, options.raw_mappings)

    t={}
    if options.tagsfile is not None:
        t=load_mapping('tags', options.tagsfile, True)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = options.encoding if options.encoding is not None else ''
    # file names use the commit encoding unless --fe overrides it
    fn_encoding = options.fn_encoding if options.fn_encoding is not None else encoding

    plugins = list(options.plugins) if options.plugins is not None else []
    # --filter-contents is served by the shell_filter_file_contents plugin
    if options.filter_contents is not None:
        plugins.append('shell_filter_file_contents='+options.filter_contents)

    plugins_dict = {
        'commit_message_filters': [],
        'file_data_filters': [],
    }

    if plugins and options.pluginpath:
        sys.stderr.write('Using additional plugin path: ' + options.pluginpath + '\n')

    for spec in plugins:
        # "<name>=<init string>"; the init string may itself contain '='
        name, sep, opts = spec.partition('=')
        info = pluginloader.get_plugin(name,options.pluginpath)
        sys.stderr.write('Loaded plugin ' + info['name'] + ' from path: ' + info['path'] +' with opts: ' + opts + '\n')
        built = pluginloader.load_plugin(info).build_filter(opts)
        if hasattr(built,'file_data_filter') and callable(built.file_data_filter):
            plugins_dict['file_data_filters'].append(built.file_data_filter)
        if hasattr(built, 'commit_message_filter') and callable(built.commit_message_filter):
            plugins_dict['commit_message_filters'].append(built.commit_message_filter)

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a,branchesmap=b,tagsmap=t,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding,fn_encoding=fn_encoding,
                    plugins=plugins_dict))