Change syntax of mapping files
[fast-export.git] / hg-fast-export.py
blob47290df834efb9b3501f168ae48c08605fb90f74
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so every LF written to it is
    # translated into CRLF. That makes git blow up, so switch the stream to
    # binary mode using the platform-specific msvcrt module.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# Deliberately simple regex that recognises Signed-off-by lines in a log message.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after this many commits; 0 disables checkpoints.
cfg_checkpoint_count = 0
# Report progress on stderr each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Return the git file mode string matching a set of file flags.

    'l' takes precedence over 'x'; without either flag the mode of a
    regular, non-executable file is returned.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout verbatim, without appending a newline.

    Empty (falsy) messages produce no output at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by exactly one newline.

    An empty message results in just the newline being written.
    """
    out = sys.stdout
    if msg:
        out.write(msg)
    out.write('\n')
def checkpoint(count):
    """Account for one more issued command and emit a checkpoint when due.

    Returns count + 1. A 'checkpoint' command (followed by a blank line) is
    written when checkpointing is enabled (cfg_checkpoint_count > 0) and the
    new count is a multiple of cfg_checkpoint_count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Translate an hg revision number into a git-fast-import reference.

    If old_marks holds a truthy entry for rev, that entry (an SHA1) is
    returned; otherwise the mark ':<rev+1>' is used.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1,f2):
    """Return True when the hex forms of the two file nodes differ."""
    hex1, hex2 = node.hex(f1), node.hex(f2)
    return hex1 != hex2
def split_dict(dleft,dright,l=None,c=None,r=None,match=file_mismatch):
    """Compare two manifests and sort their files into three lists.

    dleft is the manifest of the revision being exported, dright the
    manifest of one of its parents. Results are appended to the lists that
    are passed in (fresh lists are created when they are omitted):

      l -- files present in dleft but missing from dright
      c -- files present in both for which match() reports a difference or
           whose git file mode differs
      r -- files present in dright but missing from dleft

    Returns the tuple (l, c, r).
    """
    # Defaults are created per call: a mutable default list would be shared
    # between calls and silently accumulate entries.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right) or gitmode(dleft.flags(left)) != gitmode(dright.flags(left)):
            # we have it but checksums (or modes) mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        # changes were already handled above when comparing child against
        # parent, so only files the child lacks are of interest here
        if dleft.get(right, None) is None:
            r.append(right)

    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added, changed and removed files of a manifest versus its parents.

    mleft is compared against the manifest of every parent with a
    non-negative revision number. Returns three sorted lists
    (added, changed, removed). The revision argument is not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            mright = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, mright, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the git author of a commit from its Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    try to extract the author from the log message.

    Trailing empty lines of the message are skipped. Starting at the last
    non-empty line, the consecutive run of Signed-off-by lines is walked
    upwards and the topmost line of that run is used: its name is passed
    through fixup_user() together with the authors table.

    If the last non-empty line is not a Signed-off-by line, the committer
    is returned.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain lines in the middle that start with
    "Signed-off-by: foo" and would match the detection regex.
    """
    lines = logmessage.split('\n')

    # find the index of the last non-empty line (-1 if there is none)
    last = len(lines) - 1
    while last >= 0 and not lines[last].strip():
        last -= 1

    # walk upwards through the trailing block of Signed-off-by lines,
    # remembering the topmost match
    topmost = None
    for line in reversed(lines[:last + 1]):
        found = sob_re.match(line)
        if found is None:
            break
        topmost = found

    if topmost is not None:
        return fixup_user(topmost.group(1), authors)
    return committer
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
    """Write inline 'M' (file modify) commands for the given files.

    For every name in files the content is read from ctx and written as an
    inline data block; the file mode comes from the manifest flags. The
    '.hgtags' file is skipped unless hgtags is true. When encoding is set,
    file names are decoded with it and re-encoded as UTF-8. Progress is
    reported on stderr every cfg_export_boundary files.
    """
    total = len(files)
    written = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        data = ctx.filectx(path).data()
        git_path = path.decode(encoding).encode('utf8') if encoding else path
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(git_path)))
        # use the length of the data actually read (size() caused trouble)
        wr('data %d' % len(data))
        wr(data)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name,what="branch", mapping=None):
    """Sanitize input roughly according to git-check-ref-format(1).

    name is first translated through mapping (if it has an entry), then:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..' is
        replaced by '_'
      * a trailing '/' or '.' is replaced by '_'
      * a leading '.' of any '/'-separated component is replaced by '_'
      * runs of '_' are collapsed into a single '_'

    what is only used in the warning written to stderr when the result
    differs from name. Returns the sanitized name; an empty name is
    returned unchanged.
    """
    # NOTE: Do not update this transform to work around
    # incompatibilities on your platform. If you change it and it starts
    # modifying names which previously were not touched it will break
    # preexisting setups which are doing incremental imports.
    #
    # Use the -B and -T options to mangle branch and tag names
    # instead. If you have a source repository where this is too much
    # work to do manually, write a tool that does it for you.

    def dot(name):
        if not name: return name
        if name[0] == '.': return '_'+name[1:]
        return name

    if mapping is None:
        mapping = {}

    n = mapping.get(name, name)
    # Same character set as before, written as a raw string with '['
    # escaped so the pattern is free of invalid-escape/nested-set warnings.
    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', n)
    # Slice instead of index so an empty name does not raise IndexError.
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged (slicing avoids the IndexError that indexing would raise).
    """
    if filename[:1] == '/':
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,
                  branchesmap,sob,brmap,hgtags,encoding='',fn_encoding=''):
    """Write a git-fast-import 'commit' command for one hg revision.

    Parameters:
      ui, repo     -- passed through to get_changeset(); repo is also
                      queried for parents, manifest and status
      revision     -- hg revision number to export; its mark is revision+1
      old_marks    -- revision -> reference table used by revnum_to_revref()
      max          -- only used in the progress message on stderr
      count        -- running command counter, see return value
      authors      -- authors table handed to get_changeset()/get_author()
      branchesmap  -- branch name mapping handed to sanitize_name()
      sob          -- when true, also emit an 'author' line derived from
                      the Signed-off-by lines of the log message
      brmap        -- cache of already sanitized branch names; updated here
      hgtags       -- handed to export_file_contents()
      encoding     -- handed to get_changeset()
      fn_encoding  -- if set, file names are decoded with it and
                      re-encoded as UTF-8

    Returns the value of checkpoint(count).
    """
    def get_branchname(name):
        # brmap memoizes the sanitized name so every branch is sanitized
        # (and possibly warned about) only once per run
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, "branch", branchesmap)
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than the very first one gets an explicit
    # 'reset' of its branch ref before the commit
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1: wr() appends a newline after desc and that newline is counted as
    # part of the commit message data
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]),revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo,revision,parents,man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    if fn_encoding:
        removed = [r.decode(fn_encoding).encode('utf8') for r in removed]

    # explicit loop: map() must not be relied upon for side effects
    for path in removed:
        wr('D %s' % strip_leading_slash(path))

    export_file_contents(ctx,man,added,hgtags,fn_encoding)
    export_file_contents(ctx,man,changed,hgtags,fn_encoding)
    wr()

    return checkpoint(count)
def export_note(ui,repo,revision,count,authors,encoding,is_first):
    """Write a git note on refs/notes/hg holding the hg hash of a revision.

    The note is attached to the mark of the revision (':<revision+1>') and
    its content is the hex hash of the hg changeset, written without a
    trailing newline inside the data block. When is_first is true, the
    commit is explicitly based on 'refs/notes/hg^0'.

    Returns the value of checkpoint(count).
    """
    (_,_,user,(time,timezone),_,_,_,_)=get_changeset(ui,repo,revision,authors,encoding)

    wr('commit refs/notes/hg')
    wr('committer %s %d %s' % (user,time,timezone))
    # the notes commit itself carries an empty commit message
    wr('data 0')
    if is_first:
        wr('from refs/notes/hg^0')
    wr('N inline :%d' % (revision+1))
    hg_hash = repo.changectx(str(revision)).hex()
    wr('data %d' % (len(hg_hash)))
    # no newline here: the data length announced above is exactly len(hg_hash)
    wr_no_nl(hg_hash)
    wr()
    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,tagsmap):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are sanitized (and remapped through tagsmap) first. The 'tip'
    tag is ignored, as are tags whose node is not present in mapping_cache.
    Returns the command counter after passing it through checkpoint() once
    per exported tag.
    """
    for raw_tag, tag_node in repo.tagslist():
        # Remap the tag name
        tag = sanitize_name(raw_tag, "tag", tagsmap)
        # ignore latest revision
        if tag == 'tip':
            continue
        hex_node = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_mapping(name, filename, mapping_is_raw):
    """Load a key=value mapping file into a dict.

    Parameters:
      name           -- label used in the 'Loaded N <name>' message
      filename       -- path of the mapping file
      mapping_is_raw -- true: lines are raw '<key>=<value>' pairs whose
                        parts are stripped of surrounding whitespace;
                        false: lines are '"<key>"="<value>"' pairs whose
                        parts are decoded with the 'string_escape' codec

    Empty lines and lines starting with '#' are skipped (this includes a
    '# quoted-escaped-strings' header line). Lines that do not parse are
    reported on stderr and skipped. A missing file is reported on stderr
    and yields an empty dict.
    """
    raw_regexp=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # NOTE(review): after Python string unescaping '\\.' is the regex '\.'
    # (a literal dot), not "backslash followed by any character" -- confirm
    # that this is intended before touching the pattern.
    string_regexp='"(((\\.)|(\\")|[^"])*)"'
    quoted_regexp=re.compile('^'+string_regexp+'[ ]*=[ ]*'+string_regexp+'$')

    def parse_raw_line(line):
        m=raw_regexp.match(line)
        if m is None:
            return None
        return (m.group(1).strip(), m.group(2).strip())

    def parse_quoted_line(line):
        m=quoted_regexp.match(line)
        if m is None:
            return None
        # groups 1 and 5 are the contents of the first and second quoted string
        return (m.group(1).decode('string_escape'),
                m.group(5).decode('string_escape'))

    cache={}
    if not os.path.exists(filename):
        sys.stderr.write('Could not open mapping file [%s]\n' % (filename))
        return cache

    parse_line = parse_raw_line if mapping_is_raw else parse_quoted_line
    loaded = 0
    # 'with' guarantees the file is closed even if parsing raises
    with open(filename,'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            # startswith() is safe on empty lines, unlike indexing line[0]
            if not line or line.startswith('#'):
                continue
            m = parse_line(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            cache[m[0]] = m[1]
            loaded += 1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads that is not marked as closed.

    heads is scanned from its last entry backwards; the first head whose
    changelog entry has no 'close' key in field 5 wins. If every head is
    closed, the last entry of heads is returned.
    '''
    for head in reversed(heads):
        extra = repo.changelog.read(head)[5]
        if 'close' not in extra:
            return head
    return heads[-1]
def verify_heads(ui,repo,cache,force,branchesmap):
    """Check the repository state before exporting.

    First, for the tip of every hg branch, the SHA1 git reports for the
    sanitized branch name is compared with the one stored in cache. Second,
    it is verified that no branch has more than one head. Each problem is
    reported on stderr; unless force is set the first problem makes the
    function return False. Returns True otherwise.
    """
    tips = {}
    for bn, heads in repo.branchmap().iteritems():
        tips[bn] = branchtip(repo, heads)
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sanitized_name = sanitize_name(b,"branch",branchesmap)
        sha1 = get_git_sha1(sanitized_name)
        c = cache.get(sanitized_name)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,
           authors=None,branchesmap=None,tagsmap=None,
           sob=False,force=False,hgtags=False,notes=False,encoding='',fn_encoding=''):
    """Export an hg repository as a git-fast-import stream on stdout.

    Parameters:
      repourl      -- repository to export, handed to setup_repo()
      m            -- highest revision count to export; a negative value or
                      one beyond the repository tip means "up to the tip"
      marksfile, mappingfile, headsfile, tipfile
                   -- cache files of a previous run, read via load_cache();
                      tipfile and mappingfile are rewritten at the end
      authors, branchesmap, tagsmap
                   -- name mappings (default: empty)
      sob, hgtags, encoding, fn_encoding
                   -- handed to export_commit()
      force        -- handed to verify_heads()
      notes        -- when true, also export one note per revision via
                      export_note()

    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    def check_cache(filename, contents):
        if len(contents) == 0:
            sys.stderr.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename)

    # fresh dicts per call instead of shared mutable default arguments
    if authors is None:
        authors = {}
    if branchesmap is None:
        branchesmap = {}
    if tagsmap is None:
        tagsmap = {}

    _max=int(m)

    old_marks=load_cache(marksfile,lambda s: int(s)-1)
    mapping_cache=load_cache(mappingfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    # a non-empty state cache means this is an incremental run, so the
    # other caches are expected to contain data as well
    if len(state_cache) != 0:
        for (name, data) in [(marksfile, old_marks),
                             (mappingfile, mapping_cache),
                             (headsfile, heads_cache)]:
            check_cache(name, data)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force,branchesmap):
        return 1

    try:
        tip=repo.changelog.count()
    except AttributeError:
        # repo.changelog has no count(); fall back to len(repo)
        tip=len(repo)

    first_rev=int(state_cache.get('tip',0))
    last_rev=_max
    if _max<0 or last_rev>tip:
        last_rev=tip

    for rev in range(0,last_rev):
        (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # running count of issued commands, threaded through the export helpers
    c=0
    brmap={}
    for rev in range(first_rev,last_rev):
        c=export_commit(ui,repo,rev,old_marks,last_rev,c,authors,branchesmap,
                        sob,brmap,hgtags,encoding,fn_encoding)
    if notes:
        for rev in range(first_rev,last_rev):
            c=export_note(ui,repo,rev,c,authors, encoding, rev == first_rev and first_rev != 0)

    state_cache['tip']=last_rev
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)
    save_cache(mappingfile,mapping_cache)

    c=export_tags(ui,repo,old_marks,mapping_cache,c,authors,tagsmap)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
        help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
        default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")
    parser.add_option("-B","--branches",dest="branchesfile",
        help="Read branch map from BRANCHESFILE")
    parser.add_option("-T","--tags",dest="tagsfile",
        help="Read tags map from TAGSFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
        default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
        help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
        help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
        default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
        help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
    parser.add_option("--fe",dest="fn_encoding",
        help="Assume file names from Mercurial are encoded in <filename_encoding>")
    # store_true makes this a real flag; without an action optparse would
    # treat the next command line argument as the option's value
    parser.add_option("--mappings-are-raw",action="store_true",dest="raw_mappings",
        default=False,help="Assume mappings are raw <key>=<value> lines")

    (options,args)=parser.parse_args()

    # -1 means "no limit": export up to the repository tip
    m=-1
    if options.max is not None: m=options.max

    # all cache files and the repository are mandatory
    if options.marksfile is None: bail(parser,'--marks')
    if options.mappingfile is None: bail(parser,'--mapping')
    if options.headsfile is None: bail(parser,'--heads')
    if options.statusfile is None: bail(parser,'--status')
    if options.repourl is None: bail(parser,'--repo')

    a={}
    if options.authorfile is not None:
        a=load_mapping('authors', options.authorfile, options.raw_mappings)

    b={}
    if options.branchesfile is not None:
        b=load_mapping('branches', options.branchesfile, options.raw_mappings)

    t={}
    if options.tagsfile is not None:
        # NOTE(review): the tags mapping is always parsed as raw, regardless
        # of --mappings-are-raw -- confirm this is intended
        t=load_mapping('tags', options.tagsfile, True)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding=''
    if options.encoding is not None:
        encoding=options.encoding

    # file names use the commit encoding unless --fe overrides it
    fn_encoding=encoding
    if options.fn_encoding is not None:
        fn_encoding=options.fn_encoding

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a,branchesmap=b,tagsmap=t,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding,fn_encoding=fn_encoding))