refactor: Make author map loading more generic
[fast-export.git] / hg-fast-export.py
blob925ecbc7a6d371b4e0ef66daf9ded2d44fd7cae9
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so each LF (\n) written to it
    # would be expanded to CRLF (\r\n). git chokes on that, hence the stream
    # is switched to binary mode with this platform-specific call.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# simple pattern recognising Signed-off-by lines in a log message;
# group 1 is everything after the "Signed-off-by: " prefix
sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# emit a 'checkpoint' command every this many commits; 0 disables it
cfg_checkpoint_count=0
# emit a progress message every this many exported file contents
cfg_export_boundary=1000
def gitmode(flags):
    """Translate a file's flags into the mode string written to the stream.

    Returns '120000' when 'l' is in flags, otherwise '100755' when 'x' is
    in flags, otherwise '100644'. 'l' takes precedence over 'x'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout as-is, without a trailing newline.

    An empty (falsy) msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by exactly one newline.

    Called without arguments it emits a bare newline.
    """
    wr_no_nl(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Account for one more issued command and return the new count.

    When cfg_checkpoint_count is positive and the new count is a multiple
    of it, a 'checkpoint' command (followed by an empty line) is written
    to the output stream and a note goes to stderr.
    """
    count+=1
    # the '>0' test comes first so the modulo is never evaluated with 0
    due=cfg_checkpoint_count>0 and count%cfg_checkpoint_count==0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    A truthy entry for rev in old_marks is used as-is; otherwise the mark
    ':<rev+1>' is returned.
    """
    known=old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev+1)
def file_mismatch(f1,f2):
    """Return True when two file revisions differ.

    Both arguments are passed through node.hex() and the results compared.
    """
    left_hex=node.hex(f1)
    right_hex=node.hex(f2)
    return left_hex!=right_hex
def split_dict(dleft,dright,l=None,c=None,r=None,match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft and dright are manifest-like objects (they must offer keys(),
    get(), item access and flags()). Results are appended to the three
    lists, which are also returned:

      l -- names present in dleft but not in dright
      c -- names present in both for which match() reports a difference
           or whose gitmode() of the flags differs
      r -- names present in dright but not in dleft

    Lists passed in by the caller are extended in place, so several calls
    can accumulate into the same lists. When a list is omitted a fresh one
    is created for this call; the defaults used to be shared list objects
    that silently accumulated entries across unrelated calls.
    """
    if l is None: l=[]
    if c is None: c=[]
    if r is None: r=[]
    for left in dleft.keys():
        right=dright.get(left,None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left],right) or gitmode(dleft.flags(left))!=gitmode(dright.flags(left)):
            # we have it but checksums (or modes) mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right,None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # a change is already handled when comparing child against parent
    return l,c,r
def get_filechanges(repo,revision,parents,mleft):
    """Collect the file differences between manifest mleft and its parents.

    Every non-negative entry of parents is looked up through
    repo.changectx(p).manifest() and compared against mleft with
    split_dict(); the results of all parents accumulate in the same three
    lists. Returns the sorted lists in the order split_dict() yields them.
    The revision argument is not used.
    """
    only_left,differing,only_right=[],[],[]
    for parent in parents:
        if parent>=0:
            parent_manifest=repo.changectx(parent).manifest()
            only_left,differing,only_right=split_dict(
                mleft,parent_manifest,only_left,differing,only_right)
    for names in (only_left,differing,only_right):
        names.sort()
    return only_left,differing,only_right
def get_author(logmessage,committer,authors):
    """Derive the author of a commit from Signed-off-by lines.

    git distinguishes between the author and the committer of a patch, so
    this tries to extract the author from the log message.

    Trailing empty lines of the message are skipped. Starting at the last
    non-empty line, the contiguous run of Signed-off-by lines is walked
    upwards and the topmost one of that run is used: its name is passed
    through fixup_user() together with the authors table and the result
    is returned.

    If the last non-empty line is not a Signed-off-by line (or the message
    is blank), the committer is returned unchanged.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle, and such a line must not be mistaken for
    authorship information.
    """
    lines=logmessage.split('\n')
    idx=len(lines)-1
    # skip blank lines at the tail of the message
    while idx>=0 and not lines[idx].strip():
        idx-=1
    # climb through the trailing Signed-off-by run, remembering the topmost
    topmost=None
    while idx>=0:
        hit=sob_re.match(lines[idx])
        if hit is None:
            break
        topmost=hit
        idx-=1
    if topmost is None:
        return committer
    return fixup_user(topmost.group(1),authors)
def export_file_contents(ctx,manifest,files,hgtags,encoding=''):
    """Write an inline 'M' (modify) command plus data for each name in files.

    ctx supplies the file data via ctx.filectx(name).data() and manifest
    supplies the flags that gitmode() turns into the mode. A file named
    ".hgtags" is skipped unless hgtags is true. When encoding is set, the
    file name is decoded with it and re-encoded as UTF-8 before being
    written. Progress is reported on stderr every cfg_export_boundary
    files, and once more at the end when the list is longer than that.
    """
    total=len(files)
    exported=0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        payload=ctx.filectx(path).data()
        out_name=path
        if encoding:
            out_name=path.decode(encoding).encode('utf8')
        mode=gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode,strip_leading_slash(out_name)))
        # the length of the data itself is used (there was trouble with size())
        wr('data %d' % len(payload))
        wr(payload)
        exported+=1
        if exported%cfg_export_boundary==0:
            sys.stderr.write('Exported %d/%d files\n' % (exported,total))
    if total>cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported,total))
def sanitize_name(name,what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    The following rewrites are applied, in this order:

      * each of the characters '[', ' ', '~', '^', ':', '?', '\\', '*'
        and each '..' sequence becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' are collapsed into a single '_'

    If the result differs from name, a warning mentioning what (a label
    such as "branch" or "tag") is written to stderr. Returns the
    sanitized name. Empty names and empty path components (as in 'a//b')
    are passed through instead of raising IndexError.
    """
    def dot(part):
        # startswith() copes with empty components, unlike part[0]
        if part.startswith('.'):
            return '_'+part[1:]
        return part

    # raw string: same pattern as before, without invalid string escapes
    n=re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    if n.endswith(('/', '.')):
        n=n[:-1]+'_'
    n='/'.join(map(dot,n.split('/')))
    n=re.sub('_+', '_', n)

    if n!=name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed. An empty filename is returned
    unchanged instead of raising IndexError.
    """
    if filename.startswith('/'):
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,notes,encoding=''):
    """Write the fast-import commands for one hg revision; return the new count.

    The commit goes to refs/heads/<branch>, where the branch name is
    sanitized once and memoised in brmap. The commit is marked
    ':<revision+1>'. With sob set, an 'author' line derived through
    get_author() precedes the 'committer' line. File changes are emitted
    as 'D' lines for removed files and inline 'M' entries for added and
    changed files. How the change lists are obtained depends on the
    number of (non-negative) parents:

      0 parents -- the full manifest is exported
      1 parent  -- repo.status() against that parent is used
      2 parents -- get_filechanges() compares the manifests

    max is only used for the progress message on stderr. count is run
    through checkpoint() and generate_note() and the result is returned.
    """
    def get_branchname(name):
        # sanitize each hg branch name only once and remember the result
        if name in brmap:
            return brmap[name]
        n=sanitize_name(name)
        brmap[name]=n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors,encoding)

    branch=get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0: reset the branch ref first
    if len(parents)==0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # +1: the newline wr() appends after desc is counted as part of the data
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed,kind=[],[],[],''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added=sorted(man.keys())
        kind='full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f=repo.status(repo.lookup(parents[0]),revnode)[:3]
            added,changed,removed=f[1],f[0],f[2]
            kind='simple delta'
        else: # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added,changed,removed=get_filechanges(repo,revision,parents,man)
            kind='thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    if encoding:
        removed=[r.decode(encoding).encode('utf8') for r in removed]

    removed=[strip_leading_slash(x) for x in removed]

    # an explicit loop: map() used only for its side effects emits nothing
    # wherever map() is lazy
    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx,man,added,hgtags,encoding)
    export_file_contents(ctx,man,changed,hgtags,encoding)
    wr()

    count=checkpoint(count)
    count=generate_note(user,time,timezone,revision,ctx,count,notes)
    return count
def generate_note(user,time,timezone,revision,ctx,count,notes):
    """Optionally attach the hg hash of a revision as a note in refs/notes/hg.

    When notes is false nothing is written and count is returned as is.
    Otherwise a commit on refs/notes/hg with an empty message is written
    whose inline note, attached to mark ':<revision+1>', contains
    ctx.hex(). The count is then passed through checkpoint() and the
    result returned.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user,time,timezone))
        wr('data 0')
        wr('N inline :%d' % (revision+1))
        hexhash=ctx.hex()
        wr('data %d' % (len(hexhash)))
        # the hash is written without a newline of its own; wr() ends the line
        wr_no_nl(hexhash)
        wr()
        return checkpoint(count)
    return count
def export_tags(ui,repo,old_marks,mapping_cache,count,authors):
    """Write a 'reset refs/tags/<tag>' for every exportable hg tag.

    Tags come from repo.tagslist() and their names are sanitized first.
    The tag 'tip' is skipped, as are tags whose hex-encoded node is not a
    key of mapping_cache (a note goes to stderr for those). For the
    remaining tags the revision number stored in mapping_cache is turned
    into a reference with revnum_to_revref(). Each written tag passes
    count through checkpoint(); the final count is returned. The ui and
    authors arguments are not used.
    """
    for tag,tagnode in repo.tagslist():
        tag=sanitize_name(tag,"tag")
        # ignore latest revision
        if tag=='tip':
            continue
        hexnode=tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev=int(mapping_cache[hexnode])

        ref=revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag,rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count=checkpoint(count)
    return count
def load_mapping(name, filename):
    """Read a 'key = value' mapping file and return it as a dict.

    name is only a label for the summary written to stderr (for example
    'authors'); filename is the path to read. A missing file yields an
    empty dict without any message. Empty lines and lines starting with
    '#' are ignored. A line that does not look like 'key=value' is
    reported on stderr with its 1-based line number and skipped. Keys and
    values are stripped of surrounding whitespace; everything after the
    first '=' belongs to the value.
    """
    cache={}
    if not os.path.exists(filename):
        return cache
    lre=re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # number of entries stored; initialised up front so '+=' has a value
    loaded=0
    # 'with' closes the file even if reading raises
    with open(filename,'r') as f:
        for lineno,line in enumerate(f,1):
            line=line.strip()
            if line=='' or line[0]=='#':
                continue
            m=lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # store the stripped key and value
            cache[m.group(1).strip()]=m.group(2).strip()
            loaded+=1
    sys.stderr.write('Loaded %d %s\n' % (loaded, name))
    return cache
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    heads is searched from its last element backwards for the first head
    whose changelog entry has no 'close' key in its element at index 5.
    If every head has that key, the last element of heads is returned.
    '''
    fallback = heads[-1]
    for head in reversed(heads):
        extra = repo.changelog.read(head)[5]
        if 'close' not in extra:
            return head
    return fallback
def verify_heads(ui,repo,cache,force):
    """Check the branch state before exporting; return True if all is well.

    Two checks are made, each reporting problems on stderr:

      * for every hg branch (tip chosen by branchtip()), the value of
        get_git_sha1() for the mapped branch name must equal the entry in
        cache
      * no branch name may occur on more than one of repo.heads()

    A failed check makes the function return False at once unless force
    is set, in which case the problem is only reported.
    """
    tips={}
    for name, heads in repo.branchmap().iteritems():
        tips[name] = branchtip(repo, heads)
    ordered=sorted((-repo.changelog.rev(n), n, name) for name, n in tips.items())

    # get list of hg's branches to verify, don't take all git has
    for _,_,hg_branch in ordered:
        git_branch=get_branch(hg_branch)
        sha1=get_git_sha1(git_branch)
        cached=cache.get(git_branch)
        if sha1!=cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch,sha1,cached))
            if not force: return False

    # verify that branch has exactly one head
    seen=set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force: return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False,notes=False,encoding=''):
    """Export the hg repository at repourl as a fast-import stream on stdout.

    m is the upper revision limit; a negative value or one beyond the
    repository's revision count means "up to the last revision". The
    marks, mapping, heads and tip files are read with load_cache().
    Export starts at the 'tip' value stored in the tip file (0 if
    absent). After the commits are written, the tip file and the mapping
    file are saved and the tags are exported. The remaining arguments
    are handed on to export_commit().

    Returns 1 when verify_heads() rejects the repository, 0 otherwise.
    """
    _max=int(m)

    old_marks=load_cache(marksfile,lambda s: int(s)-1)
    mapping_cache=load_cache(mappingfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip=repo.changelog.count()
    except AttributeError:
        tip=len(repo)

    first=int(state_cache.get('tip',0))
    last=_max
    if _max<0 or last>tip:
        last=tip

    # record the node -> revision number mapping for every revision up to last
    for rev in range(0,last):
        (revnode,_,_,_,_,_,_,_)=get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # running command count; it must exist before the first export_commit()
    c=0
    brmap={}
    for rev in range(first,last):
        c=export_commit(ui,repo,rev,old_marks,last,c,authors,sob,brmap,hgtags,notes,encoding)

    state_cache['tip']=last
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)
    save_cache(mappingfile,mapping_cache)

    c=export_tags(ui,repo,old_marks,mapping_cache,c,authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--mapping",dest="mappingfile",
        help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags",action="store_true",dest="hgtags",
        default=False,help="Enable exporting .hgtags files")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
        default=False,help="Ignore validation errors by force")
    parser.add_option("-M","--default-branch",dest="default_branch",
        help="Set the default branch")
    parser.add_option("-o","--origin",dest="origin_name",
        help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash",action="store_true",dest="notes",
        default=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e",dest="encoding",
        help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")

    (options,args)=parser.parse_args()

    # -1 means "no upper limit" to hg2git()
    m=-1
    if options.max is not None:
        m=options.max

    # mandatory options, checked in this order; bail() exits on the first miss
    required=(
        ('marksfile','--marks'),
        ('mappingfile','--mapping'),
        ('headsfile','--heads'),
        ('statusfile','--status'),
        ('repourl','--repo'),
    )
    for dest,flag in required:
        if getattr(options,dest) is None:
            bail(parser,flag)

    a={}
    if options.authorfile is not None:
        a=load_mapping('authors', options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding=''
    if options.encoding is not None:
        encoding=options.encoding

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.mappingfile,
                    options.headsfile, options.statusfile,authors=a,
                    sob=options.sob,force=options.force,hgtags=options.hgtags,
                    notes=options.notes,encoding=encoding))