Annotate commits with the hg hash as git notes in the hg namespace
[fast-export.git] / hg-fast-export.py
blob4ed42e41ac66fa27ef9383ab61260798b419d22d
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # On Windows sys.stdout starts out in text mode, so every LF (\n) written
    # to it would be turned into CRLF (\r\n). git cannot cope with that, so
    # switch stdout to binary mode with the platform-specific msvcrt module.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# Matches a complete "Signed-off-by: <who>" log line; group 1 is <who>.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command after every this many commits; 0 disables it.
cfg_checkpoint_count = 0
# Write a progress message after every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Translate a file's flags into the git mode string used in 'M' lines.

    Returns '120000' when 'l' is in flags, otherwise '100755' when 'x' is
    in flags, otherwise '100644'. The 'l' check takes precedence.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    An empty (falsy) msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg (when non-empty) to stdout, then a single newline.

    Called without an argument this emits just the newline.
    """
    wr_no_nl(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    Increments count; if cfg_checkpoint_count is positive and the new count
    is a multiple of it, a notice goes to stderr and a 'checkpoint' command
    followed by an empty line goes to stdout.

    Returns the incremented count.
    """
    issued = count + 1
    due = cfg_checkpoint_count > 0 and issued % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % issued)
        wr('checkpoint')
        wr()
    return issued
def revnum_to_revref(rev, old_marks):
    """Turn an hg revision number into a git-fast-import revision reference.

    If old_marks holds a truthy entry for rev (an SHA1 from an earlier run)
    that entry is returned; otherwise the mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions differ.

    Both arguments are passed through node.hex() and the results compared.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=None):
    """Compare two manifests and sort their files into three buckets.

    dleft  -- manifest of the revision being exported (mapping with .flags())
    dright -- manifest of one of its parents (mapping with .flags())
    l      -- list extended with files present in dleft but not in dright
    c      -- list extended with files present in both whose contents differ
              (per match) or whose git file mode differs
    r      -- list extended with files present in dright but not in dleft
    match  -- callable(left_value, right_value) returning True on mismatch;
              defaults to file_mismatch

    Lists passed in are appended to in place, so results can be accumulated
    over several parents. When a list is omitted a fresh one is created for
    this call; the defaults are None rather than [] because a literal list
    default would be shared between calls and leak earlier results.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        # resolved at call time so the default does not depend on
        # definition order within the module
        match = file_mismatch

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but the parent does not
            l.append(left)
        elif match(dleft[left], right) or gitmode(dleft.flags(left)) != gitmode(dright.flags(left)):
            # both have it, but content or mode differs
            c.append(left)
    for right in dright.keys():
        if dleft.get(right, None) is None:
            # the parent has the file but we do not
            r.append(right)
        # changed files were already handled in the first loop
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files of a revision across parents.

    mleft is the manifest of the revision itself. For every parent revision
    number in parents that is not negative, the parent's manifest is fetched
    via repo.changectx(p).manifest() and compared with split_dict(), with
    results accumulated over all parents. The revision argument is not used.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a patch, so
    this tries to find the author in the log message:

    Starting at the end of the message, blank lines are skipped. From the
    last non-blank line the message is walked upwards for as long as the
    lines match the Signed-off-by regex; the topmost line of that trailing
    run names the author, which is passed through fixup_user() together
    with the authors table.

    If the last non-blank line is not a Signed-off-by line (or the message
    is blank), the committer is returned.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo" somewhere
    in the middle, and that must not be taken as authorship information.
    """
    lines = logmessage.split('\n')

    # locate the last non-blank line; idx ends at -1 if there is none
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1
    if idx < 0:
        return committer

    # climb through the trailing run of sob lines, keeping the topmost match
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags):
    """Write inline 'M' (file modify) commands for the given files.

    ctx      -- change context; file data comes from ctx.filectx(name).data()
    manifest -- manifest whose flags(name) determine the git file mode
    files    -- list of file names to export
    hgtags   -- when false, a file named ".hgtags" is skipped

    Progress is reported on stderr every cfg_export_boundary written files,
    plus once at the end if the list is longer than cfg_export_boundary.
    """
    total = len(files)
    written = 0
    for name in files:
        # .hgtags files only cause trouble unless explicitly requested
        if not hgtags and name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        content = ctx.filectx(name).data()
        wr('M %s inline %s' % (gitmode(manifest.flags(name)), name))
        # length is taken from the data actually read
        wr('data %d' % len(content))
        wr(content)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch"):
    """Sanitize a ref name roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- noun used in the warning message ("branch", "tag", ...)

    Applied in order:
      * each of '[', ' ', '~', '^', ':', '?', '\\', '*' and each '..'
        becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    An empty name and empty path components (as in 'a//b' or '/x') are
    passed through unchanged instead of raising IndexError.

    A warning is written to stderr when the result differs from the input.
    Returns the sanitized name.
    """
    def dot(component):
        # startswith() instead of component[0]: components may be empty
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    # endswith() instead of n[-1]: the name itself may be empty
    if n.endswith(('/', '.')):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap, hgtags, notes):
    """Write one hg revision to stdout as a git-fast-import 'commit' command.

    ui, repo  -- Mercurial ui and repository objects
    revision  -- hg revision number to export; its mark is revision+1
    old_marks -- revision -> git reference mapping from earlier runs, used
                 to refer to parents
    max       -- upper bound of the export, used in the progress message only
    count     -- number of commands issued so far
    authors   -- author mapping handed to get_changeset() and get_author()
    sob       -- when true, emit an 'author' line derived from Signed-off-by
    brmap     -- cache of hg branch name -> sanitized git branch name,
                 filled in as new names are seen
    hgtags    -- passed on to export_file_contents()
    notes     -- passed on to generate_note()

    Returns the updated command count.
    """
    def get_branchname(name):
        # sanitize every hg branch name only once, remembering it in brmap
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = get_changeset(ui, repo, revision, authors)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than revision 0 gets a 'reset' first
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 accounts for the newline wr() appends after the description
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # no parent: feed in the full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # non-merge revision: with exactly one parent just take the
            # changes from repo.status() without expensively comparing
            # checksums; its first three entries are used as
            # changed, added, removed
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:
            # a merge with two parents: comparing checksums is expensive for
            # many files, so only do it here where hg's revlog logic
            # requires it
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # explicit loop: map() must not be relied upon for side effects
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count
def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Attach the hg hash of a revision as a git note in refs/notes/hg.

    Does nothing and returns count unchanged when notes is false. Otherwise
    a commit on refs/notes/hg with an empty message is written whose inline
    note for mark :<revision+1> contains ctx.hex(), and the count returned
    by checkpoint() is passed back.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        hg_hash = ctx.hex()
        wr('data %d' % (len(hg_hash)))
        # the hash is written without a newline of its own
        wr_no_nl(hg_hash)
        wr()
        count = checkpoint(count)
    return count
def export_tags(ui, repo, old_marks, mapping_cache, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags come from repo.tagslist(). The tag 'tip' is ignored, and so are
    tags whose hex node is not a key of mapping_cache; for the latter a
    message is written to stderr. mapping_cache maps the hex node to the hg
    revision number, which is turned into a git reference with
    revnum_to_revref().

    Returns the updated command count.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hex_node = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author mapping file into a dict.

    Every line of the form 'key = value' adds an entry mapping the stripped
    key to the stripped value. Blank lines and lines starting with '#' are
    ignored; any other line is reported on stderr with its line number and
    skipped.

    Returns the mapping. If filename does not exist an empty dict is
    returned and nothing is reported. Otherwise the number of accepted
    lines is written to stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    # 'with' guarantees the file is closed even if a line fails to process
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store key -> value, both without surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads that is not marked as closed.

    heads is searched from its last element backwards; a head counts as
    closed when 'close' is found in element 5 of repo.changelog.read(head).
    If every head is closed, the last element of heads is returned.
    '''
    fallback = heads[-1]
    open_heads = (h for h in reversed(heads)
                  if 'close' not in repo.changelog.read(h)[5])
    return next(open_heads, fallback)
def verify_heads(ui, repo, cache, force):
    """Check that the git side still matches what the last run recorded.

    Two checks are made:
      * for every hg branch (only those, not everything git has), the SHA1
        reported by get_git_sha1() must equal the value stored in cache
        under the branch name returned by get_branch()
      * no branch name may occur for more than one of repo.heads()

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    tips = {}
    for name, heads in repo.branchmap().iteritems():
        tips[name] = branchtip(repo, heads)
    # order branches by descending revision number of their tip
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in tips.items())

    for _, _, hg_branch in ordered:
        git_branch = get_branch(hg_branch)
        sha1 = get_git_sha1(git_branch)
        cached = cache.get(git_branch)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch, sha1, cached))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False, hgtags=False, notes=False):
    """Export an hg repository as a git-fast-import stream on stdout.

    repourl     -- location of the hg repository, handed to setup_repo()
    m           -- maximum hg revision to export; negative means up to tip
    marksfile   -- file with git-fast-import marks from the last run
    mappingfile -- file with the hex-node -> revision mapping (rewritten)
    headsfile   -- file with the git heads recorded by the last run
    tipfile     -- state file holding the last exported 'tip' (rewritten)
    authors     -- author mapping; defaults to a fresh empty dict
    sob         -- parse Signed-off-by lines to determine commit authors
    force       -- carry on despite verify_heads() errors
    hgtags      -- export .hgtags files as well
    notes       -- annotate commits with their hg hash as git notes

    Export starts at the 'tip' stored in tipfile (0 if absent) and stops
    before the smaller of m and the repository's revision count.

    Returns 0 on success, 1 if verify_heads() fails.
    """
    if authors is None:
        # a literal {} default would be shared between calls
        authors = {}

    _max = int(m)

    # marks are loaded as int(mark)-1; export_commit writes revision+1
    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # changelog without count(): fall back to len(repo)
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # (re)build the hex-node -> revision mapping for everything up to last
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # number of commands issued; must start at 0 before the first commit
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, sob, brmap, hgtags, notes)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")

    (options, args) = parser.parse_args()

    # -1 tells hg2git() to export up to the repository tip
    m = options.max if options.max is not None else -1

    # mandatory options, checked in this order; bail() exits on the first miss
    required = (
        ('--marks', options.marksfile),
        ('--mapping', options.mappingfile),
        ('--heads', options.headsfile),
        ('--status', options.statusfile),
        ('--repo', options.repourl),
    )
    for flag, value in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force,
                    hgtags=options.hgtags, notes=options.notes))