Merge pull request #23 from ritcheyer/master
[fast-export/rorcz.git] / hg-fast-export.py
blob90d334b8fe8737b6b68a8d677c74974d0ad6ee51
#!/usr/bin/env python

# Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
# License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means that
    # when a LF (\n) character is written to sys.stdout, it will be converted
    # into CRLF (\r\n). That makes git blow up, so use this platform-specific
    # code to change the mode of sys.stdout to binary.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Regex matching a Signed-off-by line in a log message; group 1 captures
# everything after the "Signed-off-by: " prefix.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Insert a 'checkpoint' command after this many commits, or none at all if 0.
cfg_checkpoint_count = 0
# Write a progress message every this many file contents written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Return the git file mode string for a set of manifest flags.

    A flags value containing 'l' yields '120000', one containing 'x'
    yields '100755', and anything else yields '100644'.  'l' takes
    precedence when both are present.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write *msg* to stdout, always followed by a single newline.

    With no argument (or an empty message) only the newline is written.
    """
    if msg:
        sys.stdout.write(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    Increments *count* and, if cfg_checkpoint_count is positive and the new
    count is a multiple of it, logs a note to stderr and writes a
    'checkpoint' command followed by an empty line to stdout.

    Returns the incremented count.
    """
    count += 1
    if cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    If *old_marks* holds a (truthy) entry for *rev*, that entry is returned;
    otherwise the mark ':<rev+1>' is returned, matching the marks that
    export_commit assigns.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True if two file revisions differ.

    The two values are compared via their node.hex() representations.
    """
    return node.hex(f1) != node.hex(f2)
55 def split_dict(dleft,dright,l=[],c=[],r=[],match=file_mismatch):
56 """Loop over our repository and find all changed and missing files."""
57 for left in dleft.keys():
58 right=dright.get(left,None)
59 if right==None:
60 # we have the file but our parent hasn't: add to left set
61 l.append(left)
62 elif match(dleft[left],right) or gitmode(dleft.flags(left))!=gitmode(dright.flags(left)):
63 # we have it but checksums mismatch: add to center set
64 c.append(left)
65 for right in dright.keys():
66 left=dleft.get(right,None)
67 if left==None:
68 # if parent has file but we don't: add to right set
69 r.append(right)
70 # change is already handled when comparing child against parent
71 return l,c,r
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    Compares the manifest *mleft* against the manifest of every parent in
    *parents* (negative parent revisions are skipped), accumulating the
    results of split_dict over all of them.

    Returns three sorted lists: (only in mleft, differing, only in parent).
    *revision* is accepted for interface compatibility but not used.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        l, c, r = split_dict(mleft, mright, l, c, r)
    l.sort()
    c.sort()
    r.sort()
    return l, c, r
def get_author(logmessage, committer, authors):
    """As git distinguishes between author and committer of a patch, try
    to extract the author by detecting Signed-off-by lines.

    This walks from the end of the log message towards the top, skipping
    empty lines. Upon the first non-empty line, it walks all Signed-off-by
    lines upwards to find the first one. For that (if found), it extracts
    authorship information the usual way (authors table, cleaning, etc.)
    via fixup_user.

    If no Signed-off-by line is found, this defaults to the committer.

    This may sound stupid (and it somehow is), but in log messages we
    accidentally may have lines in the middle starting with
    "Signed-off-by: foo" and thus matching our detection regex. Only
    looking at the trailing run of such lines prevents that.
    """
    loglines = logmessage.split('\n')

    # from the tail, walk to the top skipping empty lines; i ends up as the
    # index of the last non-empty line, or -1 if there is none
    i = len(loglines) - 1
    while i >= 0 and not loglines[i].strip():
        i -= 1

    # walk further upwards over consecutive sob lines; 'first' ends up as
    # the topmost one of that trailing run
    first = None
    while i >= 0:
        m = sob_re.match(loglines[i])
        if m is None:
            break
        first = m
        i -= 1

    # if the last non-empty line matched our Signed-off-by regex:
    # extract the username
    if first is not None:
        return fixup_user(first.group(1), authors)
    return committer
def export_file_contents(ctx, manifest, files, hgtags):
    """Write an inline 'M' (file modify) command plus data for each file.

    For every name in *files* the content is read from ctx.filectx(name)
    and the mode is derived from manifest.flags(name).  A file named
    ".hgtags" is skipped unless *hgtags* is true.  Progress is reported on
    stderr every cfg_export_boundary exported files, and once more at the
    end when *files* is longer than cfg_export_boundary.
    """
    count = 0
    total = len(files)
    for name in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and name == ".hgtags":
            sys.stderr.write('Skip %s\n' % (name))
            continue
        d = ctx.filectx(name).data()
        wr('M %s inline %s' % (gitmode(manifest.flags(name)), name))
        wr('data %d' % len(d))  # had some trouble with size()
        wr(d)
        count += 1
        if count % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (count, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (count, total))
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    The following rewrites are applied, in this order:
      * '[', ' ', '~', '^', ':', '?', '\\', '*' and '..' become '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' are collapsed into a single '_'

    If the result differs from *name*, a warning mentioning *what* is
    written to stderr.  Empty names and empty path components are passed
    through instead of raising IndexError.

    Returns the sanitized name.
    """

    def dot(part):
        # startswith() is safe on empty components (e.g. from 'a//b')
        if part.startswith('.'):
            return '_' + part[1:]
        return part

    n = re.sub(r'([\[ ~^:?\\*]|\.\.)', '_', name)
    if n and n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap, hgtags):
    """Write one hg revision as a git-fast-import 'commit' to stdout.

    Parameters:
      ui, repo   -- passed through to get_changeset; repo is also queried
                    for parents, manifest and status
      revision   -- hg revision number to export
      old_marks  -- mapping consulted by revnum_to_revref for parent refs
      max        -- total number of revisions, used for progress output only
      count      -- running command counter, handed to checkpoint()
      authors    -- author mapping passed to get_changeset / get_author
      sob        -- if true, emit an 'author' line derived from
                    Signed-off-by lines in the description
      brmap      -- cache of original -> sanitized branch names; updated
                    in place
      hgtags     -- if true, .hgtags files are exported as well

    Returns the updated command count as returned by checkpoint().
    """
    def get_branchname(name):
        # sanitize each branch name only once and remember the result
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), _files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts the branch ref afresh
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1: wr() appends a newline after desc, which is counted as part of
    # the data payload
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # Sort the parents based on revision ids so that we always get the
    # same resulting git repo, no matter how the revisions were
    # numbered.
    parents.sort(key=repo.changelog.node, reverse=True)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # explicit loop: the deletions must be written eagerly and in order
    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags come from repo.tagslist().  The 'tip' tag is ignored, as are tags
    whose node is not present in *mapping_cache* (keyed by hex node, value
    the hg revision number).  *ui* and *authors* are accepted for interface
    compatibility but not used.

    Returns the updated command count (one checkpoint() call per tag
    written).
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Load an author mapping from *filename*.

    Each non-empty line that does not start with '#' must look like
    'key = value'; surrounding whitespace of key and value is stripped.
    Lines that do not match are reported on stderr with their line number
    and skipped.  The number of loaded entries is reported on stderr.

    Returns a dict mapping key to value; an empty dict if the file does
    not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    loaded = 0
    # the with-block closes the file even if processing a line fails
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, key without ^:
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    '''Return the tipmost branch head in heads.

    Walks *heads* from last to first and returns the first head whose
    changelog entry has no 'close' in element 5 of repo.changelog.read(h)
    (presumably the changeset's extra data -- confirm).  If every head is
    closed, the last head is returned.
    '''
    for h in reversed(heads):
        if 'close' not in repo.changelog.read(h)[5]:
            return h
    return heads[-1]
def verify_heads(ui, repo, cache, force):
    """Check the repository's heads before exporting.

    Two checks are made:
      1. for every hg branch (tip chosen by branchtip), the SHA1 reported
         by get_git_sha1 must equal the entry stored in *cache*;
      2. no branch name may occur for more than one of repo.heads().

    Each violation is reported on stderr.  Returns False at the first
    violation unless *force* is true; returns True otherwise.
    """
    branches = {}
    for bn, heads in repo.branchmap().items():
        branches[bn] = branchtip(repo, heads)
    # order branches by descending revision number of their tip
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in branches.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, b in ordered:
        b = get_branch(b)
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False, hgtags=False):
    """Export the hg repository at *repourl* as a git-fast-import stream.

    Parameters:
      repourl     -- repository location handed to setup_repo
      m           -- maximum hg revision to export; negative means "up to tip"
      marksfile   -- file holding git-fast-import marks from the last run
      mappingfile -- file holding the hex-node -> revision mapping
      headsfile   -- file holding the git heads from the last run
      tipfile     -- state file; its 'tip' entry is where exporting resumes
      authors     -- author mapping (defaults to an empty dict)
      sob         -- enable parsing of Signed-off-by lines
      force       -- continue despite head verification errors
      hgtags      -- export .hgtags files too

    Returns 0 on success, 1 if head verification failed.
    """
    if authors is None:
        authors = {}
    limit = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    start = int(state_cache.get('tip', 0))
    stop = limit
    if limit < 0 or stop > tip:
        stop = tip

    # (re)build the hex-node -> revision mapping for all revisions below stop
    for rev in range(0, stop):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # running count of issued commands, threaded through the exporters
    c = 0
    brmap = {}
    for rev in range(start, stop):
        c = export_commit(ui, repo, rev, old_marks, stop, c, authors, sob, brmap, hgtags)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing required option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # -1 means "no limit": export up to the repository tip
    m = -1
    if options.max is not None:
        m = options.max

    # these options are mandatory; the first missing one aborts the run
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force, hgtags=options.hgtags))