Merge pull request #5 from living180/master
[fast-export/fast-export-unix-compliant.git] / hg-fast-export.py
blob076d432aae3b67e2c2573565e64b911f96533925
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from tempfile import mkstemp
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
# Pattern for a "Signed-off-by: <someone>" line; group 1 captures everything
# after the "Signed-off-by: " prefix.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Issue a 'checkpoint' command every this many commits; 0 turns it off.
cfg_checkpoint_count = 0

# Report progress on stderr every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Map hg manifest flags to a git file mode string.

    'l' in flags yields '120000' and takes precedence over 'x', which yields
    '100755'; anything else is '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg (if non-empty) to stdout, always followed by one newline."""
    out = sys.stdout
    if msg:
        out.write(msg)
    out.write('\n')
def checkpoint(count):
    """Increment the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is written only when cfg_checkpoint_count is positive and
    the incremented counter is a multiple of it. Returns the incremented
    counter in every case.
    """
    count += 1
    if cfg_checkpoint_count <= 0:
        return count
    if count % cfg_checkpoint_count != 0:
        return count
    sys.stderr.write("Checkpoint after %d commits\n" % count)
    wr('checkpoint')
    wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    A truthy entry in old_marks is used as-is; otherwise the reference is the
    mark ':<rev+1>'.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1,f2):
    """Return True when the two file revision ids differ (compared via node.hex)."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft,dright,l=None,c=None,r=None,match=file_mismatch):
    """Classify file names by comparing our manifest against a parent's.

    dleft  -- mapping of file name to file id for the revision being exported
    dright -- the same mapping for one parent
    l,c,r  -- optional lists to append to; a fresh list is created for each
              one that is omitted (they used to default to shared module-wide
              lists, which accumulated entries across calls)
    match  -- callable(left_id, right_id) returning True when the ids differ

    Returns the tuple (l, c, r):
      l -- names present in dleft but missing from dright
      c -- names present in both for which match() is true
      r -- names present in dright but missing from dleft
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file but the parent does not
            l.append(name)
        elif match(dleft[name], theirs):
            # both have it but the ids differ
            c.append(name)
    for name in dright.keys():
        # changed files were already handled above; only look for files the
        # parent has and we do not
        if dleft.get(name, None) is None:
            r.append(name)
    return l,c,r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added/changed/removed file names of mleft against all parents.

    Every non-negative parent revision's manifest is compared with mleft via
    split_dict(), accumulating into the same three lists. Returns the three
    lists (added, changed, removed), each sorted. The revision argument is
    not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the git author from trailing Signed-off-by lines.

    Git distinguishes author from committer, so this looks only at the tail
    of the log message: trailing blank lines are skipped, then the
    contiguous run of Signed-off-by lines ending at the last non-blank line
    is walked upwards. The topmost line of that run names the author, which
    is passed through fixup_user() together with the authors table.

    Signed-off-by lines elsewhere in the message are deliberately ignored,
    since such text can appear in the middle of a message by accident.

    Returns committer when the last non-blank line is not a Signed-off-by
    line (or the message is blank).
    """
    lines = logmessage.split('\n')

    # locate the last non-blank line (idx ends at -1 if there is none)
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the contiguous Signed-off-by run, remembering the topmost
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files):
    """Emit an inline 'M' (file modify) command plus data for each file.

    ctx      -- change context providing filectx(name).data()
    manifest -- manifest providing flags(name), fed to gitmode()
    files    -- names to export; '.hgtags' is skipped with a note on stderr

    Progress is reported on stderr every cfg_export_boundary exported files,
    and once more at the end when files holds more than that many entries.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags":
            sys.stderr.write('Skip %s\n' % path)
            continue
        data = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, path))
        # length is taken from the data itself rather than a size() call
        wr('data %d' % len(data))
        wr(data)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name,what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the ref name to clean up
    what -- label used in the warning message only (e.g. "branch", "tag")

    Rewrites applied, in order:
      * each of '[', ' ', '~', '^', ':', '?', '*' and each '..' becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse to a single '_'

    Empty names and empty path components (as in 'a//b') are passed through
    unchanged instead of raising IndexError. A warning goes to stderr
    whenever the result differs from the input. Returns the sanitized name.
    """
    def dot(component):
        # startswith() is safe on an empty component, unlike component[0]
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    # raw strings, and '[' escaped inside the class, keep the pattern free of
    # escape-sequence and nested-set warnings on newer interpreters
    n = re.sub(r'([\[ ~^:?*]|\.\.)', '_', name)
    if n and n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub(r'_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap):
    """Write one git-fast-import 'commit' command for hg revision `revision`.

    ui, repo  -- passed to get_changeset(); repo also supplies parents,
                 manifests and status
    revision  -- hg revision number; the commit is marked ':<revision+1>'
    old_marks -- lookup used by revnum_to_revref() for parent references
    max       -- upper bound shown in the progress message only
    count     -- running command counter, handed to checkpoint()
    authors   -- author table for get_changeset() and get_author()
    sob       -- when true, emit an 'author' line derived from Signed-off-by
    brmap     -- cache of hg branch name -> sanitized git branch name,
                 updated in place

    Returns the counter as returned by checkpoint().
    """
    def get_branchname(name):
        # sanitize each branch name only once; brmap remembers the result
        if name not in brmap:
            brmap[name] = sanitize_name(name)
        return brmap[name]

    changeset = get_changeset(ui, repo, revision, authors)
    (revnode, _, user, (time, timezone), files, desc, hg_branch, _) = changeset
    branch = get_branchname(hg_branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 gets an explicit 'reset' first
    if not parents and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1: wr() appends a newline to desc, which is counted as part of the data
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # Order parents by their node ids so the resulting git repository is the
    # same no matter how the hg revisions happened to be numbered.
    parents.sort(key=repo.changelog.node, reverse=True)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if not parents:
        # no parent: feed in the full manifest
        added = man.keys()
        added.sort()
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # Exactly one parent: take the change lists from repo.status()
            # rather than expensively comparing checksums.
            # NOTE(review): the index swap assumes status() yields
            # (modified, added, removed, ...) -- confirm against Mercurial.
            status = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = status[1], status[0], status[2]
            kind = 'simple delta'
        else:
            # Merge: checksum comparison is costly, so it is only done here,
            # where it is needed.
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors):
    """Emit a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags are taken from repo.tagslist(). The 'tip' tag is skipped, as are
    tags whose hex node id is not a key of mapping_cache. count is advanced
    through checkpoint() once per exported tag and returned.
    """
    for raw_tag, tagnode in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file of 'key = value' lines into a dict.

    filename -- path of the author map; a missing file yields an empty dict

    Each line must match '<key>=<value>'; key and value are stripped of
    surrounding whitespace. Lines that do not match are reported on stderr
    with their 1-based line number and skipped. The number of entries loaded
    is reported on stderr. Returns the dict.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # the counter must start at zero before the loop increments it
    lineno = 0
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails part-way
        f.close()
    # report entries actually loaded, not lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the repository's branch heads before exporting.

    Two checks are made:
      1. for every hg branch, the sha1 reported by get_git_sha1() must equal
         the value stored in cache for that branch;
      2. no branch name may be carried by more than one entry of
         repo.heads().

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    branches = repo.branchtags()
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in branches.items())

    # only verify the branches hg knows about, not everything git has
    for _, _, name in ordered:
        name = get_branch(name)
        git_sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if git_sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, git_sha1, cached))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False):
    """Export hg revisions and tags of repourl as a git-fast-import stream.

    repourl     -- repository location handed to setup_repo()
    m           -- maximum revision count to export; negative means "up to tip"
    marksfile   -- cache file of fast-import marks (values loaded as int-1)
    mappingfile -- cache file mapping hex node ids to revision numbers;
                   rewritten after the export
    headsfile   -- cache file of branch heads, checked by verify_heads()
    tipfile     -- state cache file; its 'tip' entry is where the export
                   resumes, and it is rewritten after the export
    authors     -- author table passed on to the export helpers
    sob         -- passed to export_commit() to enable Signed-off-by parsing
    force       -- passed to verify_heads() to ignore validation errors

    Returns 1 when verify_heads() fails, 0 otherwise.
    """
    limit = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = limit
    if limit < 0 or last > tip:
        last = tip

    # (re)build the node -> revision mapping for everything up to `last`
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # the command counter must exist before the first export_commit() call
    # (and before export_tags() when there is nothing new to export)
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, sob, brmap)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing required option, print usage and exit with status 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # no --max means "no limit", which hg2git() expects as a negative number
    m = options.max
    if m is None:
        m = -1

    # required options, checked in this order; bail() exits on the first miss
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for attr, flag in required:
        if getattr(options, attr) is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=a, sob=options.sob, force=options.force))