Organized imports
[fast-export.git] / hg-fast-export.py
blobe21378216ba3f8c4dbbd4053a89523921e208b9e
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Regex recognising a "Signed-off-by: <who>" line of a log message (the
# "o" and "b" may be upper or lower case); group 1 captures everything after
# the "Signed-off-by: " prefix.
sob_re=re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count=0
# write a progress message to stderr every time this many file contents
# have been written
cfg_export_boundary=1000
def gitmode(flags):
    """Translate a file's flag string into the git mode string to export.

    'l' in flags selects '120000' and takes precedence over 'x', which
    selects '100755'; any other flags yield '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg followed by a single newline to stdout.

    msg is expected to be a string; None is treated like the empty string,
    so wr() and wr(None) both emit just a newline.
    """
    if msg is None:
        msg = ''
    # Go through sys.stdout.write rather than the print statement so the
    # helper is valid syntax on both Python 2 and Python 3.
    sys.stdout.write(msg)
    sys.stdout.write('\n')
    # Debugging aid: echo every emitted line to stderr.
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' command when due.

    A checkpoint is written (and announced on stderr) whenever
    cfg_checkpoint_count is positive and the incremented counter is a
    multiple of it. Returns the incremented counter.
    """
    count += 1
    if cfg_checkpoint_count <= 0:
        # checkpointing disabled
        return count
    if count % cfg_checkpoint_count != 0:
        return count
    sys.stderr.write("Checkpoint after %d commits\n" % count)
    wr('checkpoint')
    wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference for hg revision number rev.

    If old_marks holds a non-empty entry for rev, that entry is returned;
    otherwise the mark ':<rev+1>' is used.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1,f2):
    """Return True when the hex forms of the two file nodes differ."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft,dright,l=None,c=None,r=None,match=file_mismatch):
    """Compare two name->value mappings and classify their keys.

    Keys are appended to one of three lists:
      l -- present in dleft but missing from dright
      c -- present in both, and match(dleft[key], dright[key]) is true
      r -- present in dright but missing from dleft

    Callers may pass existing lists for l, c and r to accumulate results
    over several calls; any list left out starts as a fresh empty list for
    this call (the defaults are None rather than [] so that state is never
    shared between unrelated calls).

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right):
            # we have it but checksums mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right, None) is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent
    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added/changed/removed file names of mleft against its parents.

    mleft is compared via split_dict() with the manifest of every
    non-negative parent revision; results for all parents accumulate in the
    same three lists, which are sorted before being returned as
    (added, changed, removed). The revision argument is not used.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent < 0:
            continue
        parent_manifest = repo.changectx(parent).manifest()
        added, changed, removed = split_dict(mleft, parent_manifest,
                                             added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the git author from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    try to extract the author from the log message.

    Trailing blank lines are skipped. Starting at the last non-blank line,
    consecutive lines matching the Signed-off-by regex are walked upwards;
    the topmost line of that run supplies the name, which is passed
    through fixup_user() together with the authors table.

    If the last non-blank line is not a Signed-off-by line (or the message
    is blank), the committer is returned unchanged.

    Only the trailing run is considered on purpose: a line starting with
    "Signed-off-by: foo" somewhere in the middle of a log message must not
    be mistaken for authorship information.
    """
    lines = logmessage.split('\n')

    # index of the last non-blank line, or -1 if there is none
    pos = len(lines) - 1
    while pos >= 0 and not lines[pos].strip():
        pos -= 1

    # climb through the trailing Signed-off-by run, remembering the topmost
    topmost = None
    while pos >= 0:
        found = sob_re.match(lines[pos])
        if found is None:
            break
        topmost = found
        pos -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files):
    """Emit an inline 'M' (file modify) command plus data for each file.

    For every name in files except ".hgtags", the file data is read from
    ctx and written to stdout with the git mode derived from the manifest
    flags. Progress is reported on stderr every cfg_export_boundary
    exported files, and once more at the end when the list is longer than
    that boundary.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags":
            sys.stderr.write('Skip %s\n' % (path))
            continue
        payload = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, path))
        wr('data %d' % len(payload)) # had some trouble with size()
        wr(payload)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name,what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    The following rewrites are applied, in order:
      * each of '[', ' ', '~', '^', ':', '?', '*' and each '..' becomes '_'
      * a trailing '/' or '.' becomes '_'
      * a leading '.' of any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    A warning naming `what` is written to stderr when the result differs
    from the input. Empty names and empty path components (as in 'a//b')
    are passed through without raising.

    Returns the sanitized name.
    """
    def dot(component):
        # startswith() rather than component[0]: safe for empty components
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    # raw string, with '[' escaped inside the character class
    n = re.sub(r'([\[ ~^:?*]|\.\.)', '_', name)
    # slice instead of index so an empty string does not raise
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,n))
    return n
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap):
    """Write one hg revision to stdout as a git-fast-import 'commit'.

    Parameters:
      ui, repo  -- passed through to get_changeset(); repo is also queried
                   for parents, manifest and status
      revision  -- hg revision number to export; its mark is revision+1
      old_marks -- mapping consulted by revnum_to_revref() for parents
      max       -- total shown in the progress message on stderr
      count     -- running command counter, handed to checkpoint()
      authors   -- author table passed to get_changeset()/get_author()
      sob       -- when true, emit an 'author' line derived from
                   Signed-off-by lines of the log message
      brmap     -- dict caching hg branch name -> sanitized git name;
                   updated in place

    Returns the counter as returned by checkpoint(count).
    """
    def get_branchname(name):
        # sanitize each branch name only once; later lookups hit the cache
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 gets an explicit 'reset' of its
    # branch ref before the commit
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # wr() appends a newline after desc, so the announced length is one
    # byte more than len(desc)
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # Sort the parents by changelog node (descending) so that we always
    # get the same resulting git repo, no matter how the revisions were
    # numbered.
    parents.sort(key=repo.changelog.node, reverse=True)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else: # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it for
            # merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,kind,revision+1,max,len(added),len(changed),len(removed)))

    # explicit loop: map() must not be relied on for side effects
    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors):
    """Emit a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are sanitized first. The 'tip' tag is skipped, as are tags
    whose hex-encoded node is not a key of mapping_cache. The command
    counter is run through checkpoint() once per exported tag and the
    final value is returned. ui and authors are not used.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dict.

    Each line is expected to look like "key = value"; both sides are
    stripped of surrounding whitespace. Lines that do not match are
    reported on stderr with their 1-based line number and skipped.

    Returns an empty dict when filename does not exist. The number of
    mappings actually loaded is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        # the counter must start at 0 before the loop increments it
        lineno = 0
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # put key:value in cache, both stripped of whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading or parsing raises
        f.close()
    # report the mappings stored, not the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the hg branch heads against the cached git heads.

    Two checks are made, each reporting problems on stderr:
      1. for every hg branch, the SHA1 from get_git_sha1() must equal the
         value stored in cache for that branch;
      2. no branch name may occur on more than one repository head.

    Returns False at the first problem unless force is true, in which case
    problems are only reported. Returns True otherwise.
    """
    branches = repo.branchtags()
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in branches.items()])

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 == c:
            continue
        sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                         '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
        if not force:
            return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_) = get_changeset(ui,repo,h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export an hg repository as a git-fast-import stream on stdout.

    Parameters:
      repourl     -- repository location handed to setup_repo()
      m           -- highest revision count to export; negative means "up
                     to tip"
      marksfile, mappingfile, headsfile, tipfile
                  -- cache files read via load_cache(); tipfile and
                     mappingfile are rewritten via save_cache()
      authors     -- author table (defaults to an empty dict)
      sob         -- enable Signed-off-by parsing for the author line
      force       -- continue even if verify_heads() reports problems

    Export resumes at the 'tip' value stored in the tipfile cache (0 if
    absent). Returns 1 when head verification fails, 0 on success.
    """
    if authors is None:
        # fresh dict per call instead of a shared mutable default
        authors = {}
    _max = int(m)

    # NOTE(review): marks appear to be rev+1 (see the 'mark' line written
    # per commit), hence the -1 when loading -- confirm against load_cache
    old_marks = load_cache(marksfile, lambda s: int(s)-1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    try:
        tip = repo.changelog.count()
    except AttributeError:
        # fall back to len() when the changelog has no count() method
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # refresh the hex-node -> revision-number mapping for all revisions
    for rev in range(0, last):
        (revnode,_,_,_,_,_,_,_) = get_changeset(ui,repo,rev,authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # command counter; must be initialised before the first export_commit()
    c = 0
    brmap = {}
    for rev in range(first, last):
        c = export_commit(ui,repo,rev,old_marks,last,c,authors,sob,brmap)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui,repo,old_marks,mapping_cache,c,authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # no -m given: export everything up to tip
    m = -1
    if options.max is not None:
        m = options.max

    # mandatory options, checked in this order; bail() exits on the first
    # one that is missing
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    exit_code = hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                       options.headsfile, options.statusfile,
                       authors=a, sob=options.sob, force=options.force)
    sys.exit(exit_code)