hg2git: Update copyrights and maintainership information.
[fast-export/barak.git] / hg-fast-export.py
blob166204cd3b540c348468a5500d9773c73787d61e
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch
9 from tempfile import mkstemp
10 from optparse import OptionParser
11 import re
12 import sys
13 import os
# Pattern recognising a Signed-off-by trailer line in a log message; the
# single capture group holds everything after the "Signed-off-by: " prefix.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every this many commits; 0 disables them.
cfg_checkpoint_count = 0

# Print a progress message each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(flags):
    """Map a file's flags to the git file mode string used on export.

    'l' in flags yields '120000' and takes precedence over 'x', which
    yields '100755'; everything else yields '100644'.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg followed by a newline to standard output.

    msg is expected to be a string; None is treated like the empty
    string, so only the newline is written in that case.
    """
    if msg is None:
        msg = ''
    # sys.stdout.write replaces the print statement: the output for
    # strings is the same, and the syntax is valid on Python 2 and 3.
    sys.stdout.write(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the command counter by one and return the new value.

    When cfg_checkpoint_count is positive and the new counter value is a
    multiple of it, a note is written to stderr and a 'checkpoint'
    command followed by an empty line is written to the output stream.
    """
    count += 1
    if cfg_checkpoint_count <= 0:
        return count
    if count % cfg_checkpoint_count != 0:
        return count
    sys.stderr.write("Checkpoint after %d commits\n" % count)
    wr('checkpoint')
    wr()
    return count
def get_parent_mark(parent, marks):
    """Return the reference to use for the given parent revision.

    If marks holds an entry under str(parent), that entry is returned;
    otherwise the mark reference ':<parent+1>' is returned.
    """
    key = str(parent)
    mark_ref = ':%d' % (parent + 1)
    return marks.get(key, mark_ref)
def file_mismatch(f1, f2):
    """Return True when the hex forms of two file revision ids differ."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two file mappings and collect added, changed and missing names.

    dleft  -- mapping of file name to revision id for "our" side
    dright -- mapping of file name to revision id for the parent side
    l      -- list extended with names only present in dleft
    c      -- list extended with names present in both for which
              match(dleft[name], dright[name]) is true
    r      -- list extended with names only present in dright
    match  -- predicate deciding whether two revision ids differ

    Lists passed in are extended in place, which lets a caller accumulate
    results over several parents. Omitted lists start out empty on every
    call. Returns the tuple (l, c, r).
    """
    # Fresh lists per call: list literals as defaults would be created once
    # and shared, leaking results from one call into the next.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right):
            # we have it but checksums mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # changed files were already handled in the first loop
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files relative to all parents.

    For every non-negative entry of parents, the parent's manifest is
    fetched from repo and compared against mleft with split_dict(); the
    results accumulate over all parents. Returns three sorted lists:
    (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the patch author from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    this tries to find the author in the log message.

    Trailing empty lines of the message are skipped. Starting at the last
    non-empty line, the contiguous run of Signed-off-by lines is walked
    upwards and the topmost one of that run is used: its value is passed
    through fixup_user() together with the authors table and returned.

    If the last non-empty line is not a Signed-off-by line (or the message
    has no non-empty line at all), committer is returned.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle, and that must not be picked up.
    """
    lines = logmessage.split('\n')

    # step over blank lines at the end of the message
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the trailing Signed-off-by run, remembering the topmost
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command with file data for every name in files.

    For each file, the contents are read through ctx and the git mode is
    derived from the flags recorded in manifest. Progress is reported on
    stderr after every cfg_export_boundary files, and once more at the end
    when the number of files exceeds cfg_export_boundary.
    """
    total = len(files)
    written = 0
    for path in files:
        contents = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, path))
        # the length is taken from the data itself, not from a size() call
        wr('data %d' % len(contents))
        wr(contents)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def is_merge(parents):
    """Return True when more than one entry of parents is non-negative.

    Negative entries are not counted as parents.
    """
    # Counting with sum() avoids a hand-maintained counter that would have
    # to be initialised before the loop.
    return sum(1 for parent in parents if parent >= 0) > 1
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- kind of name, only used in the warning message

    Each of the characters '[', ' ', '~', '^', ':', '?', '*' and every
    '..' sequence is replaced by '_', a trailing '/' becomes '_', a
    leading '.' of any '/'-separated component becomes '_', and runs of
    '_' are collapsed into one. A warning is written to stderr when the
    result differs from the input. Empty names and empty path components
    are passed through unchanged. Returns the sanitized name.
    """

    def dot(component):
        # startswith() instead of component[0]: 'a//b' yields an empty
        # component, which must not raise IndexError
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = name
    # raw string with an escaped '[' inside the class: same pattern, but no
    # invalid-escape or nested-set warnings on newer Python versions
    p = re.compile(r'([\[ ~^:?*]|\.\.)')
    n = p.sub('_', n)
    # endswith() also copes with an empty name, unlike n[-1]
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob,brmap):
    """Write one revision as a 'commit' command and return the new counter.

    The commit header (ref, mark, optional author, committer, message) is
    written first, followed by optional 'from' and 'merge' lines and the
    file deletions and modifications. heads, last, marks and brmap are
    updated in place. The return value is checkpoint(count).
    """
    def get_branchname(name):
        # sanitized names are remembered in brmap, so every branch name is
        # sanitized only once
        if name not in brmap:
            brmap[name] = sanitize_name(name)
        return brmap[name]

    changeset = get_changeset(ui, repo, revision, authors)
    (revnode, _, user, (time, timezone), files, desc, branch, _) = changeset
    parents = repo.changelog.parentrevs(revision)

    branch = get_branchname(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        author = get_author(desc, user, authors)
        wr('author %s %d %s' % (author, time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # the announced length is one more than len(desc); wr() appends a
    # newline after desc
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    # the parent with the higher revision number is handled first
    if parents[0] < parents[1]:
        first_parent, second_parent = parents[1], parents[0]
    else:
        first_parent, second_parent = parents[0], parents[1]

    cached_head = heads.get(branch, '')
    if cached_head != '':
        # a cached head means an incremental import: initialize the branch
        # from it and clear the entry so this happens only once
        wr('from %s' % cached_head)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch, cached_head))
    elif branch not in heads and revision > 0:
        # newly created branch and not the first revision: connect to parent
        parent_ref = get_parent_mark(parents[0], marks)
        wr('from %s' % parent_ref)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch, parent_ref))
    elif (last.get(branch, revision) != first_parent and first_parent > 0
          and revision > 0):
        parent_ref = get_parent_mark(first_parent, marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch, revision, branch, first_parent))
        wr('from %s' % parent_ref)

    # NOTE(review): this merge check is treated as independent of the
    # if/elif chain above -- confirm the nesting against the original layout.
    if second_parent > 0:
        merge_ref = get_parent_mark(second_parent, marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch, merge_ref, second_parent))
        wr('merge %s' % merge_ref)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d'%(revision+1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()

    if revision == 0:
        # first revision: feed in the full manifest
        added, changed, removed = sorted(man.keys()), [], []
        kind = 'full'
    elif is_merge(parents):
        # merge revision: comparing checksums is expensive for many files,
        # so it is only done here, where the simple delta is not used
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # non-merge revision: take the changes from the repository status
        # against the first parent without comparing checksums
        status = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = status[1], status[0], status[2]
        kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision+1, max, len(added), len(changed), len(removed)))

    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,marks_cache,start,end,count,authors):
    """Write a 'reset refs/tags/...' command for every tag in range.

    The tag named 'tip' is skipped, as are tags whose revision lies
    outside of start <= rev < end. The counter is passed through
    checkpoint() once per exported tag; the final value is returned.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tag_node)
        # ignore tags that are not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dictionary.

    Every line is expected to have the form 'key = value'; key and value
    are stripped of surrounding whitespace. Lines that do not match are
    reported on stderr together with their line number and skipped. A
    missing file yields an empty dictionary.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # line counter for error messages; it has to start at 0 before the loop
    lineno = 0
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # left of '=' is the key, right of it the value
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails half-way
        f.close()
    # report the number of entries actually loaded, not the lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the repository's branches and heads before exporting.

    For every branch reported by repo.branchtags(), the SHA1 obtained
    from get_git_sha1() is compared with the value stored in cache. After
    that, every head of the repository is checked so that no branch has
    more than one head. Problems are reported on stderr; the function
    returns False on the first problem unless force is set, and True
    otherwise.
    """
    branches = repo.branchtags()
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in branches.items()])

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 is not None and c is not None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
        # NOTE(review): the comparison below is treated as independent of
        # the check above -- confirm the nesting against the original layout.
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def mangle_mark(mark):
    """Convert a mark number to the string form of the number before it."""
    previous = int(mark) - 1
    return str(previous)
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export revisions and tags of a repository as a fast-import stream.

    repourl   -- location of the repository to export
    m         -- upper bound (exclusive) of revisions to export; a
                 negative value, or one beyond the repository's revision
                 count, means "up to the last revision"
    marksfile -- file the marks cache is loaded from
    headsfile -- file the heads cache is loaded from
    tipfile   -- file the state cache is loaded from and saved to
    authors   -- author map handed to the export functions (default: empty)
    sob       -- when true, authors are derived from Signed-off-by lines
    force     -- when true, head verification errors do not abort

    Returns 1 when head verification fails, 0 otherwise.
    """
    # a fresh dictionary per call instead of one shared default object
    if authors is None:
        authors = {}

    _max = int(m)

    marks_cache = load_cache(marksfile, mangle_mark)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    # resume after the revision recorded by the previous run, if any
    first_rev = int(state_cache.get('tip', 0))
    end_rev = _max
    if _max < 0 or end_rev > tip:
        end_rev = tip

    # number of commands issued so far; must be defined before the first
    # export_commit() call, which receives and returns it
    c = 0
    last = {}
    brmap = {}
    for rev in range(first_rev, end_rev):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last,
                          end_rev, c, authors, sob, brmap)

    c = export_tags(ui, repo, marks_cache, first_rev, end_rev, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = end_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__=='__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print the help and exit(2)."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")

    (options, args) = parser.parse_args()

    # -1 means "no upper limit" for the revisions to import
    if options.max is None:
        m = -1
    else:
        m = options.max

    # mandatory options, checked in this order; bail() exits on the first
    # one that is missing
    required = (
        ('--marks', options.marksfile),
        ('--heads', options.headsfile),
        ('--status', options.statusfile),
        ('--repo', options.repourl),
    )
    for flag, value in required:
        if value is None:
            bail(parser, flag)

    if options.authorfile is None:
        a = {}
    else:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob,
                    force=options.force))