hg-fast-export.py: Minor tweaks/cleanup
[fast-export/fast-export-unix-compliant.git] / hg-fast-export.py
blob811333367f43fa2c8085bb0991cbfb551ef7145c
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset,load_cache,save_cache,get_git_sha1
8 from tempfile import mkstemp
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Matches a log-message line of the form "Signed-off-by: <who>" (the O, and B
# may be upper or lower case); group 1 holds everything after ": ".
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command every this many commits; 0 disables checkpoints.
cfg_checkpoint_count = 0
# Report progress on stderr every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(x):
    """Return the git file mode string for a flag.

    A truthy x yields '100755', anything falsy yields '100644'.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Print msg as one line on standard output.

    None is treated like the empty string, so only a newline is written.
    """
    if msg is None:
        print('')
    else:
        print(msg)
def checkpoint(count):
    """Increment the running counter and return the new value.

    When cfg_checkpoint_count is positive and the new value is a multiple
    of it, a note is written to stderr and a 'checkpoint' command followed
    by an empty line is written via wr().
    """
    bumped = count + 1
    if cfg_checkpoint_count > 0:
        if bumped % cfg_checkpoint_count == 0:
            sys.stderr.write("Checkpoint after %d commits\n" % bumped)
            wr('checkpoint')
            wr()
    return bumped
def get_parent_mark(parent, marks):
    """Return the reference to use for the revision number parent.

    marks is consulted under the key str(parent). If it has no such entry,
    the mark syntax ':<parent+1>' is returned instead.
    """
    # Built up front, exactly as the dict.get() default would be.
    fallback = ':%d' % (parent + 1)
    try:
        return marks[str(parent)]
    except KeyError:
        return fallback
def file_mismatch(f1, f2):
    """Return True when node.hex() of the two file revisions differs."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two name->value mappings and classify their keys.

    Keys only in dleft are appended to l, keys in both whose values make
    match(dleft[key], dright[key]) true are appended to c, and keys only
    in dright are appended to r. Lists passed in by the caller are extended
    in place; any list left as None starts out as a fresh empty list.

    Returns the tuple (l, c, r).
    """
    # Fresh lists per call: a literal [] default would be shared between
    # calls and silently accumulate entries from earlier invocations.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif match(dleft[name], other):
            # we have it but checksums mismatch: add to center set
            c.append(name)
    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a changed file was already handled in the first loop
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed file names relative to the parents.

    For every non-negative entry in parents, the manifest of that parent
    (repo.changectx(p).manifest()) is compared against mleft via
    split_dict(), accumulating into three lists. Each list is sorted before
    being returned as (added, changed, removed). The revision argument is
    not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from trailing Signed-off-by lines.

    Git distinguishes author from committer. Trailing empty lines of the
    log message are skipped; starting at the last non-empty line, the
    contiguous run of Signed-off-by lines is followed upwards and the
    topmost one of that run is taken. Its name is passed through
    fixup_user() together with the authors table and returned.

    Only the trailing run is considered on purpose: a line in the middle
    of a message may accidentally start with "Signed-off-by: foo" and must
    not be mistaken for authorship information.

    If the last non-empty line is not a Signed-off-by line (or the message
    has no non-empty line), committer is returned.
    """
    lines = logmessage.split('\n')

    # step over trailing blank lines
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the trailing Signed-off-by run, remembering the topmost
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command plus data for every name in files.

    For each file the content is read from ctx.filectx(name).data() and the
    mode is derived from manifest.execf(name) via gitmode(). A progress
    line goes to stderr every cfg_export_boundary files, and once more at
    the end when the total exceeds cfg_export_boundary.
    """
    total = len(files)
    done = 0
    for path in files:
        blob = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # length is taken from the data itself; the original author noted
        # trouble with size()
        wr('data %d' % len(blob))
        wr(blob)
        done += 1
        if done % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (done, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (done, total))
def is_merge(parents):
    """Return True if more than one entry of parents is a real revision.

    Entries below zero are not counted; only values >= 0 count as parents.
    """
    # The counter in the visible original was never initialized before
    # being incremented; counting via a list avoids the counter entirely.
    real_parents = [p for p in parents if p >= 0]
    return len(real_parents) > 1
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob):
    """Write one 'commit' command for hg revision `revision` via wr().

    ui, repo  -- passed to get_changeset(); repo is also queried for the
                 parent revisions, change context, manifest and status
    revision  -- revision number to export; its mark is revision+1
    marks     -- dict keyed by str(revision); updated with this revision's mark
    heads     -- dict branch -> cached head reference; the entry for the
                 branch is set to '' once the branch has been written to
    last      -- dict branch -> last revision exported for that branch
    max       -- only used in the progress message
    count     -- running command counter, handed to checkpoint()
    authors   -- passed to get_changeset() and get_author()
    sob       -- if true, also write an 'author' line derived by get_author()

    Returns the value of checkpoint(count).
    """
    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)
    parents=repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # NOTE(review): the +1 presumably accounts for the newline that wr()
    # appends after desc below -- confirm
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    # pidx1 indexes the parent with the larger revision number, pidx2 the
    # other one (with equal parents, pidx1 stays 0)
    pidx1, pidx2 = 0, 1
    if parents[0] < parents[1]:
        pidx1, pidx2 = 1, 0

    src=heads.get(branch,'')
    link=''
    if src!='':
        # if we have a cached head, this is an incremental import: initialize it
        # and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch]=''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch,src))
        link=src # avoid making a merge commit for incremental import
    elif link=='' and not heads.has_key(branch) and revision>0:
        # newly created branch and not the first one: connect to parent
        tmp=get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch,tmp))
        link=tmp # avoid making a merge commit for branch fork
    elif last.get(branch,revision) != parents[pidx1] and parents[pidx1] > 0 and revision > 0:
        # the branch's previously exported revision is not this commit's
        # (larger) parent: state the parent explicitly
        pm=get_parent_mark(parents[pidx1],marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch,revision,branch,parents[pidx1]));
        wr('from %s' % pm)

    # a positive second parent is written as a 'merge' line
    if parents[pidx2] > 0:
        pm=get_parent_mark(parents[pidx2],marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch,pm,parents[pidx2]))
        wr('merge %s' % pm)

    last[branch]=revision
    heads[branch]=''
    # we need this later to write out tags
    marks[str(revision)]=':%d'%(revision+1)

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed,type=[],[],[],''

    if revision==0:
        # first revision: feed in full manifest
        added=man.keys()
        added.sort()
        type='full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added,changed,removed=get_filechanges(repo,revision,parents,man)
        type='thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f=repo.status(repo.lookup(parents[0]),revnode)[:3]
        # status entry 1 is used as 'added', entry 0 as 'changed', entry 2
        # as 'removed'
        added,changed,removed=f[1],f[0],f[2]
        type='simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch,type,revision+1,max,len(added),len(changed),len(removed)))

    # deletions first, then the contents of added and changed files
    map(lambda r: wr('D %s' % r),removed)
    export_file_contents(ctx,man,added)
    export_file_contents(ctx,man,changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every tag in range.

    Tags come from repo.tagslist(). The tag named 'tip' is skipped, as are
    tags whose revision is outside start <= rev < end. After each written
    tag the counter is advanced through checkpoint(); the final counter
    value is returned. ui and authors are not used here.
    """
    for tag_name, tag_node in repo.tagslist():
        # ignore latest revision
        if tag_name == 'tip':
            continue
        tag_rev = repo.changelog.rev(tag_node)
        # ignore those tags not in our import range
        if not (start <= tag_rev < end):
            continue

        git_ref = get_parent_mark(tag_rev, marks_cache)
        if git_ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag_name, tag_rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n'
                         % (tag_name, tag_rev, git_ref))
        wr('reset refs/tags/%s' % tag_name)
        wr('from %s' % git_ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dict.

    Each line is expected to look like 'key = value'; key and value are
    stripped of surrounding whitespace. Lines that do not match are reported
    on stderr with their line number and skipped. A missing file yields an
    empty dict.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        # The line counter must start at zero; it was never initialized in
        # the visible original.
        lineno = 0
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n'
                                 % (filename, lineno))
                continue
            # text before the first '=' is the key, the remainder the value
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading or matching raises
        f.close()
    # report how many mappings were loaded, not how many lines were read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the repository's branch heads against the cached git heads.

    cache maps a branch name to the SHA1 recorded for it. For every branch
    from repo.branchtags() the value from get_git_sha1() is compared with
    the cached one. Afterwards every head from repo.heads() is checked so
    that no branch name is seen twice.

    Returns False on the first problem unless force is true; otherwise
    problems are only reported on stderr and True is returned.

    NOTE(review): the nesting of this body was reconstructed from
    flattened text; in particular confirm that the 'if sha1!=c' check is
    meant to sit beside (not inside) the 'Verifying branch' condition.
    """
    branches=repo.branchtags()
    # sort by descending revision number of each branch's node
    l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # get list of hg's branches to verify, don't take all git has
    for _,_,b in l:
        b=get_branch(b)
        sha1=get_git_sha1(b)
        c=cache.get(b)
        if sha1!=None and c!=None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1!=c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
            if not force: return False

    # verify that branch has exactly one head
    t={}
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
        # a branch name seen before means a second head for that branch
        if t.get(branch,False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                repo.changelog.rev(h))
            if not force: return False
        t[branch]=True

    return True
def mangle_mark(mark):
    """Return mark converted to an integer, minus one, as a string."""
    number = int(mark)
    return str(number - 1)
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export revisions of the repository at repourl as fast-import commands.

    repourl    -- repository location handed to setup_repo()
    m          -- upper revision bound (exclusive); a negative value means
                  "up to repo.changelog.count()"
    marksfile  -- cache file loaded with mangle_mark as converter
    headsfile  -- cache file with the heads of the previous run
    tipfile    -- state cache; its 'tip' entry (default 0) is the first
                  revision to export and is rewritten at the end
    authors    -- author table passed on to the export functions
                  (defaults to an empty dict)
    sob        -- passed to export_commit() to enable Signed-off-by parsing
    force      -- passed to verify_heads() to ignore validation errors

    Returns 1 if verify_heads() fails, otherwise 0.
    """
    # avoid sharing one dict object between calls
    if authors is None:
        authors={}

    _max=int(m)

    marks_cache=load_cache(marksfile,mangle_mark)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    tip=repo.changelog.count()

    first=int(state_cache.get('tip',0))
    stop=_max
    if _max<0:
        stop=tip

    # Running command counter; it must start at zero before the first
    # export_commit() call (it was never initialized in the visible original).
    c=0
    last={}
    for rev in range(first,stop):
        c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,stop,c,authors,sob)

    c=export_tags(ui,repo,marks_cache,first,stop,c,authors)

    sys.stderr.write('Issued %d commands\n' % c)

    # remember where to resume on the next run
    state_cache['tip']=stop
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing required option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    options, args = parser.parse_args()

    # without --max, pass -1 so that hg2git() exports up to the tip
    if options.max is None:
        m = -1
    else:
        m = options.max

    # these options are mandatory; complain about the first one missing
    required = (
        ('marksfile', '--marks'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for attr, flag in required:
        if getattr(options, attr) is None:
            bail(parser, flag)

    if options.authorfile is None:
        a = {}
    else:
        a = load_authors(options.authorfile)

    exit_code = hg2git(options.repourl, m, options.marksfile, options.headsfile,
                       options.statusfile, authors=a, sob=options.sob,
                       force=options.force)
    sys.exit(exit_code)