Add hg2git.py with library routines
[fast-export/dharding.git] / hg-fast-export.py
blob54e68683a616354ea6c567ff4102645c123f8887
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset,load_cache,save_cache
8 from tempfile import mkstemp
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Pattern for a complete "Signed-off-by: <somebody>" line in a log message;
# the 'o' and 'b' may be upper or lower case, group 1 is the text after ': '.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# Emit a 'checkpoint' command each time this many commits were written;
# 0 disables checkpoints altogether.
cfg_checkpoint_count = 0
# Print a progress message each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(x):
    """Return the git file mode string selected by the flag `x`.

    A true `x` yields '100755', a false one '100644'.  The caller in this
    file passes the manifest's exec flag of a file.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write `msg` followed by a newline to standard output.

    Called without an argument it writes an empty line.
    """
    print(msg)
    # debugging aid (disabled): echo every written line to stderr as well
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    `count` is incremented by one.  If cfg_checkpoint_count is positive and
    the new count is a multiple of it, a note is written to stderr and a
    'checkpoint' command followed by an empty line is written via wr().

    Returns the incremented count.
    """
    total = count + 1
    due = cfg_checkpoint_count > 0 and total % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % total)
        wr('checkpoint')
        wr()
    return total
def get_parent_mark(parent, marks):
    """Return the reference to use for the revision number `parent`.

    `marks` is searched for the key str(parent+1).  When there is no such
    entry the mark syntax ':<parent+1>' is returned instead; per the
    original author, entries found in the cache may be SHA1s.
    """
    mark_number = parent + 1
    fallback = ':%d' % mark_number
    return marks.get(str(mark_number), fallback)
def mismatch(f1, f2):
    """Tell whether two file revision ids differ.

    Both ids are converted with node.hex() and the results are compared;
    True means they are not equal.
    """
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def outer_set(dleft, dright, l, c, r):
    """Sort the names of two mappings into left-only/changed/right-only.

    For every key of `dleft`:
      - absent from `dright`: the key is appended to `l`;
      - present in both but mismatch() reports different values: the key
        is appended to `c`.
    Every key of `dright` that is absent from `dleft` is appended to `r`.

    The three lists are extended in place and returned as (l, c, r).
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file, the other side does not
            l.append(name)
            continue
        if mismatch(dleft[name], theirs):
            # both sides have it, but with different ids
            c.append(name)
    # Changed files were already caught above, so the second pass only has
    # to find names that exist on the right side alone.
    for name in dright.keys():
        if dleft.get(name, None) is None:
            r.append(name)
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    `mleft` (the manifest of the revision being exported) is compared via
    outer_set() against the manifest of every entry of `parents`; negative
    parent numbers are skipped.  `revision` itself is not used here.

    Returns three lists (l, c, r): names only present in `mleft`, names
    present on both sides with differing ids, and names only present in a
    parent manifest.  The results of all parents are accumulated into the
    same lists, so a name can appear more than once.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        # outer_set() works on the manifests directly; the sorted key
        # lists that used to be built here were never read.
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage, committer, authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a patch, so
    this tries to find the author in the log message.

    Trailing empty lines of the message are skipped.  Starting at the last
    non-empty line, the consecutive run of Signed-off-by lines is walked
    upwards and the topmost line of that run is used: its name part is
    passed through fixup_user() together with `authors` and the result is
    returned.

    If the last non-empty line is not a Signed-off-by line (or the message
    has no non-empty line), `committer` is returned.

    Only the tail of the message is inspected on purpose: a line in the
    middle of a message that accidentally starts with "Signed-off-by: foo"
    would match the detection regex but must not be taken as authorship.
    """
    lines = logmessage.split('\n')

    # find the index of the last non-empty line (-1 if there is none)
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb over the block of Signed-off-by lines, remembering the topmost
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write the contents of `files` as inline 'M' commands.

    `files` is sorted in place.  For each name the data is read from
    ctx.filectx(name), then an 'M <mode> inline <name>' line, a
    'data <length>' line and the data itself are written with wr().  The
    mode comes from gitmode() applied to manifest.execf(name).

    A progress line goes to stderr after every cfg_export_boundary files,
    and once more at the end when there were more files than that.
    """
    files.sort()
    total = len(files)
    done = 0
    for name in files:
        content = ctx.filectx(name).data()
        mode = gitmode(manifest.execf(name))
        wr('M %s inline %s' % (mode, name))
        # the length is taken from the data itself; the original author
        # noted trouble with size()
        wr('data %d' % len(content))
        wr(content)
        done += 1
        if done % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (done, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (done, total))
def is_merge(parents):
    """Return True if more than one entry of `parents` is >= 0.

    Negative parent numbers stand for a nonexistent parent and are not
    counted.
    """
    real_parents = [parent for parent in parents if parent >= 0]
    return len(real_parents) > 1
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob):
    """Write one hg revision as a 'commit' command and its file changes.

    Parameters:
      ui, repo  -- handed to get_changeset(); `repo` is also queried for
                   parents, manifests and status
      revision  -- hg revision number; it is written as mark revision+1
      marks     -- dict consulted by get_parent_mark(); the mark of this
                   revision is recorded in it
      heads     -- dict branch -> cached head reference; a non-empty entry
                   is used once as 'from' and then blanked
      last      -- dict branch -> revision last written for that branch
      max       -- only used in the progress message
      count     -- running command counter, passed to checkpoint()
      authors   -- author table passed to get_changeset()/get_author()
      sob       -- if true, an 'author' line is derived via get_author()

    Returns the updated command counter from checkpoint().
    """
    (revnode,_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision,authors)
    parents=repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc,user,authors),time,timezone))
    wr('committer %s %d %s' % (user,time,timezone))
    # wr() appends a newline to the message; the announced length is one
    # larger than the message, presumably to cover that newline
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    src=heads.get(branch,'')
    link=''
    if src!='':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch]=''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch,src))
        link=src # avoid making a merge commit for incremental import
    elif branch not in heads and revision>0 and parents[0]>=0:
        # newly created branch and not the first one: connect to parent.
        # A negative first parent means there is nothing to connect to, so
        # no 'from' line is written in that case.
        tmp=get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch,tmp))
        link=tmp # avoid making a merge commit for branch fork

    if parents:
        l=last.get(branch,revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p==l or p==revision or p<0:
                continue
            tmp=get_parent_mark(p,marks)
            # if we fork off a branch, don't merge with our parent via
            # 'merge' as we have 'from' already above
            if tmp==link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch,tmp,p))
            wr('merge %s' % tmp)

    last[branch]=revision
    heads[branch]=''
    # we need this later to write out tags.  The entry is keyed by the mark
    # number (revision+1) because that is the key get_parent_mark() looks
    # up; keying by the revision number made a lookup of revision r return
    # the mark of revision r+1.
    marks[str(revision+1)]=':%d'%(revision+1)

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,changed,removed,kind=[],[],[],''

    if revision==0:
        # first revision: feed in full manifest
        added=man.keys()
        kind='full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added,changed,removed=get_filechanges(repo,revision,parents,man)
        kind='thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f=repo.status(repo.lookup(parents[0]),revnode)[:3]
        # element 0 is used as changed, 1 as added, 2 as removed
        added,changed,removed=f[1],f[0],f[2]
        kind='simple delta'

    sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (kind,revision+1,max,len(added),len(changed),len(removed)))

    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx,man,added+changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for the tags in range.

    All entries of repo.tagslist() are visited.  The 'tip' tag is skipped,
    as is every tag whose revision is outside start <= rev < end.  For the
    others the reference is resolved with get_parent_mark() and a 'reset'
    plus 'from' pair is written; each written tag goes through
    checkpoint().

    `ui` and `authors` are not used.  Returns the updated command counter.
    """
    for tag, tagnode in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map of 'key = value' lines from `filename`.

    Returns a dict mapping the stripped text before the first '=' to the
    stripped text after it.  A missing file gives an empty dict.  Blank
    lines are skipped; any other line that does not have the 'key = value'
    form is reported on stderr with its line number and ignored.  The
    number of entries loaded is reported on stderr as well.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0
    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            if not line.strip():
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store key -> value with surrounding whitespace removed
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails half way through
        f.close()
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui,repo,cache,force):
    """Check the git branch heads against `cache` and hg's head layout.

    Two checks are made:
      1. For every branch from repo.branchtags() (name mapped through
         get_branch()), the first line of $GIT_DIR/refs/heads/<branch> is
         compared with the entry for that branch in `cache`.  Branches
         without a readable ref file or without a cache entry are skipped.
      2. No two entries of repo.heads() may report the same branch.

    Each problem is reported on stderr.  Returns False at the first problem
    unless `force` is true; with `force` (or without problems) True is
    returned.
    """
    def getsha1(branch):
        # Returns the first line of the branch's ref file without its
        # newline ('' for an empty file), or None if it cannot be read.
        try:
            f=open(os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch)
            try:
                return f.readline().split('\n')[0]
            finally:
                f.close()
        except IOError:
            return None

    branches=repo.branchtags()
    # sort by descending revision number of the branch's node
    l=[(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # get list of hg's branches to verify, don't take all git has
    for _,_,b in l:
        b=get_branch(b)
        sha1=getsha1(b)
        c=cache.get(b)
        if sha1 is not None and c is not None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1!=c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
            if not force: return False

    # verify that branch has exactly one head
    t={}
    for h in repo.heads():
        (_,_,_,_,_,_,branch,_)=get_changeset(ui,repo,h)
        if t.get(branch,False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force: return False
        t[branch]=True

    return True
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors=None,sob=False,force=False):
    """Export the not yet exported revisions and tags of an hg repository.

    Parameters:
      repourl    -- repository location, handed to setup_repo()
      m          -- upper revision limit (exclusive); a negative value or
                    a value beyond the repository's revision count means
                    "up to the tip"
      marksfile, headsfile, tipfile
                 -- cache files read with load_cache(); the state cache is
                    written back to `tipfile` with save_cache()
      authors    -- author table passed on to the export routines
                    (default: empty table)
      sob        -- passed to export_commit() to enable Signed-off-by
                    parsing
      force      -- passed to verify_heads(): continue despite
                    verification errors

    The export starts at the 'tip' value of the state cache (0 if unset).
    Returns 1 if verify_heads() fails, 0 otherwise.
    """
    if authors is None:
        authors={}
    _max=int(m)

    marks_cache=load_cache(marksfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache,force):
        return 1

    tip=repo.changelog.count()

    first=int(state_cache.get('tip',0))
    limit=_max
    if _max<0 or _max>tip:
        # no limit requested, or one beyond what the repository has
        limit=tip

    c=0
    last={}
    for rev in range(first,limit):
        c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,limit,c,authors,sob)

    c=export_tags(ui,repo,marks_cache,first,limit,c,authors)

    sys.stderr.write('Issued %d commands\n' % c)

    # remember where to continue on the next run
    state_cache['tip']=limit
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)

    return 0
if __name__=='__main__':
    # Command line entry point: parse the options, make sure the mandatory
    # ones are present, load the optional author map and run the export.
    def bail(parser,opt):
        # report a missing mandatory option, show the usage, exit with 2
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser=OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")
    parser.add_option("-f","--force",action="store_true",dest="force",
        default=False,help="Ignore validation errors by force")

    (options,args)=parser.parse_args()

    # -1 tells hg2git() to export up to the tip
    m=-1
    if options.max is not None: m=options.max

    # each mandatory option is checked against its own value
    required=(
        (options.marksfile,'--marks'),
        (options.headsfile,'--heads'),
        (options.statusfile,'--status'),
        (options.repourl,'--repo'),
    )
    for value,flag in required:
        if value is None: bail(parser,flag)

    a={}
    if options.authorfile is not None:
        a=load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile,
        options.statusfile,authors=a,sob=options.sob,force=options.force))