Use svn_repos_open() for svn-archive.c, too
[fast-export/dharding.git] / hg-fast-export.py
blobcbd295bff7e80ba1e994b629b30b64b9447db324
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from tempfile import mkstemp
8 from optparse import OptionParser
9 import re
10 import sys
11 import os
# --- configuration -------------------------------------------------------
# name of the git branch that stands in for hg's 'HEAD' branch
cfg_master = 'master'
# emit a 'checkpoint' command every this many commits; 0 disables it
cfg_checkpoint_count = 0
# report progress each time this many file contents have been written
cfg_export_boundary = 1000

# --- patterns ------------------------------------------------------------
# a whole "Signed-off-by: ..." line; group 1 is everything after the colon
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
# a user field ending in 'Name <mail>'; group 1 is the name, group 2 the
# address including its angle brackets
user_re = re.compile('([^<]+) (<[^>]+>)$')
# a user name wrapped in double quotes; group 1 is the name without them
user_clean_re = re.compile('^["]([^"]+)["]$')
def setup_repo(url):
    """Open the repository found at `url`.

    A fresh mercurial ui object is created and used to open the
    repository; both are handed back as a (ui, repository) pair."""
    repo_ui = ui.ui()
    repository = hg.repository(repo_ui, url)
    return repo_ui, repository
def fixup_user(user, authors):
    """Turn an hg user field into a 'Name <mail>' string.

    If `authors` is not None it is consulted first; a user without an
    entry keeps its current value.  The result is then built as follows:

    * 'Name <mail>' input is split into its two parts as-is,
    * input without that syntax but containing '@' becomes 'user <user>',
    * anything else becomes 'user <devnull@localhost>'.

    Finally, double quotes wrapped around the name part are removed."""
    if authors is not None:
        # map through the authors table, defaulting to the unmapped value
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        name, mail = user, '<%s>' % user
    else:
        name, mail = user, '<devnull@localhost>'

    # strip quoting such as "John Doe" from the name
    quoted = user_clean_re.match(name)
    if quoted is not None:
        name = quoted.group(1)
    return '%s %s' % (name, mail)
def get_branch(name):
    """Translate an hg branch name into the git branch name to use.

    'HEAD' is replaced by the configured cfg_master; every other name is
    returned unchanged."""
    if name != 'HEAD':
        return name
    return cfg_master
def get_changeset(ui, repo, revision, authors={}):
    """Read one changeset and return its fields in export-ready form.

    Returns the tuple
    (node, manifest, user, (time, tz), files, desc, branch, extra) where
    `user` has been passed through fixup_user(), `tz` is a '+HHMM' /
    '-HHMM' string and `branch` has been passed through get_branch()
    (changesets without a 'branch' entry in `extra` count as 'master').

    `authors` is only read, never modified, so the shared default dict
    is safe here."""
    node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = repo.changelog.read(node)

    # The changelog offset is negated before formatting (as this code has
    # always done).  Sign and magnitude are handled separately: flooring a
    # negative offset would turn e.g. -03:30 into "-0430", and an offset
    # between -1h and 0 would lose its minus sign.
    offset = -timezone
    if offset < 0:
        sign = '-'
    else:
        sign = '+'
    hours, seconds = divmod(abs(offset), 3600)
    tz = '%s%02d%02d' % (sign, hours, seconds // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (node, manifest, fixup_user(user, authors), (time, tz), files, desc, branch, extra)
def gitmode(x):
    """Return the git file mode string for a file.

    A truthy `x` (executable) yields '100755', anything else '100644'."""
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write `msg` followed by a newline to stdout.

    Called without an argument this emits an empty line.  The value is
    formatted with %s, so non-string arguments are converted the same
    way the print statement used to convert them."""
    # sys.stdout.write instead of the print statement: the statement form
    # is a syntax error on Python 3, this works on both.
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Account for one more issued command and checkpoint if it is time.

    `count` is incremented; when cfg_checkpoint_count is positive and
    divides the new count, a note goes to stderr and a 'checkpoint'
    command plus an empty line are written via wr().  Returns the
    incremented count."""
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def get_parent_mark(parent, marks):
    """Return the reference to use for revision `parent`.

    `marks` is keyed by the mark number as a string, which is the
    revision number plus one.  If there is an entry its value is
    returned; otherwise the ':<mark number>' syntax is returned."""
    mark_number = parent + 1
    fallback = ':%d' % mark_number
    return marks.get(str(mark_number), fallback)
def mismatch(f1, f2):
    """Tell whether two file revisions differ.

    Both arguments are converted with node.hex() and the results are
    compared; True means they are not equal."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return not (left_hex == right_hex)
def outer_set(dleft, dright, l, c, r):
    """Compare two file mappings and sort the differing names into lists.

    `dleft` is our mapping, `dright` the parent's.  Names are appended to
    the passed-in lists, which are also returned as (l, c, r):

    * l: names only present in `dleft`,
    * c: names present in both whose entries mismatch(),
    * r: names only present in `dright`."""
    for name, ours in dleft.items():
        theirs = dright.get(name)
        if theirs is None:
            # we have the file, the parent does not
            l.append(name)
        elif mismatch(ours, theirs):
            # both have it but the entries differ
            c.append(name)

    # changed files were handled above; only look for names we lack here
    for name in dright:
        if dleft.get(name) is None:
            r.append(name)
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Find the files that differ between a revision and its parents.

    `mleft` is the manifest of the revision itself.  For every parent in
    `parents` that is not negative, the parent's manifest is fetched and
    compared against `mleft` with outer_set().  The results of all
    comparisons accumulate in the same three lists, returned as
    (added, changed, removed).  `revision` is accepted for interface
    compatibility and not used here."""
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            # negative revision numbers denote a missing parent
            continue
        mright = repo.changectx(p).manifest()
        # outer_set() works on the manifests directly, so no sorted key
        # lists need to be built beforehand
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from its Signed-off-by lines.

    git distinguishes between the author and the committer of a patch,
    so the author is looked for in the log message:

    Trailing empty lines are skipped.  Starting at the last non-empty
    line, the unbroken run of Signed-off-by lines is followed upwards;
    the topmost line of that run names the author, which is cleaned up
    with fixup_user() (authors table, quoting, etc.).

    If the last non-empty line is not a Signed-off-by line, `committer`
    is returned.  Only the trailing run is considered on purpose: a line
    in the middle of the message may accidentally start with
    "Signed-off-by: foo" and must not be mistaken for the real thing."""
    lines = logmessage.split('\n')

    # locate the last line that is not blank (index -1 if there is none)
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the consecutive sob lines, remembering the topmost
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command with contents for each file in `files`.

    `files` is sorted in place first.  For every name the data is read
    from `ctx`, the mode is derived from manifest.execf() via gitmode(),
    and the 'M ... inline' line, a 'data <length>' line and the data
    itself are written with wr().  A progress line goes to stderr every
    cfg_export_boundary files, plus a final one if there were more than
    cfg_export_boundary files in total."""
    files.sort()
    total = len(files)
    written = 0
    for path in files:
        content = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # length is taken from the data itself rather than from size()
        wr('data %d' % len(content))
        wr(content)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def is_merge(parents):
    """Return True if more than one entry of `parents` is a real parent.

    Negative revision numbers denote a missing parent and do not count."""
    # count from an explicit zero; the counter must exist before it is
    # incremented
    c = 0
    for parent in parents:
        if parent >= 0:
            c += 1
    return c > 1
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors, sob):
    """Write the fast-import commands for one hg revision.

    Emits the 'commit' header (mark, optional author, committer, log
    message), then 'from'/'merge' lines linking the commit to its
    parents, then 'D' lines for removed files and inline 'M' entries for
    added and changed ones.

    marks   -- mark number (as string) -> reference; updated with this
               revision's mark
    heads   -- branch -> cached head from a previous run; the entry of
               the commit's branch is set to '' once used
    last    -- branch -> most recent revision exported in this run;
               updated
    max     -- upper end of the import range, only used for progress
    count   -- number of commands issued so far
    authors -- authors table handed to get_changeset()/get_author()
    sob     -- if true, emit an 'author' line derived from
               Signed-off-by lines

    Returns the command count as updated by checkpoint()."""
    (revnode, _, user, (time, timezone), files, desc, branch, _) = get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, so the announced length is one more
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, src))
        link = src  # avoid making a merge commit for incremental import
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        l = last.get(branch, revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p == l or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # if we fork off a branch, don't merge with our parent via
            # 'merge' as we have 'from' already above
            if tmp == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags.  The key is the mark number
    # (revision + 1) because that is what get_parent_mark() looks up;
    # keying by the bare revision would shadow the previous revision's
    # entry with this revision's mark.
    marks[str(revision + 1)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = list(man.keys())
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (kind, revision + 1, max, len(added), len(changed), len(removed)))

    # a plain loop: map() must not be relied on for side effects
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added + changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for each tag in range.

    The 'tip' tag is skipped, as are tags whose revision lies outside
    start <= rev < end.  The reference for each tag comes from
    get_parent_mark().  Every written tag goes through checkpoint();
    the resulting command count is returned."""
    for tag, tagnode in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an authors table from `filename`.

    Each line has the form 'key = value'; surrounding whitespace is
    stripped from both parts.  Lines that do not fit are reported on
    stderr with their line number and skipped.  A missing file yields
    an empty table.

    Returns a dict mapping key to value."""
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        # 1-based number of the line being parsed, for error messages
        l = 0
        for line in f:
            l += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # put key:value in cache, both stripped of whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if parsing blows up
        f.close()
    # report the entries actually loaded, not the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Read a ':key value' cache file as written by save_cache().

    Every line must consist of exactly two space-separated fields, the
    first starting with ':'.  Other lines are reported on stderr with
    their line number and skipped.  A missing file yields an empty
    cache.

    Returns a dict mapping key (without the leading ':') to value
    (without the trailing newline)."""
    cache = {}
    if not os.path.exists(filename):
        return cache
    f = open(filename, 'r')
    try:
        # 1-based number of the line being parsed, for error messages
        l = 0
        for line in f:
            l += 1
            fields = line.split(' ')
            # startswith() instead of fields[0][0]: the first field is
            # empty when a line begins with a space
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # put key:value in cache, key without ^:
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if parsing blows up
        f.close()
    return cache
def save_cache(filename, cache):
    """Write `cache` to `filename`, one ':key value' line per entry.

    Keys and values are converted with str().  An existing file is
    overwritten."""
    f = open(filename, 'w+')
    try:
        # explicit loop: map() is lazy on Python 3 and would write nothing
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        # close (and thereby flush) the file even if a write fails
        f.close()
def verify_heads(ui, repo, cache, force):
    """Check that the git branch heads still match the cached ones.

    For every hg branch, the content of $GIT_DIR/refs/heads/<branch> is
    compared with the value recorded in `cache`; branches missing on
    either side are not compared.  Afterwards, repo.heads() is scanned
    for branches that have more than one head.

    Each problem is reported on stderr.  Returns False at the first
    problem unless `force` is true; returns True otherwise."""
    def getsha1(branch):
        """Return the first line of the branch's ref file, or None if
        the file cannot be read."""
        try:
            f = open(os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch)
            try:
                # readline() gives '' for an empty file where indexing
                # readlines() would raise IndexError
                return f.readline().split('\n')[0]
            finally:
                # do not leak the handle if reading fails
                f.close()
        except IOError:
            return None

    branches = repo.branchtags()
    # order the branches by descending revision number of their node
    l = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # get list of hg's branches to verify, don't take all git has
    for _, _, b in l:
        b = get_branch(b)
        sha1 = getsha1(b)
        c = cache.get(b)
        if sha1 is not None and c is not None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
            if sha1 != c:
                sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                                 '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
                if not force:
                    return False

    # verify that branch has exactly one head
    t = {}
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if t.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        t[branch] = True

    return True
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors={}, sob=False, force=False):
    """Export the hg repository at `repourl` as a fast-import stream.

    repourl   -- repository to export
    m         -- upper end (exclusive) of the revision range; a negative
                 value means "up to the repository's tip"
    marksfile -- file with the marks of a previous run
    headsfile -- file with the branch heads of a previous run
    tipfile   -- state file; its 'tip' entry is where the export
                 resumes, and it is rewritten at the end
    authors   -- authors table (only read, never modified)
    sob       -- derive authors from Signed-off-by lines
    force     -- carry on despite failed head verification

    Returns 1 if head verification fails, 0 on success."""
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    tip = repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    limit = _max
    # no limit given, or a limit beyond what the repository holds
    if _max < 0 or _max > tip:
        limit = tip

    # number of commands issued so far
    c = 0
    last = {}
    for rev in range(first, limit):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last, limit, c, authors, sob)

    c = export_tags(ui, repo, marks_cache, first, limit, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    # remember where to resume next time
    state_cache['tip'] = limit
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report the missing option `opt`, print the usage text and
        exit with status 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # -1 tells hg2git() to export up to the tip
    m = -1
    if options.max is not None:
        m = options.max

    # all four are mandatory; each check must look at its own option
    if options.marksfile is None:
        bail(parser, '--marks')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force))