Use MIT license, adjust hg2git script names to match fast-export repo style
[fast-export/barak.git] / hg-fast-export.py
blob b876ce46ca635b38c9c4451c5d40fbbf619dd33d
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 """hg-fast-export.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg-fast-export.py <hg repo url> <marks file> <heads file> <tip file>
8 """
10 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
11 from tempfile import mkstemp
12 from optparse import OptionParser
13 import re
14 import sys
15 import os
# Matches a whole "Signed-off-by:" line of a log message; group 1 is
# everything after the colon and space.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Matches a user field ending in 'Name <mail>'; group 1 is the name,
# group 2 the address including its angle brackets.
user_re = re.compile(r'([^<]+) (<[^>]+>)$')
# Matches a name wrapped in one pair of double quotes; group 1 is the
# name without the quotes.
user_clean_re = re.compile(r'^["]([^"]+)["]$')
# git branch used in place of hg's default 'HEAD' branch
cfg_master = 'master'
# emit a 'checkpoint' command every this many commits; 0 disables it
cfg_checkpoint_count = 0
# write a progress message each time this many file contents were written
cfg_export_boundary = 1000
def usage(ret):
    """Write the module docstring (the usage text) to stderr.

    Returns `ret` unchanged.
    """
    err = sys.stderr
    err.write(__doc__)
    return ret
def setup_repo(url):
    """Open the mercurial repository at `url`.

    Returns a `(ui, repository)` pair; the ui object is created here and
    is the one handed to `hg.repository`.
    """
    myui = ui.ui()
    repository = hg.repository(myui, url)
    return myui, repository
def fixup_user(user, authors):
    """Turn an hg user string into a 'Name <mail>' string.

    user    -- user field as stored in the changeset
    authors -- optional mapping of hg user strings to replacements;
               when given, `user` is looked up in it first and kept
               as-is if there is no entry

    Returns the name and the bracketed address joined by one space.
    """
    if authors is not None:
        # map through the authors table, defaulting to the current value
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        # already in 'Name <mail>' syntax
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        # bare address: use it both as name and as mail
        name, mail = user, '<%s>' % user
    else:
        # plain name without any address
        name, mail = user, '<devnull@localhost>'

    # drop double quotes wrapped around the whole name
    unquoted = user_clean_re.match(name)
    if unquoted is not None:
        name = unquoted.group(1)
    return '%s %s' % (name, mail)
def get_branch(name):
    """Map an hg branch name to a git branch name.

    'HEAD' becomes the configured `cfg_master`; any other name is
    returned unchanged.
    """
    if name != 'HEAD':
        return name
    return cfg_master
def get_changeset(ui, repo, revision, authors={}):
    """Read one changeset and return its data in git-friendly form.

    ui       -- unused here; kept for the callers' signature
    repo     -- mercurial repository object
    revision -- revision identifier accepted by `repo.lookup`
    authors  -- optional author mapping passed on to `fixup_user`
                (only read, never modified)

    Returns the tuple
    `(node, manifest, user, (time, tz), files, desc, branch, extra)`
    where `user` went through `fixup_user`, `tz` is a '+HHMM'/'-HHMM'
    string and `branch` went through `get_branch`.
    """
    binnode = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(binnode)

    # The offset is negated as before (hg and git use opposite signs).
    # Sign and magnitude are formatted separately: floor division and
    # modulo on a negative offset gave e.g. '-0430' instead of '-0330'
    # for an hg timezone of 12600 seconds.
    offset = -timezone
    if offset < 0:
        sign = '-'
    else:
        sign = '+'
    minutes = abs(offset) // 60
    tz = '%s%02d%02d' % (sign, minutes // 60, minutes % 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (binnode, manifest, fixup_user(user, authors), (time, tz),
            files, desc, branch, extra)
def gitmode(x):
    """Return the git file mode string for a file.

    A truthy `x` yields '100755', a falsy one '100644'.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write `msg` followed by a newline to stdout.

    Called without an argument it writes an empty line.
    """
    print(msg)
    # debugging aid, disabled: echo every written line to stderr
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Count one more issued command and emit a checkpoint when due.

    `count` is incremented by one. If `cfg_checkpoint_count` is greater
    than zero and the new count is a multiple of it, a 'checkpoint'
    command plus an empty line is written to stdout and a note goes to
    stderr. Returns the incremented count.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def get_parent_mark(parent, marks):
    """Return the reference to use for parent revision `parent`.

    The mark id of a revision is its number plus one. If `marks` has an
    entry under that id (as a string), the entry is returned; otherwise
    the ':<id>' mark syntax is returned.
    """
    mark_id = parent + 1
    fallback = ':%d' % mark_id
    return marks.get(str(mark_id), fallback)
def mismatch(f1, f2):
    """Return True if the two file revisions differ.

    Both arguments are compared by their `node.hex` representation.
    """
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def outer_set(dleft, dright, l, c, r):
    """Sort the files of two manifests into added/changed/removed.

    dleft  -- our manifest (file name -> file revision)
    dright -- the parent's manifest
    l      -- list extended with files only we have
    c      -- list extended with files both have but whose revisions
              differ according to `mismatch`
    r      -- list extended with files only the parent has

    The three lists are modified in place and also returned.
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # present here, absent in the parent
            l.append(name)
        elif mismatch(dleft[name], theirs):
            # present on both sides with different checksums
            c.append(name)

    # Changes were already found above, so only look for files the
    # parent has and we lack.
    r.extend([name for name in dright.keys()
              if dleft.get(name, None) is None])
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Find added, changed and removed files relative to all parents.

    repo     -- mercurial repository object
    revision -- unused here; kept for the callers' signature
    parents  -- parent revision numbers; negative entries are skipped
    mleft    -- manifest of the revision being exported

    For every valid parent its manifest is compared against `mleft` via
    `outer_set`; results of all parents accumulate in the same three
    lists. Returns `(added, changed, removed)`.
    """
    added, changed, removed = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        # The original also built sorted key lists of both manifests
        # here but never used them; that work is dropped.
        added, changed, removed = outer_set(mleft, mright,
                                            added, changed, removed)
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a commit from its Signed-off-by lines.

    git distinguishes between author and committer, so the author is
    taken from the log message where possible: trailing empty lines are
    skipped, then the unbroken run of Signed-off-by lines at the end of
    the message is walked upwards and the topmost line of that run is
    used. Its value goes through `fixup_user` with `authors`.

    Only the trailing run counts; a line in the middle of the message
    that happens to start with "Signed-off-by: " is ignored on purpose.

    Returns `committer` if the message does not end in such a run.
    """
    lines = logmessage.split('\n')

    # skip empty lines at the tail
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through consecutive sob lines, remembering the topmost one
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command with data for every file in `files`.

    ctx      -- change context used to obtain each file's data
    manifest -- manifest queried via `execf` for the executable flag
    files    -- list of file names; sorted in place before exporting

    A progress line goes to stderr every `cfg_export_boundary` files,
    plus a final one if there were more files than that boundary.
    """
    files.sort()
    total = len(files)
    written = 0
    for path in files:
        content = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # length of the data actually read, not the file context's size()
        wr('data %d' % len(content))
        wr(content)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def is_merge(parents):
    """Return True if more than one entry of `parents` is a real parent.

    Entries below zero stand for nonexistent parents and are not
    counted. (The counter used before was never initialised, which
    raised UnboundLocalError on the first valid parent.)
    """
    return len([p for p in parents if p >= 0]) > 1
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors, sob):
    """Write one hg revision as a fast-import 'commit' to stdout.

    ui, repo -- mercurial ui and repository objects
    revision -- revision number to export; its mark id is revision+1
    marks    -- mark id (string) -> reference; updated for this commit
    heads    -- branch -> cached head from a previous run; the entry of
                the commit's branch is set to '' here
    last     -- branch -> revision exported last on it; updated
    max      -- upper revision bound, used for the progress message only
    count    -- number of commands issued so far
    authors  -- author mapping handed to `get_changeset`/`get_author`
    sob      -- if true, write an 'author' line derived from
                Signed-off-by lines

    Returns the new command count as reported by `checkpoint`.
    """
    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to the message, hence the extra byte
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # if we have a cached head, this is an incremental import: initialize it
        # and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, src))
        link = src  # avoid making a merge commit for incremental import
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        l = last.get(branch, revision)
        for p in parents:
            # 1) this commit implicitly is the child of the most recent
            #    commit of this branch, so ignore that parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p == l or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # if we fork off a branch, don't merge with our parent via 'merge'
            # as we have 'from' already above
            if tmp == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # We need this later to write out tags. The key is the mark id
    # (revision+1), matching get_parent_mark() and the marks file. It
    # used to be str(revision), which made a lookup of revision-1
    # return this commit's mark instead.
    marks[str(revision + 1)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = man.keys()
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        # (status entry 0 is used as 'changed', 1 as 'added', 2 as 'removed')
        changed, added, removed = repo.status(repo.lookup(parents[0]), revnode)[:3]
        kind = 'simple delta'

    sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (kind, revision + 1, max, len(added), len(changed), len(removed)))

    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added + changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every tag in range.

    ui, authors -- unused here; kept for the callers' signature
    repo        -- mercurial repository object providing `tagslist()`
    marks_cache -- marks mapping handed to `get_parent_mark`
    start, end  -- only tags on revisions with start <= rev < end are
                   exported; the 'tip' tag is always skipped
    count       -- number of commands issued so far

    Returns the command count after passing through `checkpoint` once
    per exported tag.
    """
    for tag, tagnode in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            # NOTE(review): get_parent_mark falls back to ':<id>', so this
            # only triggers if the marks mapping stores None -- confirm
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map of 'key = value' lines from `filename`.

    Returns a dict mapping the stripped key to the stripped value. A
    missing file yields an empty dict. Lines that do not match the
    format are reported on stderr with their line number and skipped.
    The number of loaded mappings is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        # the line counter was never initialised before (UnboundLocalError)
        lineno = 0
        for line in f:
            lineno += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails
        f.close()
    # report mappings actually loaded rather than lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Read a cache file of ':key value' lines into a dict.

    Each valid line consists of exactly two space-separated fields, the
    first starting with ':'. The key is stored without the colon, the
    value without its trailing newline. A missing file yields an empty
    dict; invalid lines are reported on stderr and skipped.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    f = open(filename, 'r')
    try:
        # the line counter was never initialised before (UnboundLocalError)
        lineno = 0
        for line in f:
            lineno += 1
            fields = line.split(' ')
            # startswith() also copes with an empty first field, where
            # indexing fields[0][0] raised IndexError
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails
        f.close()
    return cache
def save_cache(filename, cache):
    """Write `cache` to `filename` as ':key value' lines.

    The file is created or truncated. Keys and values are converted
    with `str()`. The file is closed even if a write fails.
    """
    f = open(filename, 'w+')
    try:
        # a plain loop instead of map() used only for its side effects
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        f.close()
def verify_heads(ui, repo, cache, force):
    """Check cached branch heads against git and hg's heads per branch.

    ui, repo -- mercurial ui and repository objects
    cache    -- branch -> SHA1 recorded by the previous run
    force    -- if true, errors are reported but do not abort

    Two checks are made: for every hg branch, the SHA1 found under
    $GIT_DIR/refs/heads/ must equal the cached one (when both exist),
    and no branch may appear on more than one of the repository's
    heads. Returns False on the first error unless `force` is set,
    True otherwise.
    """
    def getsha1(branch):
        # first line of the branch's ref file, or None if it can't be read
        ref_path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        try:
            f = open(ref_path)
            sha1 = f.readlines()[0].split('\n')[0]
            f.close()
            return sha1
        except IOError:
            return None

    branches = repo.branchtags()
    ordered = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    ordered.sort()

    # get list of hg's branches to verify, don't take all git has
    for entry in ordered:
        b = get_branch(entry[2])
        sha1 = getsha1(b)
        cached = cache.get(b)
        if sha1 is None or cached is None:
            continue
        sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = {}
    for h in repo.heads():
        branch = get_changeset(ui, repo, h)[6]
        if seen.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen[branch] = True

    return True
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors={}, sob=False, force=False):
    """Export an hg repository as a fast-import stream on stdout.

    repourl   -- URL/path of the hg repository
    m         -- maximum revision to import; negative means up to tip
    marksfile -- file with marks of a previous run
    headsfile -- file with the git heads of a previous run
    tipfile   -- state file; read for the first revision to export and
                 rewritten with the new 'tip' and 'repo' values
    authors   -- author mapping (only read, never modified)
    sob       -- derive authors from Signed-off-by lines
    force     -- continue despite verification errors

    Returns 1 if `verify_heads` fails, 0 otherwise.
    """
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    tip = repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    # export up to tip by default; also clamp a limit beyond the tip so
    # no revision that doesn't exist is looked up
    stop = _max
    if _max < 0 or _max > tip:
        stop = tip

    # the command counter was used without being initialised before
    c = 0
    last = {}
    for rev in range(first, stop):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last, stop, c, authors, sob)

    c = export_tags(ui, repo, marks_cache, first, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
def main(argv=None):
    """Parse the command line and run the export.

    argv -- list of arguments without the program name; None makes
            optparse use sys.argv[1:]

    Exits with status 2 if one of --marks, --heads, --status or --repo
    is missing. Returns the result of `hg2git` otherwise.
    """
    def bail(parser, opt):
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args(argv)

    m = -1
    if options.max is not None:
        m = options.max

    # Each required option is checked against its own value; all four
    # checks used to test options.marksfile.
    if options.marksfile is None:
        bail(parser, '--marks')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    return hg2git(options.repourl, m, options.marksfile, options.headsfile,
                  options.statusfile, authors=a, sob=options.sob,
                  force=options.force)


if __name__ == '__main__':
    sys.exit(main())