Mark repo as deprecated, point to fast-export.git
[hg2git.git] / hg2git.py
blob36419cb09b35372571480c453d0739e51dfbdc81
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: GPLv2
6 # This file is deprecated after merging it into:
7 # http://repo.or.cz/w/fast-export.git
"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py -r <hg repo url> --marks <marks file> --heads <heads file>
       --status <tip file> [-m <max revision>] [-s] [-A <authors file>] [-f]
"""
13 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
14 from tempfile import mkstemp
15 from optparse import OptionParser
16 import re
17 import sys
18 import os
# Matches a 'Signed-off-by: <who>' line of a log message; group 1 is
# everything after the colon and the single space.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Matches a user field ending in 'Name <mail>'; group 1 is the name and
# group 2 the address including its angle brackets.
user_re = re.compile(r'([^<]+) (<[^>]+>)$')
# Matches a user name that is wrapped in double quotes; group 1 is the
# name without the quotes.
user_clean_re = re.compile(r'^["]([^"]+)["]$')

# git branch name used for hg's default 'HEAD' branch
cfg_master = 'master'
# emit a 'checkpoint' command every this many commits; 0 disables it
cfg_checkpoint_count = 0
# write a progress message every this many exported file contents
cfg_export_boundary = 1000
def usage(ret):
    """Write this module's docstring to stderr.

    ret is handed back to the caller unchanged.
    """
    help_text = __doc__
    sys.stderr.write(help_text)
    return ret
def setup_repo(url):
    """Open the hg repository found at ``url``.

    Returns a ``(ui, repository)`` pair; the ui object is a newly created
    ``ui.ui()`` instance, the same one the repository was opened with.
    """
    interface = ui.ui()
    repository = hg.repository(interface, url)
    return interface, repository
def fixup_user(user, authors):
    """Turn an hg user field into a 'Name <mail>' string.

    user    -- the raw user string.
    authors -- optional mapping of raw user strings to replacements; when
               it is not None, ``user`` is replaced by its entry (if any)
               before anything else happens.

    Rules applied to the (possibly mapped) user string:
      * it already ends in 'Name <mail>': name and mail are taken as is;
      * otherwise the whole string becomes the name, and the mail is
        '<user>' when the string contains an '@' and
        '<devnull@localhost>' when it does not.
    Finally, a name wrapped in double quotes has the quotes removed.
    """
    if authors is not None:
        # fall back to the unmapped value when there is no table entry
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        name = parsed.group(1)
        mail = parsed.group(2)
    elif '@' in user:
        name = user
        mail = '<%s>' % user
    else:
        name = user
        mail = '<devnull@localhost>'

    unquoted = user_clean_re.match(name)
    if unquoted is not None:
        name = unquoted.group(1)
    return '%s %s' % (name, mail)
def get_branch(name):
    """Return the git branch name for hg branch ``name``.

    'HEAD' is replaced by the configured ``cfg_master``; every other name
    is returned unchanged.
    """
    if name != 'HEAD':
        return name
    return cfg_master
def get_changeset(ui, repo, revision, authors={}):
    """Read one changeset and return its data in export-ready form.

    ui       -- unused here.
    repo     -- repository object; ``repo.lookup`` and
                ``repo.changelog.read`` are called on it.
    revision -- revision identifier accepted by ``repo.lookup``.
    authors  -- author mapping passed on to fixup_user(); the default
                dict is only read, never modified.

    Returns the tuple
    ``(node, manifest, user, (time, tz), files, desc, branch, extra)``
    where ``user`` went through fixup_user(), ``branch`` through
    get_branch() (defaulting to 'master' when the changeset's extra data
    names no branch) and ``tz`` is a '+HHMM'/'-HHMM' string.
    """
    revnode = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(revnode)

    # The stored offset is negated before formatting (presumably hg counts
    # seconds west of UTC while git wants east -- TODO confirm).  Sign and
    # magnitude are handled separately: floor division/modulo on a negative
    # offset would round the hour away from zero, e.g. -03:30 -> '-0430'.
    offset = -timezone
    if offset < 0:
        sign = '-'
    else:
        sign = '+'
    hours, rest = divmod(abs(offset), 3600)
    tz = '%s%02d%02d' % (sign, hours, rest // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (revnode, manifest, fixup_user(user, authors), (time, tz),
            files, desc, branch, extra)
def gitmode(x):
    """Return the git file mode string for a file.

    A truthy ``x`` gives '100755', anything else '100644'.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write ``msg`` followed by a newline to standard output.

    Called without an argument this emits just an empty line.
    """
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Count one more issued command and emit a checkpoint when due.

    The counter is incremented first.  If ``cfg_checkpoint_count`` is
    positive and the new total is a multiple of it, a note goes to stderr
    and a 'checkpoint' command plus an empty line is written via wr().

    Returns the incremented counter.
    """
    total = count + 1
    due = cfg_checkpoint_count > 0 and total % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % total)
        wr('checkpoint')
        wr()
    return total
def get_parent_mark(parent, marks):
    """Return the reference to use for revision ``parent``.

    The mark number of a revision is ``parent + 1``.  When ``marks`` has
    an entry under that number (as a string) the entry is returned;
    otherwise the ':<number>' mark syntax is returned.
    """
    number = parent + 1
    key = str(number)
    if key in marks:
        return marks[key]
    return ':%d' % number
def mismatch(f1, f2):
    """Return True when two file revision ids differ.

    Both ids are converted with ``node.hex`` and the results compared.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def outer_set(dleft, dright, l, c, r):
    """Sort the files of two manifests into added/changed/removed lists.

    dleft  -- mapping file name -> revision id for our side.
    dright -- the same mapping for the parent's side.
    l      -- list that receives names present only in ``dleft``.
    c      -- list that receives names present in both whose ids
              differ according to mismatch().
    r      -- list that receives names present only in ``dright``.

    The three lists are appended to in place and returned as ``(l, c, r)``.
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # only we have the file
            l.append(name)
            continue
        if mismatch(dleft[name], theirs):
            # both have it but the ids differ
            c.append(name)

    # Changed files were already found above, so the second pass only
    # looks for names that are missing on our side.
    for name in dright.keys():
        if dleft.get(name, None) is None:
            r.append(name)
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Find files added, changed and removed relative to the parents.

    repo     -- repository object; ``repo.changectx(p).manifest()`` is
                called for every parent.
    revision -- unused here.
    parents  -- iterable of parent revision numbers; negative entries
                are skipped.
    mleft    -- manifest of the revision being examined.

    Returns ``(added, changed, removed)`` lists.  Results accumulate over
    all parents, so a name can appear more than once when it differs
    from several parents.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage, committer, authors):
    """Derive a commit's author from trailing Signed-off-by lines.

    git distinguishes author and committer, so the author is looked for
    in the log message: starting at the last non-blank line, consecutive
    lines matching ``sob_re`` are walked upwards and the topmost one of
    that run is used.  Its name part goes through fixup_user() together
    with ``authors``.

    Only the block at the very end of the message is considered, because
    a line in the middle of a message may start with 'Signed-off-by: '
    by accident.

    Returns ``committer`` unchanged when the last non-blank line is not a
    Signed-off-by line (or the message has no non-blank line at all).
    """
    loglines = logmessage.split('\n')

    # skip blank lines at the end of the message
    idx = len(loglines) - 1
    while idx >= 0 and len(loglines[idx].strip()) == 0:
        idx -= 1

    # climb through the run of Signed-off-by lines, remembering the topmost
    topmost = None
    while idx >= 0:
        hit = sob_re.match(loglines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' command with the data of every listed file.

    ctx      -- change context; ``ctx.filectx(name).data()`` supplies the
                file contents.
    manifest -- manifest whose ``execf(name)`` result selects the file
                mode through gitmode().
    files    -- list of file names; it is sorted in place.

    For each file the 'M <mode> inline <name>' line, a 'data <length>'
    line and the contents are written via wr().  A progress line goes to
    stderr every ``cfg_export_boundary`` files, plus a final one when the
    list is longer than that boundary.
    """
    files.sort()
    total = len(files)
    exported = 0
    for path in files:
        contents = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # the length is taken from the data actually read
        wr('data %d' % len(contents))
        wr(contents)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def is_merge(parents):
    """Return True when more than one parent is a real revision.

    parents -- iterable of parent revision numbers; negative numbers
               stand for a missing parent and are not counted.
    """
    # the counter has to start at zero before it can be incremented
    c = 0
    for parent in parents:
        if parent >= 0:
            c += 1
    return c > 1
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors, sob):
    """Write one hg revision as a git-fast-import 'commit' command.

    ui       -- passed on to get_changeset().
    repo     -- repository object to read from.
    revision -- hg revision number; its mark is ``revision + 1``.
    marks    -- mark cache (mark number as string -> reference); the
                entry for this revision is added.
    heads    -- cached branch heads (branch -> sha1, or '' once the
                branch was initialised in this run); updated in place.
    last     -- branch -> most recently exported revision; updated.
    max      -- upper bound of the export, used for progress output only.
    count    -- running command counter.
    authors  -- author mapping passed on to the user clean-up.
    sob      -- when true, an 'author' line is derived from Signed-off-by
                lines via get_author().

    Returns the counter as updated by checkpoint().
    """
    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, hence one byte more than len(desc)
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # A cached head means this is an incremental import: start the
        # branch from it and clear the entry so this happens only once.
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, src))
        link = src  # avoid making a merge commit for incremental import
    elif branch not in heads and revision > 0 and parents[0] >= 0:
        # Newly created branch that is not the very first commit: connect
        # it to its parent.  A revision without a first parent is left
        # without 'from', as mark :0 is never defined by this script.
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        l = last.get(branch, revision)
        for p in parents:
            # 1) this commit implicitly is the child of the most recent
            #    commit of its branch, so that parent is skipped
            # 2) nonexistent parents are skipped
            # 3) everything else is merged
            if p == l or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # a parent already named by 'from' above must not be merged
            if tmp == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # Needed later to write out tags.  The key is the mark number
    # (revision + 1) because that is what get_parent_mark() looks up and
    # what the keys loaded from the marks file are.
    marks[str(revision + 1)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in the full manifest
        added = man.keys()
        kind = 'full'
    elif is_merge(parents):
        # Merge: compare manifests against the parents.  This is costly
        # for many files, so it is only done where it is really needed.
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # Single parent: take the changes from repo.status(), whose first
        # three results are read as changed, added and removed.
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (kind, revision + 1, max, len(added), len(changed), len(removed)))

    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx, man, added + changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every tag in range.

    ui, authors -- unused here.
    repo        -- repository object; ``repo.tagslist()`` supplies the
                   (tag, node) pairs.
    marks_cache -- mark cache used by get_parent_mark().
    start, end  -- only tags on revisions with start <= rev < end are
                   written.
    count       -- running command counter.

    The 'tip' tag is always skipped.  Returns the counter, advanced by
    checkpoint() once per written tag.
    """
    for tag, tagnode in repo.tagslist():
        # 'tip' only names the latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # skip tags outside the range of this import
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Load an author mapping from ``filename``.

    Every line is expected to look like 'key = value'; key and value are
    stripped of surrounding whitespace.  Lines that do not match are
    reported on stderr with their line number and skipped.

    Returns the mapping as a dict; it is empty when the file does not
    exist.  The number of loaded entries is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        l = 0
        for line in f:
            l += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails half way through
        f.close()
    # report what was actually loaded, not the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Load a ':key value' cache file into a dict.

    Every line must consist of exactly two fields separated by a single
    space, the first one starting with ':'.  The key is stored without
    that colon and the value without its trailing newline.  Other lines
    are reported on stderr with their line number and skipped.

    Returns the dict; it is empty when the file does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    f = open(filename, 'r')
    try:
        l = 0
        for line in f:
            l += 1
            fields = line.split(' ')
            # startswith() also copes with an empty first field, which
            # indexing its first character would not
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails half way through
        f.close()
    return cache
def save_cache(filename, cache):
    """Write ``cache`` to ``filename`` as ':key value' lines.

    The file is created or truncated.  Keys and values are converted with
    str(); the result is the format load_cache() reads.
    """
    f = open(filename, 'w+')
    try:
        # an explicit loop: the writes must not depend on map() being eager
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        # close the file even if a write fails
        f.close()
def verify_heads(ui, repo, cache, force):
    """Check that the git branches still match the cached heads.

    ui    -- passed on to get_changeset().
    repo  -- repository object; its branch tags and heads are examined.
    cache -- branch -> sha1 as recorded by the previous run.
    force -- when true, problems are reported but do not abort.

    Two checks are made: every hg branch whose git ref file and cache
    entry both exist must have the same sha1 in both, and no branch may
    own more than one of the repository's heads.

    Returns False on the first problem unless ``force`` is set, True
    otherwise.
    """
    def getsha1(branch):
        # Read the first line of $GIT_DIR/refs/heads/<branch>.  A ref
        # file that is missing, unreadable or empty yields None.
        try:
            f = open(os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch)
            try:
                lines = f.readlines()
            finally:
                f.close()
        except IOError:
            return None
        if not lines:
            return None
        return lines[0].split('\n')[0]

    branches = repo.branchtags()
    # order the branches by descending revision number of their node
    l = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    l.sort()

    # only hg's branches are verified, not everything git has
    for _, _, b in l:
        b = get_branch(b)
        sha1 = getsha1(b)
        c = cache.get(b)
        if sha1 is not None and c is not None:
            sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that every branch has exactly one head
    t = {}
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if t.get(branch, False):
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        t[branch] = True

    return True
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors={}, sob=False, force=False):
    """Export an hg repository as a git-fast-import command stream.

    repourl   -- location of the hg repository.
    m         -- highest revision number to export (exclusive); a
                 negative value means 'up to the repository tip'.
    marksfile -- file holding the marks of earlier runs.
    headsfile -- file holding the git heads of the last run.
    tipfile   -- state file; its 'tip' entry is where the export resumes,
                 and it is rewritten at the end of the run.
    authors   -- author mapping; the default dict is only read.
    sob       -- derive authors from Signed-off-by lines.
    force     -- carry on despite failed head verification.

    Returns 0 on success and 1 when verify_heads() rejects the
    repository.
    """
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    tip = repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    # never go past the revisions the repository actually has
    stop = _max
    if _max < 0 or _max > tip:
        stop = tip

    # number of commands issued so far
    c = 0
    last = {}
    for rev in range(first, stop):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last, stop, c, authors, sob)

    c = export_tags(ui, repo, marks_cache, first, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing required option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # -1 tells hg2git() to export up to the repository tip
    m = -1
    if options.max is not None:
        m = options.max

    # each required option is checked against its own value
    if options.marksfile is None:
        bail(parser, '--marks')
    if options.headsfile is None:
        bail(parser, '--heads')
    if options.statusfile is None:
        bail(parser, '--status')
    if options.repourl is None:
        bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force))