Remove SHA stability note from readme
[fast-export/barak.git] / hg2git.py
blob9ac395b5fc26da5c206bb2b915511bbf79e7a5e4
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: GPLv2
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py -r <hg repo url> --marks <marks file> --heads <heads file>
                 --status <status file> [-m <max revision>] [-s]
                 [-A <authors file>]
8 """
10 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
11 from tempfile import mkstemp
12 from optparse import OptionParser
13 import re
14 import sys
15 import os
# matches a whole "Signed-off-by: <someone>" line; group 1 is <someone>
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# matches a user field ending in "Name <mail>"; groups are name and <mail>
user_re = re.compile(r'([^<]+) (<[^>]+>)$')
# matches a user name wrapped in double quotes; group 1 is the bare name
user_clean_re = re.compile(r'^["]([^"]+)["]$')
# name of the git branch that hg's default 'HEAD' branch is mapped to
cfg_master = 'master'
# emit a 'checkpoint' command after this many commits, never if 0
cfg_checkpoint_count = 0
def usage(ret):
    """Write the module's usage text to stderr and return 'ret' unchanged.

    'ret' is handed back as-is so a caller can use the result directly as an
    exit status.
    """
    # __doc__ is None when docstrings are stripped (python -OO); writing None
    # would raise TypeError, so fall back to an empty string
    sys.stderr.write(__doc__ or '')
    return ret
def setup_repo(url):
    """Open the repository at 'url'.

    Returns a (ui, repository) pair: a freshly created ui object and the
    repository object hg.repository() built with it.
    """
    hg_ui = ui.ui()
    hg_repo = hg.repository(hg_ui, url)
    return hg_ui, hg_repo
def fixup_user(user,authors):
    """Turn a user field into a 'Name <mail>' string.

    user    -- user field to normalize
    authors -- optional mapping of user fields to replacements, or None;
               users missing from the mapping are kept as they are

    A user already in 'Name <mail>' form is split into its two parts.
    Otherwise the whole field becomes the name, and the mail part is
    '<user>' when the field contains an '@' and '<devnull@localhost>' when
    it does not. Double quotes wrapped around the name are removed.
    """
    if authors is not None:
        # unknown users map to themselves
        user = authors.get(user, user)

    parsed = user_re.match(user)
    if parsed is not None:
        # already in 'Name <mail>' form
        name, mail = parsed.group(1), parsed.group(2)
    elif '@' in user:
        # looks like a bare address: use it as name and as mail
        name, mail = user, '<%s>' % user
    else:
        name, mail = user, '<devnull@localhost>'

    # strip quoting from the name
    quoted = user_clean_re.match(name)
    if quoted is not None:
        name = quoted.group(1)
    return '%s %s' % (name, mail)
def get_branch(name):
    """Return the git branch name for 'name'.

    hg's 'HEAD' is mapped to the configured cfg_master branch; every other
    name is returned unchanged.
    """
    if name != 'HEAD':
        return name
    return cfg_master
def get_changeset(ui,repo,revision,authors):
    """Read one changeset and prepare its fields for export.

    ui       -- unused here, kept for the callers' calling convention
    repo     -- repository to read from
    revision -- revision to look up
    authors  -- author mapping handed on to fixup_user(), may be None

    Returns (manifest, user, (time, tz), files, desc, branch, extra) where
    'user' went through fixup_user(), 'tz' is a '+HHMM'/'-HHMM' string and
    'branch' went through get_branch().
    """
    nodeid = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = repo.changelog.read(nodeid)

    # The sign of the stored offset is flipped for the output. Sign and
    # magnitude are formatted separately: flooring division applied to a
    # negated offset shifts half-hour zones by a full hour (an offset of
    # 12600 came out as -0430 instead of -0330).
    if timezone > 0:
        sign = '-'
    else:
        sign = '+'
    offset = abs(timezone)
    tz = '%s%02d%02d' % (sign, offset // 3600, (offset % 3600) // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user, authors), (time, tz), files, desc, branch, extra)
def gitmode(x):
    """Return the git file mode string: '100755' if 'x' is true, else '100644'."""
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write 'msg' followed by a newline to stdout.

    All output of the export stream goes through this function. Called
    without an argument it writes an empty line.
    """
    # a single write() call instead of the Python-2-only print statement
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Account for one more command and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the new
    count is a multiple of it. Returns the incremented count.
    """
    total = count + 1
    due = cfg_checkpoint_count > 0 and total % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % total)
        wr('checkpoint')
        wr()
    return total
def get_parent_mark(parent,marks):
    """Return the reference to use for revision 'parent'.

    The mark number of a revision is the revision plus one. If 'marks' has
    an entry for that number (as a string key), that entry is returned;
    otherwise the ':<number>' mark syntax is returned.
    """
    number = parent + 1
    key = str(number)
    if key in marks:
        return marks[key]
    return ':%d' % number
def mismatch(f1,f2):
    """Return True if the hex forms of the two file revision ids differ."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def outer_set(dleft,dright,l,c,r):
    """Sort the files of two manifests into added, changed and removed.

    dleft  -- mapping of file name to file revision for our revision
    dright -- the same mapping for the parent
    l      -- list receiving files only present in dleft
    c      -- list receiving files present in both whose revisions mismatch
    r      -- list receiving files only present in dright

    The three lists are extended in place and also returned as (l, c, r).
    """
    for name in dleft.keys():
        theirs = dright.get(name)
        if theirs is None:
            # only we have the file
            l.append(name)
        elif mismatch(dleft[name], theirs):
            # both have it, but in different revisions
            c.append(name)
    # changed files were already found above, so only look for files that
    # the parent has and we do not
    r.extend([name for name in dright.keys() if dleft.get(name) is None])
    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Find the files added, changed and removed relative to the parents.

    repo     -- repository used to fetch the parents' manifests
    revision -- unused here, kept for the callers' calling convention
    parents  -- parent revisions; negative entries are skipped
    mleft    -- manifest of the revision being compared

    Returns (added, changed, removed) as lists of file names. The results
    for all parents are accumulated into the same three lists, so with more
    than one parent a file name may appear more than once.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        # outer_set() works on the manifests directly; the sorted key lists
        # the original built here were never used
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage,committer,authors):
    """Derive a commit's author from trailing Signed-off-by lines.

    logmessage -- the log message to inspect
    committer  -- value returned when no author can be extracted
    authors    -- author mapping handed on to fixup_user(), may be None

    Only the run of Signed-off-by lines at the very end of the message
    (ignoring trailing empty lines) is considered, so that a line in the
    middle of the message that merely starts with "Signed-off-by: " is not
    mistaken for one. The topmost line of that run names the author, which
    is passed through fixup_user(). Without such a run the committer is
    returned.
    """
    lines = logmessage.split('\n')

    # index of the last non-empty line, -1 if every line is empty
    pos = len(lines) - 1
    while pos >= 0 and not lines[pos].strip():
        pos -= 1

    # climb the run of Signed-off-by lines, remembering the topmost match
    topmost = None
    while pos >= 0:
        hit = sob_re.match(lines[pos])
        if hit is None:
            break
        topmost = hit
        pos -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_commit(ui,repo,revision,marks,heads,last,max,count,authors,sob):
    """Write one revision to stdout as a 'commit' command.

    ui       -- handed on to get_changeset()
    repo     -- repository to export from
    revision -- revision to export; its mark is revision+1
    marks    -- dict of mark number (as string) to reference, see
                get_parent_mark(); updated with this revision's mark
    heads    -- dict of branch name to cached head; a non-empty entry is
                used once for 'from' and then reset to ''
    last     -- dict of branch name to the last revision exported on it
    max      -- upper bound, only used in the progress message
    count    -- number of commands issued so far
    authors  -- author mapping handed on to get_changeset()/get_author()
    sob      -- if true, emit an 'author' line derived from Signed-off-by

    Returns the value of checkpoint(count).
    """
    (_, user, (time, timezone), files, desc, branch, _) = get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision+1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, which is counted as part of the data
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch, src))
        link = src  # avoid making a merge commit for incremental import
    elif branch not in heads and revision > 0 and parents[0] >= 0:
        # newly created branch and not the first one: connect to parent.
        # A commit without a real first parent is left unconnected, as
        # there is no mark :0 that a 'from' could refer to.
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        newest = last.get(branch, revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p == newest or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # if we fork off a branch, don't merge with our parent via
            # 'merge' as we have 'from' already above
            if tmp == link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # Needed later to write out tags. The key is the mark number
    # (revision+1), the same key get_parent_mark() looks up; storing it under
    # str(revision) replaced the entry of revision-1 and made later parent
    # lookups for revision-1 resolve to this revision instead.
    marks[str(revision+1)] = ':%d' % (revision+1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed = get_filechanges(repo, revision, parents, man)

    sys.stderr.write('Exporting revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (revision, max, len(added), len(changed), len(removed)))

    for a in added + changed:
        d = ctx.filectx(a).data()
        wr('M %s inline %s' % (gitmode(man.execf(a)), a))
        wr('data %d' % len(d))  # had some trouble with size()
        wr(d)

    for r in removed:
        wr('D %s' % r)

    wr()
    return checkpoint(count)

def export_tags(ui,repo,marks_cache,start,end,count,authors):
    """Write a 'tag' command for every tag inside the import range.

    ui          -- handed on to get_changeset()
    repo        -- repository whose tags are exported
    marks_cache -- dict of mark number (as string) to reference, as filled
                   by export_commit()
    start, end  -- only tags on revisions with start <= rev < end are
                   exported
    count       -- number of commands issued so far
    authors     -- author mapping handed on to get_changeset()

    The 'tip' tag is always skipped. Returns the updated command count.
    """
    for tag, tagnode in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if rev < start or rev >= end:
            continue

        # marks are keyed by mark number, which is revision+1
        ref = marks_cache.get(str(rev+1), None)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        (_, user, (time, timezone), _, desc, branch, _) = get_changeset(ui, repo, rev, authors)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('tag %s' % tag)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user, time, timezone))
        msg = 'hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag,
            rev, branch, desc.split('\n')[0])
        # wr() appends a newline to msg, which is counted as part of the data
        wr('data %d' % (len(msg)+1))
        wr(msg)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map from 'filename'.

    Every line is expected to have the form 'key = value'; both sides are
    stripped of surrounding whitespace. Lines that do not match are reported
    on stderr with their line number and skipped.

    Returns a dict of key to value, empty if the file does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    f = open(filename, 'r')
    try:
        for index, line in enumerate(f):
            m = lre.match(line)
            if m is None:
                # index is zero-based, report one-based line numbers
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, index+1))
                continue
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        # close the file even if reading fails
        f.close()
    # report the mappings actually loaded rather than the lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Read a cache file of ':key value' lines into a dict.

    Each line must start with ':' and contain a space separating the key
    from the value; the leading ':' is dropped from the key and the value
    runs to the end of the line, so it may itself contain spaces. Other
    lines are reported on stderr with their line number and skipped.

    Returns a dict of key to value, empty if the file does not exist.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    f = open(filename, 'r')
    try:
        for index, line in enumerate(f):
            # split at the first space only, so that values with spaces
            # written by save_cache() can be read back
            fields = line.split(' ', 1)
            # startswith() is safe for an empty key, unlike fields[0][0]
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, index+1))
                continue
            # put key:value in cache, key without the leading ':'
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        # close the file even if reading fails
        f.close()
    return cache
def save_cache(filename,cache):
    """Write 'cache' to 'filename' as one ':key value' line per entry.

    Keys and values are converted with str(). An existing file is
    overwritten.
    """
    f = open(filename, 'w+')
    try:
        # a plain loop: map() must not be relied on for side effects
        for key, value in cache.items():
            f.write(':%s %s\n' % (str(key), str(value)))
    finally:
        # close the file even if writing fails
        f.close()
def verify_heads(ui,repo,cache):
    """Compare the branch heads found under GIT_DIR with the cached heads.

    ui    -- unused here, kept for the callers' calling convention
    repo  -- repository whose branches are checked
    cache -- dict of branch name to the head recorded by the last run

    A warning is written to stderr for every branch whose head differs from
    the cached value. Always returns True.
    """
    def getsha1(branch):
        """Return the first line of the branch's ref file, None if unreadable."""
        path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        try:
            f = open(path)
        except IOError:
            # no readable ref file for this branch (for instance because
            # the branch does not exist in git yet)
            return None
        try:
            # readline() yields '' for an empty file instead of failing
            return f.readline().split('\n')[0]
        finally:
            f.close()

    # get list of hg's branches to verify, don't take all git has
    branches = repo.branchtags()
    ordered = [(-repo.changelog.rev(n), n, t) for t, n in branches.items()]
    ordered.sort()

    for _, _, b in ordered:
        b = get_branch(b)
        sys.stderr.write('Verifying branch [%s]\n' % b)
        sha1 = getsha1(b)
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
    return True
def hg2git(repourl,m,marksfile,headsfile,tipfile,authors=None,sob=False):
    """Export revisions and tags of the repository at 'repourl' to stdout.

    repourl   -- URL of the repository to export
    m         -- exclusive upper revision bound; negative means everything
    marksfile -- cache file with the marks of earlier runs
    headsfile -- cache file with the branch heads of the last run
    tipfile   -- state cache; its 'tip' entry is the first revision to
                 export and is updated at the end of the run
    authors   -- optional author mapping, defaults to an empty one
    sob       -- if true, derive authors from Signed-off-by lines

    Returns 0 on success and 1 if verify_heads() fails.
    """
    if authors is None:
        authors = {}
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    hg_ui, hg_repo = setup_repo(repourl)

    if not verify_heads(hg_ui, hg_repo, heads_cache):
        return 1

    tip = hg_repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    stop = _max
    # no limit given, or a limit beyond what the repository has
    if _max < 0 or _max > tip:
        stop = tip

    # number of commands issued so far
    c = 0
    last = {}
    for rev in range(first, stop):
        c = export_commit(hg_ui, hg_repo, rev, marks_cache, heads_cache, last, stop, c, authors, sob)

    c = export_tags(hg_ui, hg_repo, marks_cache, first, stop, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = stop
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Report a missing required option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m","--max",type="int",dest="max",
        help="Maximum hg revision to import")
    parser.add_option("--marks",dest="marksfile",
        help="File to read git-fast-import's marks from")
    parser.add_option("--heads",dest="headsfile",
        help="File to read last run's git heads from")
    parser.add_option("--status",dest="statusfile",
        help="File to read status from")
    parser.add_option("-r","--repo",dest="repourl",
        help="URL of repo to import")
    parser.add_option("-s",action="store_true",dest="sob",
        default=False,help="Enable parsing Signed-off-by lines")
    parser.add_option("-A","--authors",dest="authorfile",
        help="Read authormap from AUTHORFILE")

    (options, args) = parser.parse_args()

    # -1 means no upper revision limit
    m = -1
    if options.max is not None: m = options.max

    # each required option is checked against its own value
    if options.marksfile is None: bail(parser, '--marks')
    if options.headsfile is None: bail(parser, '--heads')
    if options.statusfile is None: bail(parser, '--status')
    if options.repourl is None: bail(parser, '--repo')

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
        options.statusfile, authors=a, sob=options.sob))