Speedup: Use manifest and compare checksums to detect changes
[fast-export/barak.git] / hg2git.py
blob4d67d9b727077fae3562cf1e89b1f0b6772719a6
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: GPLv2
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
8 Usage: hg2git.py <hg repo url> <max rev> <marks file> <heads file> <tip file>
8 """
10 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
11 from tempfile import mkstemp
12 import re
13 import sys
14 import os
# Matches a user field that already ends in the "Name <email>" shape.
user_re = re.compile('[^<]+ <[^>]+>$')

# Name of the git branch used for hg's default 'HEAD' branch.
cfg_master = 'master'

# A 'checkpoint' command is inserted each time this many commits were written.
cfg_checkpoint_count = 1000
def usage(ret):
    """Print the module's usage text to stderr and return ``ret`` unchanged.

    Handing ``ret`` back lets callers write ``sys.exit(usage(1))``.
    """
    # The module docstring is None when docstrings are stripped
    # (python -OO); writing None would raise TypeError and hide the
    # real problem (bad command line) behind a traceback.
    text = __doc__
    if text:
        sys.stderr.write(text)
    return ret
def setup_repo(url):
    """Open the Mercurial repository found at ``url``.

    Returns a ``(ui, repository)`` pair; the ui object is created here and
    is the one the repository is opened with.
    """
    hg_ui = ui.ui()
    hg_repo = hg.repository(hg_ui, url)
    return hg_ui, hg_repo
def get_changeset(ui,repo,revision):
    """Read one changeset and normalise its metadata for the export.

    Returns ``(manifest, user, (time, tz), files, desc, branch, extra)``:
    ``user`` is forced into ``Name <email>`` form, ``tz`` is a
    ``+HHMM``/``-HHMM`` string and ``branch`` is the changeset's 'branch'
    extra (default 'master') with 'HEAD' mapped to ``cfg_master``.
    """
    def get_branch(name):
        # hg's 'HEAD' is exported under the configured git master branch
        if name == 'HEAD':
            return cfg_master
        return name

    def fixup_user(user):
        # already "Name <email>": leave untouched
        if user_re.match(user) is not None:
            return user
        # no address at all: attach a placeholder one
        if '@' not in user:
            return user + ' <none@none>'
        # the whole field looks like an address: use it as name and email
        return user + ' <' + user + '>'

    def format_tz(timezone):
        # The stored offset is negated, as this code has always done
        # (presumably hg records seconds west of UTC -- confirm).
        offset = -int(timezone)
        # Split sign and magnitude first: floor division/modulo on a
        # negative offset rounds the hour the wrong way, e.g. -03:30
        # would come out as -0430.
        if offset < 0:
            sign = '-'
        else:
            sign = '+'
        hours, seconds = divmod(abs(offset), 3600)
        return '%s%02d%02d' % (sign, hours, seconds // 60)

    node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = repo.changelog.read(node)
    tz = format_tz(timezone)
    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user), (time, tz), files, desc, branch, extra)
def gitmode(x):
    """Return the git mode string: '100755' if ``x`` is truthy, else '100644'."""
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write ``msg`` and a trailing newline to stdout.

    With no argument only the newline is written.
    """
    out = sys.stdout
    out.write(msg)
    out.write('\n')
    # debugging aid, disabled: echo every written line to stderr
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
def checkpoint(count):
    """Advance the command counter by one and checkpoint when it is due.

    Whenever the new total is a multiple of ``cfg_checkpoint_count`` a
    'checkpoint' command (plus an empty line) is written and a note goes
    to stderr. Returns the new total.
    """
    total = count + 1
    if total % cfg_checkpoint_count != 0:
        return total
    sys.stderr.write("Checkpoint after %d commits\n" % total)
    wr('checkpoint')
    wr()
    return total
def get_parent_mark(parent,marks):
    """Return the reference to use for the parent revision ``parent``.

    The lookup key is the mark number, i.e. ``parent+1`` as a string. When
    ``marks`` has an entry for it (the SHA1 from the cache) that value is
    returned, otherwise the ``:<mark>`` syntax for a mark of this session.
    """
    mark = parent + 1
    fallback = ':%d' % mark
    return marks.get(str(mark), fallback)
def mismatch(x,f1,f2):
    """Return True when the two file revision ids differ.

    The ids are compared through their ``node.hex`` form; ``x`` is not used.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def outer_set(dleft,dright,l,r):
    """Collect changed and missing files between two manifests.

    Names present in ``dleft`` that are absent from ``dright`` or whose
    checksums differ are appended to ``l``; names only present in
    ``dright`` are appended to ``r``. Both lists are modified in place and
    returned as ``(l, r)``.
    """
    for path, filenode in dleft.items():
        other = dright.get(path)
        # not in the parent at all, or checksum differs: changed file
        if other is None or mismatch('A', filenode, other):
            l.append(path)
    for path in dright.keys():
        # in the parent but not in our manifest: deleted file
        # (differing checksums were already handled by the first loop)
        if dleft.get(path) is None:
            r.append(path)
    return l, r
def get_filechanges(repo,revision,parents,mleft):
    """Given some repository and revision, find all changed/deleted files.

    ``mleft`` is the manifest of the revision being exported and
    ``parents`` its parent revision numbers; negative entries stand for
    nonexistent parents and are skipped. ``revision`` is not used.

    Returns ``(changed, removed)`` lists of file names without duplicates.
    A revision with no existing parent reports every file of its manifest
    as changed.
    """
    def unique(names):
        # keep first occurrence only; a file differing from both parents
        # of a merge would otherwise be reported (and exported) twice
        seen = set()
        result = []
        for name in names:
            if name not in seen:
                seen.add(name)
                result.append(name)
        return result

    real_parents = [p for p in parents if p >= 0]
    if not real_parents:
        # Root changeset: there is nothing to compare against, so the
        # whole manifest is new. Returning nothing here would export the
        # first commit without any files.
        return sorted(mleft.keys()), []

    l, r = [], []
    for p in real_parents:
        mright = repo.changectx(p).manifest()
        l, r = outer_set(mleft, mright, l, r)
    return unique(l), unique(r)
def export_commit(ui,repo,revision,marks,heads,last,max,count):
    """Write hg revision ``revision`` as one 'commit' command.

    ``marks`` maps mark numbers (revision+1, as strings) to references,
    ``heads`` holds cached branch heads from an earlier run and ``last``
    the most recent revision written per branch in this run; all three are
    updated. ``max`` is not used. Returns the updated command counter from
    ``checkpoint(count)``.
    """
    (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision)
    parents=repo.changelog.parentrevs(revision)
    mark=revision+1

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % mark)
    wr('committer %s %d %s' % (user,time,timezone))
    # wr() appends a newline after desc; it is counted as part of the data
    wr('data %d' % (len(desc)+1))
    wr(desc)
    wr()

    src=heads.get(branch,'')
    link=''
    if src!='':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch]=''
        sys.stderr.write('Initializing branch [%s] to parent [%s]\n' %
                         (branch,src))
        link=src # avoid making a merge commit for incremental import
    elif branch not in heads and revision>0:
        # newly created branch and not the first one: connect to parent
        tmp=get_parent_mark(parents[0],marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
                         (branch,tmp))
        link=tmp # avoid making a merge commit for branch fork

    if parents:
        l=last.get(branch,revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p==l or p==revision or p<0:
                continue
            tmp=get_parent_mark(p,marks)
            # if we fork off a branch, don't merge with our parent via
            # 'merge' as we have 'from' already above
            if tmp==link:
                continue
            sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
                             (branch,tmp,p))
            wr('merge %s' % tmp)

    last[branch]=revision
    heads[branch]=''
    # We need this later to write out tags. The key is the mark number, the
    # same key get_parent_mark() looks up; keying by the bare revision number
    # made this entry shadow the lookup for parent revision-1.
    marks[str(mark)]=':%d' % mark

    ctx=repo.changectx(str(revision))
    man=ctx.manifest()
    added,removed=get_filechanges(repo,revision,parents,man)

    sys.stderr.write('Exporting revision %d with %d changed/%d removed files\n' %
                     (revision,len(added),len(removed)))

    for a in added:
        d=ctx.filectx(a).data()
        wr('M %s inline %s' % (gitmode(man.execf(a)),a))
        wr('data %d' % len(d)) # had some trouble with size()
        wr(d)

    for r in removed:
        wr('D %s' % r)

    wr()
    return checkpoint(count)

def export_tags(ui,repo,cache,count):
    """Write a 'tag' command for every hg tag whose revision has a reference.

    ``cache`` is the marks mapping keyed by mark number (revision+1, as a
    string). The 'tip' tag and tags on revisions without an entry are
    skipped. Returns the updated command counter.
    """
    for tag,tagnode in repo.tagslist():
        if tag=='tip':
            continue
        rev=repo.changelog.rev(tagnode)
        # look up by mark number, matching export_commit() and
        # get_parent_mark()
        ref=cache.get(str(rev+1))
        if ref is None:
            # no reference known for this revision: skip the tag silently
            continue
        (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
        wr('tag %s' % tag)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user,time,timezone))
        msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag,
            rev,branch,desc.split('\n')[0])
        # as for commits, the newline appended by wr() is part of the data
        wr('data %d' % (len(msg)+1))
        wr(msg)
        wr()
        count=checkpoint(count)
    return count
def load_cache(filename):
    """Load a cache file of ``:key value`` lines into a dict.

    A missing file yields an empty dict. Each valid line has exactly two
    space-separated fields, the first starting with ':'; the key is stored
    without that colon and the value without its trailing newline. Invalid
    lines are reported on stderr with their line number and skipped.
    """
    cache={}
    if not os.path.exists(filename):
        return cache
    f=open(filename,'r')
    try:
        lineno=0
        for line in f:
            lineno+=1
            fields=line.split(' ')
            # startswith() instead of fields[0][0]: a line beginning with a
            # space gives an empty first field
            if len(fields)!=2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
                continue
            # put key:value in cache, key without ^:
            cache[fields[0][1:]]=fields[1].split('\n')[0]
    finally:
        f.close()
    return cache
def save_cache(filename,cache):
    """Write ``cache`` to ``filename`` as one ``:key value`` line per entry.

    An existing file is overwritten; keys and values are written through
    ``str()``.
    """
    f=open(filename,'w')
    try:
        # explicit loop: map() used only for its side effects writes
        # nothing where map() is lazy
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key),str(cache.get(key))))
    finally:
        f.close()
def verify_heads(ui,repo,cache):
    """Compare every cached branch head with the ref file in the git repo.

    For each branch in ``cache`` the first line of
    ``$GIT_DIR/refs/heads/<branch>`` is compared with the cached value and
    a warning is written to stderr when they differ. A ref file that
    cannot be read counts as a difference. ``ui`` and ``repo`` are not
    used. Always returns True: differences are only warned about.
    """
    def getsha1(branch):
        path=os.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
        try:
            f=open(path)
        except (IOError,OSError):
            # no readable ref file: report it as a mismatch below
            # instead of aborting with a traceback
            return None
        try:
            # readline() gives '' for an empty file where readlines()[0]
            # would raise IndexError
            first=f.readline()
        finally:
            f.close()
        return first.split('\n')[0]

    for b in cache.keys():
        sys.stderr.write('Verifying branch [%s]\n' % b)
        sha1=getsha1(b)
        c=cache.get(b)
        if sha1!=c:
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b,sha1,c))
    return True
if __name__=='__main__':
    # expected arguments: repo url, max revision, marks/heads/tip files
    if len(sys.argv)!=6: sys.exit(usage(1))
    repourl,m,marksfile,headsfile,tipfile=sys.argv[1:]
    _max=int(m)

    marks_cache=load_cache(marksfile)
    heads_cache=load_cache(headsfile)
    state_cache=load_cache(tipfile)

    ui,repo=setup_repo(repourl)

    if not verify_heads(ui,repo,heads_cache):
        sys.exit(1)

    tip=repo.changelog.count()

    # resume where the previous run stopped (0 on the first run)
    first_rev=int(state_cache.get('tip',0))
    # a negative max means "everything"; a max beyond the repository's
    # revision count is clamped so the loop never asks for a revision
    # that does not exist
    stop_rev=_max
    if _max<0 or _max>tip:
        stop_rev=tip

    c=int(state_cache.get('count',0))
    last={}
    for rev in range(first_rev,stop_rev):
        c=export_commit(ui,repo,rev,marks_cache,heads_cache,last,tip,c)

    c=export_tags(ui,repo,marks_cache,c)

    # remember where to continue next time
    state_cache['tip']=stop_rev
    state_cache['count']=c
    state_cache['repo']=repourl
    save_cache(tipfile,state_cache)