hg-reset.py: Use mangle_key() for load_cache() as hg-fast-export.py does
[fast-export/benizi.git] / hg-fast-export.py
blobc85e84e439dacf75e27adf76e1b28f467f9d8812
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset,load_cache,save_cache,get_git_sha1
8 from tempfile import mkstemp
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Matches a whole "Signed-off-by: <who>" line of a log message; group 1
# captures everything after the colon and the single following space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after every this many commands; 0 turns
# checkpoints off entirely.
cfg_checkpoint_count = 0

# Report progress on stderr each time this many file contents were written.
cfg_export_boundary = 1000
def gitmode(x):
    """Return the git file mode string for a file.

    x is treated as a boolean "is executable" flag: a truthy value yields
    '100755', anything else yields '100644'.
    """
    return '100755' if x else '100644'
def wr(msg=''):
    """Write msg followed by a newline to stdout.

    None is treated like the empty string, so both wr() and wr(None) emit
    just a newline. Non-string values are converted with '%s' formatting.
    """
    if msg is None:
        msg = ''
    out = sys.stdout
    # two writes instead of building msg+'\n', so large payloads are not
    # concatenated just to add the line terminator
    out.write('%s' % (msg,))
    out.write('\n')
def checkpoint(count):
    """Advance the command counter by one and emit a checkpoint when due.

    A 'checkpoint' command (followed by an empty line) is written through
    wr() whenever cfg_checkpoint_count is positive and the new counter is a
    multiple of it; a note goes to stderr as well.

    Returns the incremented counter.
    """
    total = count + 1
    # the positivity test comes first so a zero setting never reaches the modulo
    due = cfg_checkpoint_count > 0 and total % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % total)
        wr('checkpoint')
        wr()
    return total
def get_parent_mark(parent, marks):
    """Return the git reference to use for hg revision `parent`.

    marks is looked up with str(parent) as key; if it holds an entry that
    entry is returned, otherwise the mark syntax ':<parent+1>' is used.
    """
    fallback = ':%d' % (parent + 1)
    key = str(parent)
    return marks.get(key, fallback)
def mismatch(f1, f2):
    """Return True when the two file revision ids differ.

    Both ids are converted with node.hex() and the hex strings compared.
    """
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def outer_set(dleft, dright, l, c, r):
    """Sort the file names of two manifests into three result lists.

    dleft  -- mapping of file name -> revision id for our revision
    dright -- the same mapping for the parent
    l      -- receives names present only in dleft
    c      -- receives names present in both whose ids mismatch()
    r      -- receives names present only in dright

    The lists are appended to in place and also returned as (l, c, r).
    """
    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # only we have the file
            l.append(name)
            continue
        if mismatch(dleft[name], theirs):
            # both have it, but with different content ids
            c.append(name)
    # Changed files were already found above, so the second pass only
    # needs to pick up what the parent has and we lack.
    r.extend(name for name in dright.keys() if dleft.get(name, None) is None)
    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Compare a manifest against the manifests of its parent revisions.

    repo     -- repository object; repo.changectx(p).manifest() is used to
                fetch each parent's manifest
    revision -- not used by this function
    parents  -- iterable of parent revision numbers; negative entries are
                skipped
    mleft    -- manifest of the revision being examined

    Returns three lists as filled by outer_set(): names only in mleft,
    names whose ids mismatch, and names only in a parent. The same lists
    are reused for every parent, so results accumulate across parents.
    """
    l, c, r = [], [], []
    for p in parents:
        if p < 0:
            continue
        mright = repo.changectx(p).manifest()
        l, c, r = outer_set(mleft, mright, l, c, r)
    return l, c, r
def get_author(logmessage, committer, authors):
    """Pick the author of a commit from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch, so
    the author is guessed from the log message:

    Trailing empty lines are skipped first. Starting at the last non-empty
    line, consecutive lines matching sob_re are followed upwards and the
    topmost one of that run is taken. Its captured name is passed through
    fixup_user() together with the authors table and the result returned.

    Only the run at the very end of the message counts: a line in the
    middle of the text that happens to start with "Signed-off-by: foo"
    must not be mistaken for authorship information.

    If the message does not end in a Signed-off-by line, committer is
    returned unchanged.
    """
    lines = logmessage.split('\n')

    # locate the last line that is not blank (idx ends at -1 if none is)
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the closing run of Signed-off-by lines, remembering
    # the highest one seen
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is not None:
        return fixup_user(topmost.group(1), authors)
    return committer
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' (modify) command with contents for each file.

    ctx      -- change context; ctx.filectx(name).data() supplies the data
    manifest -- manifest whose execf(name) flag selects the git file mode
    files    -- list of file names; it is sorted in place before export

    Progress is reported on stderr every cfg_export_boundary files, plus
    once at the end when more than cfg_export_boundary files were given.
    """
    files.sort()
    total = len(files)
    done = 0
    for done, name in enumerate(files, 1):
        data = ctx.filectx(name).data()
        mode = gitmode(manifest.execf(name))
        wr('M %s inline %s' % (mode, name))
        # length is taken from the data itself rather than from size()
        wr('data %d' % len(data))
        wr(data)
        if done % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (done, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (done, total))
def is_merge(parents):
    """Return True if more than one entry of parents is a real revision.

    Entries that are negative are not counted as parents.
    """
    real_parents = sum(1 for parent in parents if parent >= 0)
    return real_parents > 1
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors, sob):
    """Write one hg revision to stdout as a fast-import 'commit' command.

    Emits the commit header (branch ref, mark, optional author, committer
    and message), the 'from'/'merge' parent links and the file changes,
    then records the revision in last, heads and marks.

    ui, repo -- handed to get_changeset(); repo is also queried for parent
                revisions, manifests and status
    revision -- hg revision number to export
    marks    -- dict str(hg revision) -> git reference, updated in place
    heads    -- dict branch -> cached git head; a non-empty entry is used
                once as 'from' reference and then blanked
    last     -- dict branch -> last revision exported for that branch
    max      -- only used for the "n/max" progress output
    count    -- running command counter
    authors  -- author table handed to get_changeset() and get_author()
    sob      -- when true, an 'author' line is derived via get_author()

    Returns the counter as updated by checkpoint().
    """
    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 covers the newline wr() appends after the message text
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # pidx1 indexes the parent with the higher revision number
    pidx1, pidx2 = 0, 1
    if parents[0] < parents[1]:
        pidx1, pidx2 = 1, 0

    src = heads.get(branch, '')
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch, src))
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch, tmp))
    elif last.get(branch, revision) != parents[pidx1] and parents[pidx1] > 0 and revision > 0:
        # the previous commit on this branch is not our parent: name the
        # parent explicitly
        pm = get_parent_mark(parents[pidx1], marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch, revision, branch, parents[pidx1]))
        wr('from %s' % pm)

    if parents[pidx2] > 0:
        pm = get_parent_mark(parents[pidx2], marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch, pm, parents[pidx2]))
        wr('merge %s' % pm)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = man.keys()
        kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # explicit loop: the deletions are written for their side effect only
    for name in removed:
        wr('D %s' % name)
    export_file_contents(ctx, man, added + changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Write a 'reset refs/tags/<tag>' command for every tag in range.

    repo        -- repository; repo.tagslist() yields (tag, node) pairs and
                   repo.changelog.rev() maps a node to its revision number
    marks_cache -- marks dict used by get_parent_mark() to find the ref
    start, end  -- only tags whose revision satisfies start <= rev < end
                   are exported
    count       -- running command counter, advanced via checkpoint() once
                   per exported tag

    ui and authors are accepted but not used. The 'tip' tag is always
    skipped. Returns the updated counter.
    """
    for tag, tagnode in repo.tagslist():
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag %s at r%d\n'
                             % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file into a dict.

    Every line is expected to have the form ``key=value`` (spaces around
    the ``=`` are allowed); key and value are stripped before being stored.
    Lines that do not match are reported on stderr together with their
    line number and skipped. A file that does not exist yields an empty
    dict without any message.

    Returns the dict mapping each key to its value; the number of entries
    it holds is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # the with block closes the file even when reading fails half way
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache
            cache[m.group(1).strip()] = m.group(2).strip()

    # count the entries actually loaded, not the lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui, repo, cache, force):
    """Check the repository's branch heads before exporting.

    Two checks are made:

    1. For every branch from repo.branchtags() (visited from highest to
       lowest head revision) the SHA1 from get_git_sha1() must equal the
       value stored in cache for that branch.
    2. No branch name may be seen for more than one entry of repo.heads().

    Each violation is reported on stderr. With force unset the first
    violation makes the function return False; with force set checking
    continues. Returns True when nothing caused an early exit.
    """
    branches = repo.branchtags()
    # negated revision as sort key: highest revision first
    ordered = sorted((-repo.changelog.rev(n), n, t) for t, n in branches.items())

    # get list of hg's branches to verify, don't take all git has
    for _, _, tag in ordered:
        name = get_branch(tag)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 is not None and cached is not None:
            sys.stderr.write('Verifying branch [%s]\n' % name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def mangle_mark(mark):
    """Turn a mark number into the key used in the marks cache.

    The mark is parsed as an integer, decremented by one and returned as a
    string.
    """
    number = int(mark)
    return str(number - 1)
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors=None, sob=False, force=False):
    """Export revisions of a Mercurial repository as a fast-import stream.

    repourl    -- location of the repository, handed to setup_repo()
    m          -- upper revision bound (exclusive); a negative value, or
                  one beyond the repository tip, means "up to the tip"
    marksfile  -- marks file, loaded with mangle_mark() applied to its keys
    headsfile  -- file holding the heads cache checked by verify_heads()
    tipfile    -- state file; its 'tip' entry gives the first revision to
                  export and is rewritten, together with 'repo', afterwards
    authors    -- optional author table (defaults to an empty dict)
    sob        -- enable Signed-off-by parsing in export_commit()
    force      -- passed to verify_heads() to ignore validation errors

    Returns 1 if verify_heads() rejects the repository, 0 otherwise.
    """
    if authors is None:
        # fresh dict per call rather than one shared default object
        authors = {}
    _max = int(m)

    marks_cache = load_cache(marksfile, mangle_mark)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    tip = repo.changelog.count()

    start = int(state_cache.get('tip', 0))
    end = _max
    if _max < 0 or _max > tip:
        # never ask for revisions the repository does not have
        end = tip

    c = 0
    last = {}
    for rev in range(start, end):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last, end, c, authors, sob)

    c = export_tags(ui, repo, marks_cache, start, end, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = end
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        # complain about a missing mandatory option, show usage, exit with 2
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # without --max a negative limit is passed on to hg2git()
    m = options.max if options.max is not None else -1

    # mandatory options, checked in this order; bail() exits on the first
    # one that is missing
    required = (
        ('--marks', options.marksfile),
        ('--heads', options.headsfile),
        ('--status', options.statusfile),
        ('--repo', options.repourl),
    )
    for flag, value in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    status = hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force)
    sys.exit(status)