hg-fast-import.py: Sanitize ref names
[fast-export/barak.git] / hg-fast-export.py
blob97dba5f5129fcecc5f97dd08fdfa0a0fb7911c9c
1 #!/usr/bin/env python
3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import repo,hg,cmdutil,util,ui,revlog,node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset,load_cache,save_cache,get_git_sha1
8 from tempfile import mkstemp
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Matches a complete "Signed-off-by: <who>" log line (the "o" of "off" and
# the "b" of "by" may be upper- or lower-case); group 1 is the text after
# the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every this many commits; 0 disables it.
cfg_checkpoint_count = 0

# Report progress on stderr every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(x):
    """Return the git file mode for a file.

    x is the executable flag (callers pass manifest.execf(file)): a truthy
    value yields '100755', anything else yields '100644'.
    """
    if x:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg followed by a newline to stdout (the fast-import stream).

    None is treated like the empty string, so wr() and wr(None) both emit a
    bare newline.
    """
    if msg is None:
        msg = ''
    # sys.stdout.write works on every Python version, unlike the
    # Python-2-only print statement; '%s' keeps print's str() conversion.
    sys.stdout.write('%s\n' % (msg,))
def checkpoint(count):
    """Account for one more issued command and checkpoint when due.

    The incremented count is returned. When cfg_checkpoint_count is
    positive and the new count is a multiple of it, a notice is written to
    stderr and a 'checkpoint' command plus an empty line go to the output
    stream.
    """
    issued = count + 1
    if cfg_checkpoint_count > 0:
        if issued % cfg_checkpoint_count == 0:
            sys.stderr.write("Checkpoint after %d commits\n" % issued)
            wr('checkpoint')
            wr()
    return issued
def get_parent_mark(parent, marks):
    """Return the reference to use for a parent revision.

    marks is looked up with str(parent) as key; when an entry exists it is
    returned as is (per the original note, the SHA1 from the cache).
    Otherwise the ':<n>' mark syntax is used, where n is parent + 1.
    """
    fallback = ':%d' % (parent + 1)
    key = str(parent)
    return marks.get(key, fallback)
def file_mismatch(f1, f2):
    """Return True when the hex forms of two file nodes differ."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and collect added, changed and removed files.

    dleft  -- mapping file name -> node for the revision being exported
    dright -- mapping file name -> node for one of its parents
    l      -- list receiving names present only in dleft
    c      -- list receiving names present in both for which match() is true
    r      -- list receiving names present only in dright
    match  -- predicate called as match(left_node, right_node)

    Lists passed in are appended to in place, so results can be accumulated
    over several parents. A list that is omitted starts out fresh on every
    call; the previous '[]' defaults were shared between calls and kept
    growing. Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif match(dleft[left], right):
            # we have it but checksums mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # changes were already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files against every real parent.

    mleft is the manifest of the revision being exported. For each entry of
    parents that is not negative, that parent's manifest is fetched through
    repo.changectx() and compared with split_dict(), accumulating into the
    same three lists. Returns the sorted lists (added, changed, removed).
    The revision argument is not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    Git distinguishes between the author and the committer of a patch. To
    recover the author, trailing blank lines of the log message are
    skipped and the block of consecutive Signed-off-by lines that ends the
    message is walked upwards. The topmost line of that block names the
    author, which is passed through fixup_user() together with the authors
    table.

    Only a block at the very end of the message counts: a line in the
    middle of the text that merely starts with "Signed-off-by: foo" must
    not be mistaken for authorship information.

    When the message does not end in a Signed-off-by line, committer is
    returned unchanged.
    """
    lines = logmessage.split('\n')

    # step over blank lines at the tail of the message
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the Signed-off-by block, remembering its topmost match
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files):
    """Write an inline 'M' (file modify) command for each file in files.

    For every name the content is read via ctx.filectx(name).data() and the
    mode is derived from manifest.execf(name). Progress goes to stderr
    every cfg_export_boundary files, plus once more at the end when the
    list is longer than that boundary.
    """
    total = len(files)
    exported = 0
    for path in files:
        blob = ctx.filectx(path).data()
        mode = gitmode(manifest.execf(path))
        wr('M %s inline %s' % (mode, path))
        # length is taken from the data itself; the original author noted
        # trouble with size()
        wr('data %d' % len(blob))
        wr(blob)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def is_merge(parents):
    """Return True when more than one entry of parents is a real revision.

    Negative revision numbers stand for "no parent" and are not counted.
    """
    # the counter has to start at zero before it can be incremented
    c = 0
    for parent in parents:
        if parent >= 0:
            c += 1
    return c > 1
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the ref name to clean up
    what -- kind of ref, used only in the warning text

    The following rewrites are applied, in this order:
      * '[', ' ', '^', ':', '?', '*' and the sequence '..' become '_'
      * a trailing '/' becomes '_'
      * a '.' starting any '/'-separated component becomes '_'
      * runs of '_' collapse into a single '_'

    Returns the sanitized name. A warning is written to stderr when the
    result differs from the input. An empty name or an empty path component
    (as in 'a//b') is passed through instead of raising IndexError.
    """
    def dot(component):
        # startswith() is safe on the empty string, unlike component[0]
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    n = re.sub(r'([\[ ^:?*]|\.\.)', '_', name)
    if n.endswith('/'):
        n = n[:-1] + '_'
    n = '/'.join([dot(part) for part in n.split('/')])
    n = re.sub('_+', '_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, marks, heads, last, max, count, authors, sob):
    """Write one hg revision to the output stream as a 'commit' command.

    ui, repo -- handed to get_changeset(); repo is also queried for parents,
                manifests and status
    revision -- hg revision number to export; its mark is revision + 1
    marks    -- mapping str(revision) -> mark or ref; this revision is added
    heads    -- mapping branch -> cached head ref from an earlier run; the
                branch entry is reset to '' once it has been consumed
    last     -- mapping branch -> last revision exported on that branch
    max      -- upper bound of the export range, used only for progress output
    count    -- number of commands issued so far
    authors  -- author table passed to get_changeset() and get_author()
    sob      -- when true, emit an 'author' line derived from Signed-off-by

    Returns the value of checkpoint(count).
    """
    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)
    parents = repo.changelog.parentrevs(revision)

    branch = sanitize_name(branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # presumably the +1 accounts for the newline wr() appends after desc
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    # pidx1 indexes the higher-numbered parent, pidx2 the lower-numbered one
    pidx1, pidx2 = 0, 1
    if parents[0] < parents[1]:
        pidx1, pidx2 = 1, 0

    src = heads.get(branch, '')
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill the reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
        sys.stderr.write('%s: Initializing to parent [%s]\n' %
                         (branch, src))
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('%s: Link new branch to parent [%s]\n' %
                         (branch, tmp))
    # NOTE(review): '> 0' here and in the merge test below also excludes
    # revision 0 as a parent, whereas is_merge() counts every parent >= 0 --
    # confirm whether this is intended.
    elif last.get(branch, revision) != parents[pidx1] and parents[pidx1] > 0 and revision > 0:
        pm = get_parent_mark(parents[pidx1], marks)
        sys.stderr.write('%s: Placing commit [r%d] in branch [%s] on top of [r%d]\n' %
                         (branch, revision, branch, parents[pidx1]))
        wr('from %s' % pm)

    if parents[pidx2] > 0:
        pm = get_parent_mark(parents[pidx2], marks)
        sys.stderr.write('%s: Merging with parent [%s] from [r%d]\n' %
                         (branch, pm, parents[pidx2]))
        wr('merge %s' % pm)

    last[branch] = revision
    heads[branch] = ''
    # we need this later to write out tags
    marks[str(revision)] = ':%d' % (revision + 1)

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_kind = [], [], [], ''

    if revision == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        delta_kind = 'full'
    elif is_merge(parents):
        # later merge revision: feed in changed manifest
        # for many files comparing checksums is expensive so only do it for
        # merges where we really need it due to hg's revlog logic
        added, changed, removed = get_filechanges(repo, revision, parents, man)
        delta_kind = 'thorough delta'
    else:
        # later non-merge revision: feed in changed manifest
        # if we have exactly one parent, just take the changes from the
        # manifest without expensively comparing checksums
        f = repo.status(repo.lookup(parents[0]), revnode)[:3]
        added, changed, removed = f[1], f[0], f[2]
        delta_kind = 'simple delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, delta_kind, revision + 1, max, len(added), len(changed), len(removed)))

    # a plain loop: map() used only for side effects is lazy on Python 3
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added)
    export_file_contents(ctx, man, changed)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, marks_cache, start, end, count, authors):
    """Emit a 'reset refs/tags/<tag>' command for each tag in the range.

    Every tag from repo.tagslist() is sanitized first. The 'tip' tag is
    skipped, as are tags whose revision lies outside start <= rev < end.
    The target reference comes from get_parent_mark(). count is advanced
    through checkpoint() for each exported tag and returned at the end.
    The ui and authors arguments are not used here.
    """
    for raw_tag, tagnode in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(tagnode)
        # ignore those tags not in our import range
        if not (start <= rev < end):
            continue

        ref = get_parent_mark(rev, marks_cache)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map from filename and return it as a dict.

    Each line is expected to look like 'key = value'. Key and value are
    stripped of surrounding whitespace. Lines that do not fit are reported
    on stderr with their line number and skipped. A file that does not
    exist yields an empty dict.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    # line counter for diagnostics; it has to start at zero
    l = 0
    # 'with' closes the file even if reading fails part-way
    with open(filename, 'r') as f:
        for line in f:
            l += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # store the mapping, both sides without surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
    # report the entries actually loaded, not the number of lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def verify_heads(ui, repo, cache, force):
    """Check the repository's branch heads before exporting.

    Two checks are made:
      * for each hg branch, the value of get_git_sha1() is compared with
        the entry in cache when both are available; a difference is
        reported as the branch having been modified outside hg-fast-export
      * each branch may appear only once among repo.heads(); a second head
        on the same branch is reported as an unnamed head

    Returns False at the first problem unless force is true, in which case
    problems are only reported. Returns True otherwise.
    """
    # only hg's branches are verified, ordered by descending revision
    by_rev = sorted([(-repo.changelog.rev(n), n, t)
                     for t, n in repo.branchtags().items()])

    for _, _, name in by_rev:
        b = get_branch(name)
        sha1 = get_git_sha1(b)
        c = cache.get(b)
        if sha1 is None or c is None:
            continue
        sys.stderr.write('Verifying branch [%s]\n' % b)
        if sha1 != c:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def mangle_mark(mark):
    """Return mark, read as an integer, decremented by one, as a string.

    Marks are written as revision + 1 elsewhere in this file, so this
    presumably maps a stored mark back to its hg revision number.
    """
    number = int(mark)
    return str(number - 1)
def hg2git(repourl, m, marksfile, headsfile, tipfile, authors=None, sob=False, force=False):
    """Export revisions of the hg repository at repourl to the output stream.

    repourl   -- repository to export, passed to setup_repo()
    m         -- maximum revision to export; a negative value means "up to
                 the number of revisions in the changelog"
    marksfile -- cache file with marks (loaded through mangle_mark)
    headsfile -- cache file with the heads recorded by a previous run
    tipfile   -- state cache; its 'tip' entry is the first revision to
                 export and is updated, together with 'repo', at the end
    authors   -- optional author table; defaults to an empty dict
    sob       -- passed to export_commit() to enable Signed-off-by parsing
    force     -- passed to verify_heads() to ignore validation errors

    Returns 1 when head verification fails, 0 otherwise.
    """
    # create a fresh table per call rather than sharing one default dict
    if authors is None:
        authors = {}

    _max = int(m)

    marks_cache = load_cache(marksfile, mangle_mark)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    tip = repo.changelog.count()

    first = int(state_cache.get('tip', 0))
    limit = _max
    if _max < 0 or limit > tip:
        limit = tip

    # number of commands issued so far; it has to start at zero
    c = 0
    last = {}
    for rev in range(first, limit):
        c = export_commit(ui, repo, rev, marks_cache, heads_cache, last, limit, c, authors, sob)

    c = export_tags(ui, repo, marks_cache, first, limit, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    state_cache['tip'] = limit
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing required option, print the help and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")

    (options, args) = parser.parse_args()

    # -1 means no upper limit was requested
    if options.max is None:
        m = -1
    else:
        m = options.max

    # required options, checked in this order; bail() exits on the first gap
    required = (
        (options.marksfile, '--marks'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    if options.authorfile is None:
        a = {}
    else:
        a = load_authors(options.authorfile)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.headsfile,
                    options.statusfile, authors=a, sob=options.sob, force=options.force))