hg-fast-export.py: do not generate invalid ref names
[fast-export/rorcz.git] / hg-fast-export.py
blobe52da83b10e21b156130bf872084716463b1873f
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name,set_unknown_addr
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
# Windows opens sys.stdout in text mode, so every LF (\n) written to it is
# turned into CRLF (\r\n). That makes git blow up when it reads the stream,
# hence this platform-specific switch of sys.stdout to binary mode.
if sys.platform == "win32":
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Simple regex that catches a "Signed-off-by: <who>" line of a log message;
# group 1 captures <who>.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Insert a 'checkpoint' command after this many commits (0: never).
cfg_checkpoint_count = 0

# Write a progress message each time this many file contents were written.
cfg_export_boundary = 1000

# Regexes used to rewrite names into acceptable git ref names:
# runs of control characters, space and ~ ^ : \ * ? [
ref_crud_re = re.compile(r'[[\x00-\x1f\x7f ~^:\\*?]+', re.S)
# the sequence ".."
ref_dotdot_re = re.compile(r'\.\.')
# the sequence "@{"
ref_atbrace_re = re.compile(r'@\{')
# anything ending in ".lock" (case-insensitive)
ref_dotlock_re = re.compile(r'.*\.lock$', re.I)
# runs of consecutive slashes
ref_separators_re = re.compile(r'/+')
# runs of consecutive underscores
ref_collapse_re = re.compile(r'_+')
def gitmode(flags):
    """Map a manifest flag string to the git file mode used in 'M' commands.

    'l' in flags yields '120000', otherwise 'x' yields '100755', and
    anything else yields '100644'. 'l' takes precedence over 'x'.
    (Presumably 'l' marks a symlink and 'x' an executable file.)
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg to stdout followed by a newline.

    An empty msg produces only the newline.
    """
    out = sys.stdout
    if msg:
        out.write(msg)
    out.write('\n')
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the new
    counter value is a multiple of it. Returns the incremented counter.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference (an SHA1
    or a mark).

    A truthy entry for rev in old_marks is returned as is; otherwise the
    mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions differ, judged by their hex ids."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft and dright are manifests (they must support keys(), get(),
    item access and flags()). Results are appended to the given lists:

      l -- files present in dleft but not in dright
      c -- files present in both whose contents differ according to
           match(), or whose git file mode differs
      r -- files present in dright but not in dleft

    Lists that are not supplied start out empty on every call. Returns
    the tuple (l, c, r).
    """
    # Fresh lists per call: mutable default arguments would be shared
    # between calls and silently accumulate entries.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for name in dleft.keys():
        other = dright.get(name, None)
        if other is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif (match(dleft[name], other)
              or gitmode(dleft.flags(name)) != gitmode(dright.flags(name))):
            # we have it but checksums or modes mismatch: add to center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # a change is already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Given some repository and revision, find all changed/deleted files.

    mleft is the manifest of the revision itself; it is compared against
    the manifest of every non-negative parent revision via split_dict().
    Returns three sorted lists (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent < 0:
            continue
        parent_manifest = repo.changectx(parent).manifest()
        added, changed, removed = split_dict(mleft, parent_manifest,
                                             added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """As git distinguishes between author and committer of a patch, try
    to extract the author by detecting Signed-off-by lines.

    This walks from the end of the log message towards the top, skipping
    empty lines. Starting at the first non-empty line, it walks all
    consecutive Signed-off-by lines upwards to find the topmost one. For
    that one (if found), authorship information is extracted the usual
    way via fixup_user() and the authors table.

    If no Signed-off-by line is found, this defaults to the committer.

    Only the trailing block is considered because log messages may
    accidentally contain lines in the middle starting with
    "Signed-off-by: foo" that would match the detection regex.
    """
    lines = logmessage.split('\n')

    # from the tail, skip upwards over empty lines
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # walk further upwards over the sob lines, remembering the topmost one
    topmost = None
    while idx >= 0:
        hit = sob_re.match(lines[idx])
        if hit is None:
            break
        topmost = hit
        idx -= 1

    # the last non-empty line matched the Signed-off-by regex: extract user
    if topmost is not None:
        return fixup_user(topmost.group(1), authors)
    return committer
def export_file_contents(ctx, manifest, files, hgtags):
    """Write an inline 'M' (file modify) command for each file in files.

    ctx supplies the file data and manifest the flags that determine the
    git mode. A file named ".hgtags" is skipped unless hgtags is true.
    Progress is reported on stderr every cfg_export_boundary files, plus
    once at the end when files has more entries than that.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        data = ctx.filectx(path).data()
        wr('M %s inline %s' % (gitmode(manifest.flags(path)), path))
        # use the real length of the data (size() caused trouble)
        wr('data %d' % len(data))
        wr(data)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name is the branch or tag name to clean up; what is only used to
    label the warning written to stderr when the name had to be changed.
    Returns the sanitized name.
    """

    def fix_component(part):
        # no slash-separated component may begin with a dot ...
        if part[:1] == '.':
            part = '_' + part[1:]
        # ... or end with ".lock"; checking every component (not only the
        # end of the whole name) also covers names like "foo.lock/bar"
        if ref_dotlock_re.match(part):
            part = part[:-5] + '_' + part[-4:]
        return part

    if name == '':
        # be paranoid just in case
        n = '_'
    else:
        n = name

    n = ref_crud_re.sub('_', n)
    n = ref_dotdot_re.sub('_', n)
    n = ref_atbrace_re.sub('_{', n)
    # the name as a whole must not end with a slash or a dot
    if n[-1] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join([fix_component(part) for part in n.split('/')])
    # the name must not begin with a slash
    if n[0] == '/':
        n = '_' + n[1:]
    n = ref_separators_re.sub('/', n)
    n = ref_collapse_re.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags):
    """Write one hg revision to stdout as a git-fast-import 'commit'.

    The commit goes to refs/heads/<sanitized branch> and is marked with
    ':<revision+1>'. An 'author' line derived from Signed-off-by lines is
    written only when sob is true. brmap caches sanitized branch names
    across calls; max is only used for the progress message. Returns the
    command counter as updated by checkpoint().
    """
    def get_branchname(name):
        # sanitize each branch name only once and remember the result
        if name not in brmap:
            brmap[name] = sanitize_name(name)
        return brmap[name]

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)

    branch = get_branchname(branch)
    ref = 'refs/heads/%s' % branch

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than the very first one: reset the ref
    if not parents and revision != 0:
        wr('reset %s' % ref)

    wr('commit %s' % ref)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # wr() appends a newline to desc, hence the extra byte announced here
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed = [], [], []

    if not parents:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # with exactly one parent, just take the changes from the
            # status without expensively comparing checksums
            changed, added, removed = \
                repo.status(repo.lookup(parents[0]), revnode)[:3]
            kind = 'simple delta'
        else:
            # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # comparing checksums is expensive for many files, so only do
            # it for merges where it is really needed due to hg's revlog
            # logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    return checkpoint(count)
def export_tags(ui,repo,old_marks,mapping_cache,count,authors):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tag names are sanitized first. The tag 'tip' is skipped, as are tags
    whose node is not present in mapping_cache (keyed by hex node id).
    Returns the command counter as updated by checkpoint().
    """
    for tag, tagnode in repo.tagslist():
        tag = sanitize_name(tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tagnode.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map from filename and return it as a dict.

    Each useful line has the form "key = value"; surrounding whitespace
    is stripped from both sides. Empty lines and lines starting with '#'
    are ignored. Lines that do not match are reported on stderr together
    with their line number and skipped. A missing file yields an empty
    dict. The number of loaded entries is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    entry_re = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0   # current line number, for error messages
    loaded = 0   # number of accepted "key = value" lines

    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = entry_re.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # release the handle even if reading fails half-way
        f.close()

    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    '''return the tipmost branch head in heads

    Scanning from the last head backwards, the first head whose changelog
    entry has no 'close' marker in field 5 wins; if every head carries
    one, the last head is returned.
    '''
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui,repo,cache,force):
    """Check the repository state before exporting.

    First, for every hg branch the current git SHA1 (get_git_sha1) must
    equal the value recorded in cache. Second, no branch may show up
    more than once among repo.heads(). Each violation is reported on
    stderr. Returns False on the first violation unless force is true;
    returns True otherwise.
    """
    tips = {}
    for name, heads in repo.branchmap().iteritems():
        tips[name] = branchtip(repo, heads)
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in tips.items()])

    # get list of hg's branches to verify, don't take all git has
    for _, _, name in ordered:
        name = get_branch(name)
        sha1 = get_git_sha1(name)
        cached = cache.get(name)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (name, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors={},sob=False,force=False,hgtags=False):
    """Export an hg repository as a git-fast-import stream on stdout.

    repourl     -- repository location handed to setup_repo()
    m           -- upper revision limit; negative (or beyond the tip)
                   means "up to the tip"
    marksfile   -- cache file of git-fast-import marks
    mappingfile -- cache file mapping hex node ids to hg revision numbers
                   (rewritten at the end of the export)
    headsfile   -- cache file of git heads, checked by verify_heads()
    tipfile     -- state cache holding the last exported 'tip' and 'repo'
                   (rewritten at the end of the export)
    authors     -- author map passed on to the changeset helpers
    sob         -- derive authors from Signed-off-by lines
    force       -- continue despite verify_heads() errors
    hgtags      -- also export .hgtags files

    Returns 1 when head verification fails, 0 otherwise.
    """
    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # changelog.count() is not available on every repo object; fall back
    # to len(repo) when the attribute is missing
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first_rev = int(state_cache.get('tip', 0))
    last_rev = _max
    if _max < 0 or last_rev > tip:
        last_rev = tip

    # record node -> revision for everything up to the limit so that tags
    # can be resolved later on
    for rev in range(0, last_rev):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # the command counter must exist even when there is nothing new to
    # export, because export_tags() and the summary below use it
    c = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui, repo, rev, old_marks, last_rev, c, authors, sob, brmap, hgtags)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        # report a missing mandatory option, show usage, exit with status 2
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-U", dest="unknown",
                      help="Email address to use for unknown instead of 'devnull@localhost'")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # -1 means "no limit": export everything up to the tip
    m = options.max
    if m is None:
        m = -1

    # mandatory options, checked in this order; bail() exits on the first
    # one that is missing
    for value, flag in ((options.marksfile, '--marks'),
                        (options.mappingfile, '--mapping'),
                        (options.headsfile, '--heads'),
                        (options.statusfile, '--status'),
                        (options.repourl, '--repo')):
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.unknown is not None:
        if not set_unknown_addr(options.unknown):
            sys.stderr.write("Error: Invalid email address '%s'\n" % options.unknown)
            sys.exit(2)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile, authors=a,
                    sob=options.sob, force=options.force, hgtags=options.hgtags))