Survive corrupt source repositories
[fast-export/rorcz.git] / hg-fast-export.py
blob0e2e6f076c28428ec663d58180710b9f07a6c595
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name,set_unknown_addr
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so every LF written would be
    # expanded to CRLF.  The stream produced here is consumed by git, which
    # chokes on that, hence stdout is switched to binary mode up front.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a whole line of the form "Signed-off-by: <who>" (case of O/B is
# tolerated); group 1 is everything after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every N commits; 0 disables checkpoints.
cfg_checkpoint_count = 0

# Report progress on stderr every N exported file contents.
cfg_export_boundary = 1000

# Patterns used when rewriting branch/tag names into acceptable git refs.
ref_crud_re = re.compile(r'[[\x00-\x1f\x7f ~^:\\*?]+', re.S)  # runs of forbidden characters
ref_dotdot_re = re.compile(r'\.\.')                            # ".." sequences
ref_atbrace_re = re.compile(r'@\{')                            # "@{" sequences
ref_dotlock_re = re.compile(r'.*\.lock$', re.I)                # names ending in ".lock"
ref_separators_re = re.compile(r'/+')                          # one or more slashes
ref_collapse_re = re.compile(r'_+')                            # one or more underscores
def gitmode(flags):
    """Translate a file's flags into the mode string written to the stream.

    Returns '120000' when 'l' is among the flags, otherwise '100755' when
    'x' is, otherwise '100644'.  'l' takes precedence when both are set.
    (Presumably symlink / executable / regular file respectively.)
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without appending a newline.

    An empty (falsy) msg results in no write at all.
    """
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by a single newline.

    Called without arguments it emits just the newline.
    """
    wr_no_nl(msg)
    terminator = '\n'
    sys.stdout.write(terminator)
def checkpoint(count):
    """Bump the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is due when cfg_checkpoint_count is positive and the new
    counter value is a multiple of it.  Returns the incremented counter.
    """
    count += 1
    due = cfg_checkpoint_count > 0 and count % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count
def revnum_to_revref(rev, old_marks):
    """Turn an hg revision number into a git-fast-import revision reference.

    If old_marks holds a truthy entry for rev, that entry is returned;
    otherwise the mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1,f2):
    """Return True when the two file revisions differ.

    Both arguments are converted with node.hex() and the results compared.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft,dright,l=None,c=None,r=None,match=None):
    """Compare two manifests and classify their files.

    dleft  -- our manifest (mapping of file name to file revision; must
              also provide flags(name) when files exist on both sides)
    dright -- the parent's manifest, same shape as dleft
    l      -- list receiving files present only in dleft
    c      -- list receiving files present in both whose content (per
              match) or git mode differs
    r      -- list receiving files present only in dright
    match  -- callable(left_rev, right_rev) returning True on mismatch;
              defaults to file_mismatch

    Lists that are passed in are appended to in place, so a caller can
    accumulate results over several parents.  Omitted lists are created
    fresh on every call; the previous mutable defaults were shared
    between calls and therefore leaked entries from one call to the next.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []
    if match is None:
        # resolved at call time rather than at definition time
        match = file_mismatch

    for name in dleft.keys():
        theirs = dright.get(name, None)
        if theirs is None:
            # we have the file but our parent hasn't: add to left set
            l.append(name)
        elif match(dleft[name], theirs) or gitmode(dleft.flags(name)) != gitmode(dright.flags(name)):
            # both have it but content or mode differs: add to center set
            c.append(name)

    for name in dright.keys():
        if dleft.get(name, None) is None:
            # parent has the file but we don't: add to right set
            r.append(name)
        # changes were already handled when comparing child against parent
    return l, c, r
def get_filechanges(repo,revision,parents,mleft):
    """Collect added/changed/removed files of a revision against its parents.

    For every non-negative parent revision the parent's manifest is fetched
    and compared with mleft through split_dict(); the three result lists
    accumulate across parents.  Each list is sorted before being returned
    as (added, changed, removed).  The revision argument is not used here.
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(mleft, parent_manifest,
                                                 added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage,committer,authors):
    """Derive the author of a change from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a patch, so
    this tries to recover the author from the log message.

    The message is scanned from its end: trailing blank lines are skipped,
    then the contiguous run of Signed-off-by lines directly above is walked
    upwards and the topmost one of that run is used.  Its value is passed
    through fixup_user() together with the authors table.

    If the last non-blank line is not a Signed-off-by line (or the message
    is blank), the committer is returned unchanged.

    Only the trailing run is considered on purpose: a log message may
    accidentally contain a line starting with "Signed-off-by: foo"
    somewhere in the middle, and that must not be picked up.
    """
    lines = logmessage.split('\n')

    # position on the last line that carries any text
    idx = len(lines) - 1
    while idx >= 0 and not lines[idx].strip():
        idx -= 1

    # climb through the trailing Signed-off-by run, keeping the topmost hit
    topmost = None
    while idx >= 0:
        found = sob_re.match(lines[idx])
        if found is None:
            break
        topmost = found
        idx -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx,manifest,files,hgtags):
    """Write an inline 'M' (modify) command plus data for each given file.

    ctx      -- change context the file contents are read from
    manifest -- manifest queried for each file's flags
    files    -- names of the files to export
    hgtags   -- when false, a file named ".hgtags" is skipped

    Progress is reported on stderr every cfg_export_boundary files, and
    once more at the end when more than that many files were requested.
    """
    total = len(files)
    written = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        payload = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, strip_leading_slash(path)))
        # length is taken from the data itself (size() caused trouble)
        wr('data %d' % len(payload))
        wr(payload)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name,what="branch",flatten=False):
    """Rewrite name so that it roughly satisfies git-check-ref-format(1).

    name    -- the branch or tag name to clean up
    what    -- label used in the warning message only
    flatten -- when true, slashes are turned into underscores so the
               result is a one-level name

    A warning is written to stderr whenever the result differs from the
    input.  Returns the sanitized name.
    """

    def hide_leading_dot(component):
        # a leading '.' is replaced by '_'; empty components pass through
        if component.startswith('.'):
            return '_' + component[1:]
        return component

    # be paranoid just in case: never start from an empty string
    cleaned = name
    if name == '':
        cleaned = '_'

    cleaned = ref_crud_re.sub('_', cleaned)
    cleaned = ref_dotdot_re.sub('_', cleaned)
    cleaned = ref_atbrace_re.sub('_{', cleaned)
    if ref_dotlock_re.match(cleaned):
        # swap the '.' in front of "lock" for an underscore
        cleaned = cleaned[:-5] + '_' + cleaned[-4:]
    if cleaned[-1] in ('/', '.'):
        cleaned = cleaned[:-1] + '_'

    if flatten:
        cleaned = hide_leading_dot(cleaned)
        cleaned = ref_separators_re.sub('_', cleaned)
    else:
        cleaned = '/'.join([hide_leading_dot(part) for part in cleaned.split('/')])
        if cleaned[0] == '/':
            cleaned = '_' + cleaned[1:]
        cleaned = ref_separators_re.sub('/', cleaned)
    cleaned = ref_collapse_re.sub('_', cleaned)

    if cleaned != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what,name,cleaned))
    return cleaned
def strip_leading_slash(filename):
    """Return filename without its first character if that is a '/'.

    Only a single leading slash is removed.  An empty filename is returned
    unchanged (indexing filename[0] used to raise IndexError for it).
    """
    if filename.startswith('/'):
        return filename[1:]
    return filename
def export_commit(ui,repo,revision,old_marks,max,count,authors,sob,brmap,hgtags,flatten,notes):
    """Write one hg revision as a 'commit' command and return the counter.

    ui, repo  -- handles passed on to get_changeset() and used for lookups
    revision  -- hg revision number to export; its mark is revision+1
    old_marks -- known references for parent revisions
    max       -- total used in the progress message only
    count     -- running command counter (advanced through checkpoint())
    authors   -- author table handed to get_changeset()/get_author()
    sob       -- when true, an 'author' line is derived from Signed-off-by
    brmap     -- cache of sanitized branch names, updated in place
    hgtags    -- passed to export_file_contents()
    flatten   -- passed to sanitize_name()
    notes     -- passed to generate_note()
    """
    def get_branchname(name):
        # sanitize each hg branch name once and remember the result
        if name not in brmap:
            brmap[name] = sanitize_name(name, flatten=flatten)
        return brmap[name]

    changeset = get_changeset(ui, repo, revision, authors)
    (revnode, _, user, (time, timezone), files, desc, branch, _) = changeset

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than r0 starts its ref from scratch
    if not parents and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # announced length is one more than the message itself
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: with exactly one parent the changes
            # are taken from repo.status() instead of expensively comparing
            # checksums
            modified, added, removed = repo.status(repo.lookup(parents[0]), revnode)[:3]
            changed = modified
            kind = 'simple delta'
        else:
            # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # comparing checksums is expensive for many files, so it is only
            # done for merges where hg's revlog logic makes it necessary
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    removed = [strip_leading_slash(x) for x in removed]

    for gone in removed:
        wr('D %s' % gone)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count
def generate_note(user,time,timezone,revision,ctx,count,notes):
    """Optionally attach the hg hash of a revision as a note.

    When notes is false nothing is written and count is returned as is.
    Otherwise a commit on refs/notes/hg is written whose inline note for
    mark revision+1 carries ctx.hex(), and the counter advanced through
    checkpoint() is returned.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        hex_id = ctx.hex()
        wr('data %d' % (len(hex_id)))
        wr_no_nl(hex_id)
        wr()
        return checkpoint(count)
    return count
def export_tags(ui,repo,old_marks,mapping_cache,count,authors,flatten):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    Tags named 'tip' (after sanitizing) and tags whose node is absent from
    mapping_cache are skipped.  The counter is advanced through
    checkpoint() for every tag written and returned at the end.  The ui
    and authors arguments are not used here.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag", flatten=flatten)
        # ignore latest revision
        if tag == 'tip':
            continue
        hexnode = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])

        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dict.

    Each meaningful line has the form "key = value"; surrounding blanks of
    both parts are stripped.  Empty lines and lines starting with '#' are
    ignored.  Lines that do not match are reported on stderr with their
    line number and skipped.  A missing file yields an empty dict.

    The number of accepted entries is reported on stderr.

    The line and entry counters are initialised here (they were used
    without ever being assigned), and the file is closed even if reading
    fails half-way.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    entry_re = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')
    lineno = 0   # current line, for diagnostics
    loaded = 0   # accepted "key = value" entries

    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = entry_re.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store key -> value, both stripped of surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        f.close()

    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    '''Return the tipmost head in heads that is not marked closed.

    heads is scanned from its last element backwards; the first head whose
    changelog entry has no 'close' key in field 5 is returned.  If every
    head carries that key, the last element of heads is returned.
    '''
    fallback = heads[-1]
    for head in heads[::-1]:
        extra = repo.changelog.read(head)[5]
        if 'close' in extra:
            continue
        return head
    return fallback
def verify_heads(ui,repo,cache,force):
    """Check the hg branch heads against the cached git heads.

    Two checks are made: for every hg branch the current git SHA1 must
    equal the cached value, and no branch may own more than one
    repository head.  Each violation is reported on stderr.

    Returns False on the first violation unless force is true, in which
    case checking continues; returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    ordered = sorted([(-repo.changelog.rev(tip_node), tip_node, branch_name)
                      for branch_name, tip_node in tips.items()])

    # only hg's own branches are verified, not everything git has
    for _, _, hg_branch in ordered:
        git_branch = get_branch(hg_branch)
        sha1 = get_git_sha1(git_branch)
        cached = cache.get(git_branch)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch, sha1, cached))
            if not force:
                return False

    # verify that each branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl,m,marksfile,mappingfile,headsfile,tipfile,authors=None,sob=False,force=False,hgtags=False,flatten=False,notes=False):
    """Export an hg repository as a fast-import stream on stdout.

    repourl     -- location of the hg repository
    m           -- highest revision count to export; negative means "all"
    marksfile   -- file holding marks from a previous run
    mappingfile -- file holding the hg node -> revision mapping
    headsfile   -- file holding the git heads of the previous run
    tipfile     -- state file; its 'tip' entry is where exporting resumes
    authors     -- author table (defaults to an empty one)
    sob, hgtags, flatten, notes -- passed on to export_commit()
    force       -- passed on to verify_heads()

    Returns 1 when head verification fails, 0 otherwise.

    The command counter now starts at 0 explicitly (it was used without
    ever being assigned), and the authors default is no longer a dict
    shared between calls.
    """
    if authors is None:
        authors = {}

    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first = int(state_cache.get('tip', 0))
    last = _max
    if _max < 0 or last > tip:
        last = tip

    # (re)build the node -> revision mapping for everything up to 'last'
    for rev in range(0, last):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0        # number of commands issued so far
    brmap = {}   # hg branch name -> sanitized git branch name
    for rev in range(first, last):
        c = export_commit(ui, repo, rev, old_marks, last, c, authors, sob, brmap, hgtags, flatten, notes)

    state_cache['tip'] = last
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, flatten)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__=='__main__':
    def bail(parser,opt):
        """Complain about a missing option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("--flatten", action="store_true", dest="flatten",
                      default=False, help="Create one-level ref names (convert '/' to '_')")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-U", dest="unknown",
                      help="Email address to use for unknown instead of 'devnull@localhost'")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")

    (options, args) = parser.parse_args()

    # -1 stands for "no upper limit given"
    max_rev = options.max
    if max_rev is None:
        max_rev = -1

    # mandatory options, checked in this order; the first missing one aborts
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    author_map = {}
    if options.authorfile is not None:
        author_map = load_authors(options.authorfile)

    if options.unknown is not None and not set_unknown_addr(options.unknown):
        sys.stderr.write("Error: Invalid email address '%s'\n" % options.unknown)
        sys.exit(2)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    status = hg2git(options.repourl, max_rev, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile,
                    authors=author_map, sob=options.sob, force=options.force,
                    hgtags=options.hgtags, flatten=options.flatten, notes=options.notes)
    sys.exit(status)