hg-fast-export.sh: minor clean up and code optimization
[fast-export/rorcz.git] / hg-fast-export.py
blob27d3d00ead46646f05d9babdbb74bdb616788280
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name,set_unknown_addr
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, so every LF (\n) written would
    # be expanded to CRLF (\r\n). That makes git blow up, so switch stdout
    # to binary mode with the platform-specific msvcrt call.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

# Matches a "Signed-off-by:" line in a log message; group 1 is the signer.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command every this many commits; 0 disables it.
cfg_checkpoint_count = 0

# Write a progress message every this many file contents written.
cfg_export_boundary = 1000

# Patterns used to sanitize ref names.
# Runs of control characters, space and ~ ^ : \ * ? [
ref_crud_re = re.compile(r'[[\x00-\x1f\x7f ~^:\\*?]+', re.S)
# A literal ".." sequence
ref_dotdot_re = re.compile(r'\.\.')
# A literal "@{" sequence
ref_atbrace_re = re.compile(r'@\{')
# Names ending in ".lock" (case-insensitive)
ref_dotlock_re = re.compile(r'.*\.lock$', re.I)
# Runs of one or more "/"
ref_separators_re = re.compile(r'/+')
# Runs of one or more "_"
ref_collapse_re = re.compile(r'_+')
def gitmode(flags):
    """Translate a file's flags into the git file mode string.

    Flags containing 'l' give '120000' (checked first), flags containing
    'x' give '100755', and anything else gives '100644' (presumably
    symlink / executable / regular file).
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr(msg=''):
    """Write msg to stdout followed by a newline.

    An empty (falsy) msg produces just the newline.
    """
    out = sys.stdout
    if msg:
        out.write(msg)
    out.write('\n')
def checkpoint(count):
    """Advance the command counter and emit a 'checkpoint' when one is due.

    The counter is incremented first. If cfg_checkpoint_count is positive
    and the new value is a multiple of it, a note is written to stderr and
    a 'checkpoint' command plus a blank line is written to the stream.

    Returns the incremented counter.
    """
    bumped = count + 1
    due = cfg_checkpoint_count > 0 and bumped % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % bumped)
        wr('checkpoint')
        wr()
    return bumped
def revnum_to_revref(rev, old_marks):
    """Turn an hg revision number into a git-fast-import revision reference.

    If old_marks holds a truthy entry for rev (an SHA1), that entry is
    returned; otherwise the mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions differ.

    The comparison is made on the node.hex() form of each revision.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and collect added, changed and missing files.

    Parameters:
      dleft  -- manifest of the revision being exported; must provide
                keys(), get(), item access and flags()
      dright -- manifest of a parent, with the same interface
      l      -- list receiving files present in dleft but not in dright
      c      -- list receiving files present in both whose contents
                (per match) or git file modes differ
      r      -- list receiving files present in dright but not in dleft
      match  -- callable(left_entry, right_entry) returning True when the
                two entries differ

    Lists passed in are extended in place, so results can be accumulated
    over several parents; omitted lists start out empty on every call.

    Returns the tuple (l, c, r).
    """
    # Create fresh lists per call: list literals as default values would be
    # shared between calls and leak entries from one comparison to the next.
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # both have it but content or mode differs: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # changes were already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Collect added, changed and removed files of a revision against its parents.

    mleft is the manifest of the revision itself. It is compared with the
    manifest of every parent whose revision number is not negative, and the
    findings of all comparisons are accumulated (without de-duplication).
    The revision argument is not used.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for bucket in (added, changed, removed):
        bucket.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from trailing Signed-off-by lines.

    Git distinguishes the author of a patch from its committer. Blank lines
    at the end of the log message are skipped; starting at the last
    non-blank line, the contiguous run of Signed-off-by lines is followed
    upwards and the topmost line of that run names the author. Its
    signer is passed through fixup_user() with the authors table.

    Only the trailing run is considered on purpose: a line in the middle
    of a message may accidentally start with "Signed-off-by: foo" and must
    not be mistaken for authorship information.

    Returns the committer unchanged when the message does not end in a
    Signed-off-by line.
    """
    lines = logmessage.split('\n')

    # skip blank lines at the tail of the message
    pos = len(lines) - 1
    while pos >= 0 and not lines[pos].strip():
        pos -= 1

    # climb the trailing Signed-off-by run, remembering its topmost line
    topmost = None
    while pos >= 0:
        hit = sob_re.match(lines[pos])
        if hit is None:
            break
        topmost = hit
        pos -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags):
    """Write an inline 'M' (file modify) command with data for each file.

    For every name in files the content is read from ctx and written as
    'M <mode> inline <name>' followed by a 'data <length>' header and the
    content itself. A file named ".hgtags" is skipped (with a note on
    stderr) unless hgtags is true.

    Progress goes to stderr every cfg_export_boundary written files, plus a
    final summary when the list is longer than that boundary.
    """
    total = len(files)
    written = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if not hgtags and path == ".hgtags":
            sys.stderr.write('Skip %s\n' % path)
            continue
        content = ctx.filectx(path).data()
        mode = gitmode(manifest.flags(path))
        wr('M %s inline %s' % (mode, path))
        # length of the data actually read (the original author noted
        # trouble with size())
        wr('data %d' % len(content))
        wr(content)
        written += 1
        if written % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (written, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (written, total))
def sanitize_name(name, what="branch", flatten=False):
    """Sanitize a ref name roughly according to git-check-ref-format(1).

    Parameters:
      name    -- the raw name
      what    -- kind of name, only used in the warning message
      flatten -- when true, '/' separators are turned into '_' so the
                 result is a one-level name

    A warning is written to stderr when the result differs from name.
    Returns the sanitized name.
    """
    def hide_leading_dot(component):
        # replace a leading '.' of a path component by '_'
        if component[:1] == '.':
            return '_' + component[1:]
        return component

    # be paranoid just in case: never work on an empty name
    cleaned = name
    if cleaned == '':
        cleaned = '_'

    substitutions = (
        (ref_crud_re, '_'),
        (ref_dotdot_re, '_'),
        (ref_atbrace_re, '_{'),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    if ref_dotlock_re.match(cleaned):
        # turn the trailing ".lock" into "_lock"
        cleaned = cleaned[:-5] + '_' + cleaned[-4:]
    if cleaned[-1] in ('/', '.'):
        cleaned = cleaned[:-1] + '_'

    if flatten:
        cleaned = hide_leading_dot(cleaned)
        cleaned = ref_separators_re.sub('_', cleaned)
    else:
        parts = [hide_leading_dot(part) for part in cleaned.split('/')]
        cleaned = '/'.join(parts)
        if cleaned[0] == '/':
            cleaned = '_' + cleaned[1:]
        cleaned = ref_separators_re.sub('/', cleaned)
    cleaned = ref_collapse_re.sub('_', cleaned)

    if cleaned != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, cleaned))
    return cleaned
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap, hgtags, flatten):
    """Write one hg revision as a git-fast-import 'commit' command.

    Parameters:
      ui, repo  -- Mercurial ui and repository objects
      revision  -- hg revision number to export
      old_marks -- mapping of revision numbers to already known git refs
      max       -- total number of revisions, used for progress output only
      count     -- running command counter (see checkpoint())
      authors   -- author mapping handed to get_changeset()/get_author()
      sob       -- when true, emit an 'author' line derived from
                   Signed-off-by lines
      brmap     -- cache of raw branch name -> sanitized branch name;
                   updated in place
      hgtags    -- when true, .hgtags files are exported too
      flatten   -- passed to sanitize_name() for branch names

    Returns the updated command counter.
    """
    def get_branchname(name):
        # Sanitize each branch name only once; later lookups hit the cache.
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name, flatten=flatten)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision, authors)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than revision 0 starts its ref afresh
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1: the newline wr() appends after desc is counted as message data
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it
            # for merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, kind, revision + 1, max, len(added), len(changed), len(removed)))

    # explicit loop: a map() used only for its side effects would be lazy
    # (and therefore write nothing) on Python 3
    for path in removed:
        wr('D %s' % path)
    export_file_contents(ctx, man, added, hgtags)
    export_file_contents(ctx, man, changed, hgtags)
    wr()

    return checkpoint(count)
def export_tags(ui, repo, old_marks, mapping_cache, count, authors, flatten):
    """Write a 'reset refs/tags/<tag>' command for every exportable hg tag.

    The tag named 'tip' is ignored, as are tags whose node is not present
    in mapping_cache (keyed by hex node, valued by revision number).
    Each exported tag advances the command counter through checkpoint().

    Returns the updated command counter.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag", flatten=flatten)
        # ignore latest revision
        if tag == 'tip':
            continue

        # ignore tags to nodes that are missing (ie, 'in the future')
        hexnode = tag_node.encode('hex_codec')
        if hexnode not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hexnode))
            continue

        rev = int(mapping_cache[hexnode])
        ref = revnum_to_revref(rev, old_marks)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag %s at r%d\n' % (tag, rev))
            continue

        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dict.

    Each useful line has the form 'key = value'; the key ends at the first
    '=' and both sides are stripped of surrounding whitespace. Blank lines
    and lines starting with '#' are ignored. Lines without '=' are reported
    on stderr with their 1-based line number and skipped.

    Returns an empty dict when the file does not exist. The number of
    accepted lines is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lineno = 0   # current line number, for error messages
    loaded = 0   # number of accepted mapping lines
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')

    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # store key -> value, both without surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # close the file even if reading fails midway
        f.close()

    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    """Return the tipmost head in heads whose changelog entry is not closed.

    Heads are examined from last to first; the first one whose changelog
    entry (field 5) does not contain 'close' wins. If every head
    is closed, the last head is returned.
    """
    for head in reversed(heads):
        if 'close' not in repo.changelog.read(head)[5]:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force):
    """Check that git branches and hg heads are in a state safe to export.

    Two checks are made:
      1. for every hg branch, the SHA1 git reports for the corresponding
         git branch must equal the one stored in cache;
      2. no branch name may occur on more than one repository head.

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    tips = {}
    for name, heads in repo.branchmap().iteritems():
        tips[name] = branchtip(repo, heads)
    # order branches by descending revision number of their tip
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in tips.items()])

    # get list of hg's branches to verify, don't take all git has
    for _, _, name in ordered:
        git_branch = get_branch(name)
        sha1 = get_git_sha1(git_branch)
        cached = cache.get(git_branch)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for head in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, head)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(head))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors=None, sob=False, force=False, hgtags=False, flatten=False):
    """Export an hg repository as a git-fast-import stream on stdout.

    Parameters:
      repourl     -- location of the hg repository
      m           -- maximum hg revision to export; negative means no limit
      marksfile   -- file with git-fast-import's marks from a previous run
      mappingfile -- file holding the hex node -> revision number mapping
      headsfile   -- file with the git heads of the previous run
      tipfile     -- state file recording how far the previous run got
      authors     -- author mapping (defaults to an empty mapping)
      sob         -- derive authors from Signed-off-by lines
      force       -- continue despite failed head verification
      hgtags      -- also export .hgtags files
      flatten     -- produce one-level ref names

    Returns 0 on success and 1 when head verification fails.
    """
    # fresh dict per call instead of a shared mutable default argument
    if authors is None:
        authors = {}

    limit = int(m)

    # marks are written as revision+1, hence the -1 when reading them back
    # (presumably load_cache applies the callable to each value -- confirm)
    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # changelog.count() is not available on every Mercurial version;
    # fall back to len(repo) when it is missing
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    # resume where the previous run stopped and never go past the tip
    first_rev = int(state_cache.get('tip', 0))
    last_rev = limit
    if limit < 0 or last_rev > tip:
        last_rev = tip

    # record the node -> revision mapping for all revisions up to last_rev
    for rev in range(0, last_rev):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    c = 0  # number of commands issued so far
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui, repo, rev, old_marks, last_rev, c, authors, sob, brmap, hgtags, flatten)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors, flatten)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print the help and exit with status 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("--flatten", action="store_true", dest="flatten",
                      default=False, help="Create one-level ref names (convert '/' to '_')")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-U", dest="unknown",
                      help="Email address to use for unknown instead of 'devnull@localhost'")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")

    (options, args) = parser.parse_args()

    # -1 means "no upper limit" for hg2git()
    if options.max is None:
        max_rev = -1
    else:
        max_rev = options.max

    # mandatory options, checked in this order; the first missing one aborts
    required = (
        ('marksfile', '--marks'),
        ('mappingfile', '--mapping'),
        ('headsfile', '--heads'),
        ('statusfile', '--status'),
        ('repourl', '--repo'),
    )
    for dest, flag in required:
        if getattr(options, dest) is None:
            bail(parser, flag)

    author_map = {}
    if options.authorfile is not None:
        author_map = load_authors(options.authorfile)

    if options.unknown is not None:
        if not set_unknown_addr(options.unknown):
            sys.stderr.write("Error: Invalid email address '%s'\n" % options.unknown)
            sys.exit(2)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    sys.exit(hg2git(options.repourl, max_rev, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile, authors=author_map,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    flatten=options.flatten))