Make -e option apply to imported filenames
[fast-export.git] / hg-fast-export.py
blob0c683c5f23f31531efa753b4d5d5915888686c23
1 #!/usr/bin/env python
3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial import node
7 from hg2git import setup_repo,fixup_user,get_branch,get_changeset
8 from hg2git import load_cache,save_cache,get_git_sha1,set_default_branch,set_origin_name
9 from optparse import OptionParser
10 import re
11 import sys
12 import os
if sys.platform == "win32":
    # Windows opens sys.stdout in text mode, which expands every LF written
    # into CRLF. git cannot cope with that in the stream, so switch the
    # descriptor to binary mode before any output is produced.
    import msvcrt
    stdout_fd = sys.stdout.fileno()
    msvcrt.setmode(stdout_fd, os.O_BINARY)
# Pattern recognising a Signed-off-by line in a log message; group 1
# captures everything after the "Signed-off-by: " prefix.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Emit a 'checkpoint' command after this many commits (0 disables it).
cfg_checkpoint_count = 0

# Print a progress message to stderr every this many exported file contents.
cfg_export_boundary = 1000
def gitmode(flags):
    """Return the git file mode string for a set of Mercurial file flags.

    An 'l' flag takes precedence over an 'x' flag; with neither present the
    regular-file mode is returned.
    """
    if 'l' in flags:
        return '120000'
    if 'x' in flags:
        return '100755'
    return '100644'
def wr_no_nl(msg=''):
    """Write msg to stdout without a trailing newline; empty msg is a no-op."""
    if not msg:
        return
    sys.stdout.write(msg)
def wr(msg=''):
    """Write msg to stdout followed by a single newline.

    With no (or an empty) msg only the newline is written.
    """
    wr_no_nl(msg)
    sys.stdout.write('\n')
def checkpoint(count):
    """Advance the command counter and emit a 'checkpoint' when one is due.

    A checkpoint is written whenever the incremented counter is a multiple of
    cfg_checkpoint_count; a value of 0 for that setting disables checkpoints.
    Returns the incremented counter.
    """
    total = count + 1
    due = cfg_checkpoint_count > 0 and total % cfg_checkpoint_count == 0
    if due:
        sys.stderr.write("Checkpoint after %d commits\n" % total)
        wr('checkpoint')
        wr()
    return total
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    If old_marks holds a (truthy) entry for rev, that entry is returned;
    otherwise the mark ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when two file revisions have different hex node ids."""
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Compare two manifests and collect added, changed and removed files.

    dleft  -- manifest of the revision being exported
    dright -- manifest of one of its parents
    l      -- list receiving files present only in dleft
    c      -- list receiving files present in both whose content (per
              `match`) or git mode differs
    r      -- list receiving files present only in dright
    match  -- callable(left_entry, right_entry) returning True on mismatch

    The three lists are appended to in place so results can be accumulated
    over several parents. When a list is omitted a fresh one is created for
    this call; the previous `[]` defaults were shared between calls and
    silently carried entries over from one invocation to the next.

    Returns the tuple (l, c, r).
    """
    if l is None:
        l = []
    if c is None:
        c = []
    if r is None:
        r = []

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums or modes mismatch: add to center set
            c.append(left)

    for right in dright.keys():
        if dleft.get(right, None) is None:
            # parent has the file but we don't: add to right set
            r.append(right)
        # changes were already handled when comparing child against parent

    return l, c, r
def get_filechanges(repo, revision, parents, mleft):
    """Find added, changed and removed files of a revision against its parents.

    mleft is the manifest of the revision itself; it is compared with the
    manifest of every non-negative parent revision and the results are
    accumulated. `revision` is accepted but not used here.

    Returns three sorted lists: (added, changed, removed).
    """
    added, changed, removed = [], [], []
    for parent in parents:
        if p_is_valid(parent) if False else parent >= 0:
            parent_manifest = repo.changectx(parent).manifest()
            added, changed, removed = split_dict(
                mleft, parent_manifest, added, changed, removed)
    for names in (added, changed, removed):
        names.sort()
    return added, changed, removed
def get_author(logmessage, committer, authors):
    """Derive the git author of a commit from trailing Signed-off-by lines.

    git distinguishes between the author and the committer of a patch. Blank
    lines at the end of the log message are skipped; starting from the last
    non-blank line, the contiguous run of Signed-off-by lines is followed
    upwards and the topmost line of that run supplies the author, which is
    passed through fixup_user() together with the authors table.

    Only a run at the very end of the message counts, so a line in the middle
    of the message that merely looks like "Signed-off-by: foo" is ignored.
    When the last non-blank line is not a Signed-off-by line, the committer
    is returned unchanged.
    """
    lines = logmessage.split('\n')

    # position on the last non-blank line (-1 if there is none)
    pos = len(lines) - 1
    while pos >= 0 and not lines[pos].strip():
        pos -= 1

    # climb through the trailing block of Signed-off-by lines, remembering
    # the topmost match
    topmost = None
    while pos >= 0:
        found = sob_re.match(lines[pos])
        if found is None:
            break
        topmost = found
        pos -= 1

    if topmost is None:
        return committer
    return fixup_user(topmost.group(1), authors)
def export_file_contents(ctx, manifest, files, hgtags, encoding=''):
    """Emit an inline 'M' (filemodify) command with data for each file.

    ctx      -- change context the file contents are read from
    manifest -- manifest used to look up each file's flags
    files    -- names of the files to export
    hgtags   -- when false, a file named ".hgtags" is skipped
    encoding -- if non-empty, file names are decoded from this encoding and
                re-encoded as UTF-8 before being written

    Progress is reported on stderr every cfg_export_boundary files, plus a
    final line when the file list is longer than that boundary.
    """
    total = len(files)
    exported = 0
    for path in files:
        # Skip .hgtags files. They only get us in trouble.
        if path == ".hgtags" and not hgtags:
            sys.stderr.write('Skip %s\n' % (path))
            continue
        data = ctx.filectx(path).data()
        git_path = path
        if encoding:
            git_path = path.decode(encoding).encode('utf8')
        wr('M %s inline %s' % (gitmode(manifest.flags(path)), git_path))
        # the length is taken from the data itself rather than from size()
        wr('data %d' % len(data))
        wr(data)
        exported += 1
        if exported % cfg_export_boundary == 0:
            sys.stderr.write('Exported %d/%d files\n' % (exported, total))
    if total > cfg_export_boundary:
        sys.stderr.write('Exported %d/%d files\n' % (exported, total))
def sanitize_name(name, what="branch"):
    """Sanitize input roughly according to git-check-ref-format(1).

    name -- the branch or tag name to clean up
    what -- label used in the warning message ("branch", "tag", ...)

    The following rewrites are applied, in order:
      * '[', space, '~', '^', ':', '?', backslash, '*' and the sequence '..'
        are each replaced by '_'
      * a trailing '/' or '.' is replaced by '_'
      * a leading '.' of any '/'-separated component is replaced by '_'
      * runs of '_' are collapsed into a single '_'

    Empty names and empty path components (as in 'a//b') are passed through
    unchanged instead of raising IndexError.

    A warning is written to stderr when the name had to be changed.
    Returns the sanitized name.
    """
    def dot(component):
        # an empty component has no leading character to inspect
        if not component:
            return component
        if component[0] == '.':
            return '_' + component[1:]
        return component

    n = name
    p = re.compile(r'([\[ ~^:?\\*]|\.\.)')
    n = p.sub('_', n)
    # slicing (rather than indexing) keeps this safe for an empty string
    if n[-1:] in ('/', '.'):
        n = n[:-1] + '_'
    n = '/'.join(map(dot, n.split('/')))
    p = re.compile('_+')
    n = p.sub('_', n)

    if n != name:
        sys.stderr.write('Warning: sanitized %s [%s] to [%s]\n' % (what, name, n))
    return n
def export_commit(ui, repo, revision, old_marks, max, count, authors, sob, brmap, hgtags, notes, encoding=''):
    """Write the fast-import commands for one Mercurial revision.

    ui, repo  -- Mercurial ui and repository objects
    revision  -- revision number to export
    old_marks -- mapping consulted by revnum_to_revref() for parent refs
    max       -- upper revision bound, used only in the progress message
    count     -- running command counter
    authors   -- author mapping passed to get_changeset()/get_author()
    sob       -- when true, emit an 'author' line derived from
                 Signed-off-by lines in the commit message
    brmap     -- cache of already sanitized branch names (updated in place)
    hgtags    -- whether .hgtags files are exported
    notes     -- whether a git note with the hg hash is generated
    encoding  -- if non-empty, names of removed files are decoded from this
                 encoding and re-encoded as UTF-8 (also forwarded to
                 get_changeset() and export_file_contents())

    Returns the updated command counter.
    """
    def get_branchname(name):
        # sanitize every branch name only once and remember the result
        if name in brmap:
            return brmap[name]
        n = sanitize_name(name)
        brmap[name] = n
        return n

    (revnode, _, user, (time, timezone), files, desc, branch, _) = get_changeset(ui, repo, revision, authors, encoding)

    branch = get_branchname(branch)

    parents = [p for p in repo.changelog.parentrevs(revision) if p >= 0]

    # a parentless revision other than the very first one starts a new root
    if len(parents) == 0 and revision != 0:
        wr('reset refs/heads/%s' % branch)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    if sob:
        wr('author %s %d %s' % (get_author(desc, user, authors), time, timezone))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1 because wr() appends a newline after the message, and that newline
    # is counted as part of the data
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()
    added, changed, removed, delta_kind = [], [], [], ''

    if len(parents) == 0:
        # first revision: feed in full manifest
        added = sorted(man.keys())
        delta_kind = 'full'
    else:
        wr('from %s' % revnum_to_revref(parents[0], old_marks))
        if len(parents) == 1:
            # later non-merge revision: feed in changed manifest
            # if we have exactly one parent, just take the changes from the
            # manifest without expensively comparing checksums
            f = repo.status(repo.lookup(parents[0]), revnode)[:3]
            added, changed, removed = f[1], f[0], f[2]
            delta_kind = 'simple delta'
        else:  # a merge with two parents
            wr('merge %s' % revnum_to_revref(parents[1], old_marks))
            # later merge revision: feed in changed manifest
            # for many files comparing checksums is expensive so only do it
            # for merges where we really need it due to hg's revlog logic
            added, changed, removed = get_filechanges(repo, revision, parents, man)
            delta_kind = 'thorough delta'

    sys.stderr.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
                     (branch, delta_kind, revision + 1, max, len(added), len(changed), len(removed)))

    if encoding:
        removed = [r.decode(encoding).encode('utf8') for r in removed]

    # explicit loop: map() must not be relied on for side effects, since a
    # lazy map() would never emit the delete commands
    for r in removed:
        wr('D %s' % r)
    export_file_contents(ctx, man, added, hgtags, encoding)
    export_file_contents(ctx, man, changed, hgtags, encoding)
    wr()

    count = checkpoint(count)
    count = generate_note(user, time, timezone, revision, ctx, count, notes)
    return count
def generate_note(user, time, timezone, revision, ctx, count, notes):
    """Attach the hg changeset hash to an exported commit as a git note.

    Does nothing and returns count unchanged when notes is false. Otherwise a
    commit on refs/notes/hg is written whose note, attached to mark
    ':<revision+1>', contains ctx.hex(); the counter is then passed through
    checkpoint() and the result returned.
    """
    if notes:
        wr('commit refs/notes/hg')
        wr('committer %s %d %s' % (user, time, timezone))
        wr('data 0')
        wr('N inline :%d' % (revision + 1))
        changeset_hex = ctx.hex()
        wr('data %d' % (len(changeset_hex)))
        # the hash itself is written without a newline; the terminating
        # newline comes from the bare wr() below
        wr_no_nl(changeset_hex)
        wr()
        return checkpoint(count)
    return count
def export_tags(ui, repo, old_marks, mapping_cache, count, authors):
    """Emit a 'reset refs/tags/<tag>' command for every exportable hg tag.

    The tag 'tip' is skipped, as are tags whose node (hex encoded) is not a
    key of mapping_cache. The counter is run through checkpoint() once per
    exported tag; the final counter is returned.
    """
    for raw_tag, tag_node in repo.tagslist():
        tag = sanitize_name(raw_tag, "tag")
        # ignore latest revision
        if tag == 'tip':
            continue
        hex_node = tag_node.encode('hex_codec')
        # ignore tags to nodes that are missing (ie, 'in the future')
        if hex_node not in mapping_cache:
            sys.stderr.write('Tag %s refers to unseen node %s\n' % (tag, hex_node))
            continue

        rev = int(mapping_cache[hex_node])

        ref = revnum_to_revref(rev, old_marks)
        if ref == None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag, rev, ref))
        wr('reset refs/tags/%s' % tag)
        wr('from %s' % ref)
        wr()
        count = checkpoint(count)
    return count
def load_authors(filename):
    """Read an author map file and return it as a dictionary.

    Each useful line has the form 'key = value'; surrounding whitespace is
    stripped from both parts. Blank lines and lines starting with '#' are
    ignored. Lines that do not match are reported on stderr together with
    their line number and then skipped.

    Returns an empty dictionary when the file does not exist. Otherwise the
    number of loaded entries is reported on stderr.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache

    lineno = 0   # current line number, used in error messages
    loaded = 0   # number of accepted 'key = value' lines
    lre = re.compile('^([^=]+)[ ]*=[ ]*(.+)$')

    f = open(filename, 'r')
    try:
        for line in f:
            lineno += 1
            line = line.strip()
            if line == '' or line[0] == '#':
                continue
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, lineno))
                continue
            # put key:value in cache, both stripped of surrounding whitespace
            cache[m.group(1).strip()] = m.group(2).strip()
            loaded += 1
    finally:
        # close the file even if reading or parsing fails
        f.close()

    sys.stderr.write('Loaded %d authors\n' % loaded)
    return cache
def branchtip(repo, heads):
    '''Return the tipmost branch head in heads.

    Heads are examined from last to first; the first one whose changelog
    entry has no 'close' key in its sixth field is returned. If every head
    carries that key, the last head is returned.
    '''
    for head in reversed(heads):
        extra = repo.changelog.read(head)[5]
        if 'close' not in extra:
            return head
    return heads[-1]
def verify_heads(ui, repo, cache, force):
    """Check that the export can safely continue from the cached state.

    Two checks are made:
      1. for the tip of every hg branch, the SHA1 reported by get_git_sha1()
         must equal the value stored in cache for that branch;
      2. no branch may be reported for more than one of repo.heads().

    Each violation is reported on stderr. Returns False at the first
    violation unless force is true; returns True otherwise.
    """
    tips = {}
    for branch_name, heads in repo.branchmap().iteritems():
        tips[branch_name] = branchtip(repo, heads)
    ordered = sorted([(-repo.changelog.rev(n), n, t) for t, n in tips.items()])

    # get list of hg's branches to verify, don't take all git has
    for _, _, hg_branch in ordered:
        git_branch = get_branch(hg_branch)
        sha1 = get_git_sha1(git_branch)
        cached = cache.get(git_branch)
        if sha1 != cached:
            sys.stderr.write('Error: Branch [%s] modified outside hg-fast-export:'
                             '\n%s (repo) != %s (cache)\n' % (git_branch, sha1, cached))
            if not force:
                return False

    # verify that branch has exactly one head
    seen = set()
    for h in repo.heads():
        (_, _, _, _, _, _, branch, _) = get_changeset(ui, repo, h)
        if branch in seen:
            sys.stderr.write('Error: repository has at least one unnamed head: hg r%s\n' %
                             repo.changelog.rev(h))
            if not force:
                return False
        seen.add(branch)

    return True
def hg2git(repourl, m, marksfile, mappingfile, headsfile, tipfile, authors={}, sob=False, force=False, hgtags=False, notes=False, encoding=''):
    """Export a Mercurial repository as a git-fast-import stream on stdout.

    repourl     -- location of the hg repository, passed to setup_repo()
    m           -- maximum revision to export; a negative value, or one
                   beyond the repository tip, means "up to the tip"
    marksfile   -- cache file of marks (each value is loaded as int(s)-1)
    mappingfile -- cache file mapping hex node ids to revision numbers
    headsfile   -- cache file of heads, checked by verify_heads()
    tipfile     -- state cache file; its 'tip' entry is the first revision
                   to export and is updated after the export
    authors     -- author mapping forwarded to the export helpers
    sob, hgtags, notes, encoding -- forwarded to export_commit()
    force       -- forwarded to verify_heads()

    Returns 1 when verify_heads() fails and 0 on success.
    """
    _max = int(m)

    old_marks = load_cache(marksfile, lambda s: int(s) - 1)
    mapping_cache = load_cache(mappingfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    ui, repo = setup_repo(repourl)

    if not verify_heads(ui, repo, heads_cache, force):
        return 1

    # fall back to len(repo) when the changelog has no count() method
    try:
        tip = repo.changelog.count()
    except AttributeError:
        tip = len(repo)

    first_rev = int(state_cache.get('tip', 0))
    last_rev = _max
    if _max < 0 or last_rev > tip:
        last_rev = tip

    # record the node -> revision mapping for every revision below the bound
    for rev in range(0, last_rev):
        (revnode, _, _, _, _, _, _, _) = get_changeset(ui, repo, rev, authors)
        mapping_cache[revnode.encode('hex_codec')] = str(rev)

    # running command counter; it must start at zero before the first
    # export_commit() call, which receives and returns it
    c = 0
    brmap = {}
    for rev in range(first_rev, last_rev):
        c = export_commit(ui, repo, rev, old_marks, last_rev, c, authors, sob, brmap, hgtags, notes, encoding)

    state_cache['tip'] = last_rev
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)
    save_cache(mappingfile, mapping_cache)

    c = export_tags(ui, repo, old_marks, mapping_cache, c, authors)

    sys.stderr.write('Issued %d commands\n' % c)

    return 0
if __name__ == '__main__':
    def bail(parser, opt):
        """Report a missing mandatory option, print usage and exit with 2."""
        sys.stderr.write('Error: No %s option given\n' % opt)
        parser.print_help()
        sys.exit(2)

    parser = OptionParser()

    parser.add_option("-m", "--max", type="int", dest="max",
                      help="Maximum hg revision to import")
    parser.add_option("--mapping", dest="mappingfile",
                      help="File to read last run's hg-to-git SHA1 mapping")
    parser.add_option("--marks", dest="marksfile",
                      help="File to read git-fast-import's marks from")
    parser.add_option("--heads", dest="headsfile",
                      help="File to read last run's git heads from")
    parser.add_option("--status", dest="statusfile",
                      help="File to read status from")
    parser.add_option("-r", "--repo", dest="repourl",
                      help="URL of repo to import")
    parser.add_option("-s", action="store_true", dest="sob",
                      default=False, help="Enable parsing Signed-off-by lines")
    parser.add_option("--hgtags", action="store_true", dest="hgtags",
                      default=False, help="Enable exporting .hgtags files")
    parser.add_option("-A", "--authors", dest="authorfile",
                      help="Read authormap from AUTHORFILE")
    parser.add_option("-f", "--force", action="store_true", dest="force",
                      default=False, help="Ignore validation errors by force")
    parser.add_option("-M", "--default-branch", dest="default_branch",
                      help="Set the default branch")
    parser.add_option("-o", "--origin", dest="origin_name",
                      help="use <name> as namespace to track upstream")
    parser.add_option("--hg-hash", action="store_true", dest="notes",
                      default=False, help="Annotate commits with the hg hash as git notes in the hg namespace")
    parser.add_option("-e", dest="encoding",
                      help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")

    (options, args) = parser.parse_args()

    # -1 means "no limit": export up to the repository tip
    m = -1
    if options.max is not None:
        m = options.max

    # mandatory options, checked in this order; bail() does not return
    required = (
        (options.marksfile, '--marks'),
        (options.mappingfile, '--mapping'),
        (options.headsfile, '--heads'),
        (options.statusfile, '--status'),
        (options.repourl, '--repo'),
    )
    for value, flag in required:
        if value is None:
            bail(parser, flag)

    a = {}
    if options.authorfile is not None:
        a = load_authors(options.authorfile)

    if options.default_branch is not None:
        set_default_branch(options.default_branch)

    if options.origin_name is not None:
        set_origin_name(options.origin_name)

    encoding = ''
    if options.encoding is not None:
        encoding = options.encoding

    sys.exit(hg2git(options.repourl, m, options.marksfile, options.mappingfile,
                    options.headsfile, options.statusfile, authors=a,
                    sob=options.sob, force=options.force, hgtags=options.hgtags,
                    notes=options.notes, encoding=encoding))