3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial
import node
7 from hg2git
import setup_repo
,fixup_user
,get_branch
,get_changeset
8 from hg2git
import load_cache
,save_cache
,get_git_sha1
,set_default_branch
,set_origin_name
,set_unknown_addr
9 from optparse
import OptionParser
14 if sys
.platform
== "win32":
15 # On Windows, sys.stdout is initially opened in text mode, which means that
16 # when a LF (\n) character is written to sys.stdout, it will be converted
17 # into CRLF (\r\n). That makes git blow up, so use this platform-specific
18 # code to change the mode of sys.stdout to binary.
20 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
# Pattern recognising a Signed-off-by trailer line in a log message;
# group 1 captures everything that follows the "Signed-off-by: " prefix.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')

# A 'checkpoint' command is inserted after this many commits;
# 0 means no checkpoint is ever inserted.
cfg_checkpoint_count = 0

# A progress message is written every this many file contents written.
cfg_export_boundary = 1000

# Regexes used for ref name manipulation.
# Runs of '[', control characters (0x00-0x1f, 0x7f), space, '~', '^', ':',
# backslash, '*' and '?'.
ref_crud_re = re.compile(r'[[\x00-\x1f\x7f ~^:\\*?]+', re.DOTALL)
# A literal ".." sequence.
ref_dotdot_re = re.compile(r'\.\.')
# A literal "@{" sequence.
ref_atbrace_re = re.compile(r'@\{')
# Any name ending in ".lock", compared case-insensitively.
ref_dotlock_re = re.compile(r'.*\.lock$', re.IGNORECASE)
# One or more consecutive '/' characters.
ref_separators_re = re.compile(r'/+')
# One or more consecutive '_' characters.
ref_collapse_re = re.compile(r'_+')
37 return 'l' in flags
and '120000' or 'x' in flags
and '100755' or '100644'
42 sys
.stdout
.write('\n')
43 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
45 def checkpoint(count
):
47 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
48 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def revnum_to_revref(rev, old_marks):
    """Convert an hg revnum to a git-fast-import rev reference.

    Returns the entry stored for ``rev`` in ``old_marks`` when that entry
    is truthy (presumably an SHA1 -- confirm against the marks loader);
    otherwise returns the mark string ':<rev+1>'.
    """
    known_ref = old_marks.get(rev)
    if known_ref:
        return known_ref
    # No usable entry recorded: fall back to the mark, which is rev + 1.
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Report whether two revisions of a file differ.

    Both arguments are passed through ``node.hex`` and the resulting values
    are compared; returns True when they are not equal.
    """
    left_hex = node.hex(f1)
    right_hex = node.hex(f2)
    return left_hex != right_hex
62 def split_dict(dleft
,dright
,l
=[],c
=[],r
=[],match
=file_mismatch
):
63 """Loop over our repository and find all changed and missing files."""
64 for left
in dleft
.keys():
65 right
=dright
.get(left
,None)
67 # we have the file but our parent hasn't: add to left set
69 elif match(dleft
[left
],right
) or gitmode(dleft
.flags(left
))!=gitmode(dright
.flags(left
)):
70 # we have it but checksums mismatch: add to center set
72 for right
in dright
.keys():
73 left
=dleft
.get(right
,None)
75 # if parent has file but we don't: add to right set
77 # change is already handled when comparing child against parent
80 def get_filechanges(repo
,revision
,parents
,mleft
):
81 """Given some repository and revision, find all changed/deleted files."""
85 mright
=repo
.changectx(p
).manifest()
86 l
,c
,r
=split_dict(mleft
,mright
,l
,c
,r
)
92 def get_author(logmessage
,committer
,authors
):
93 """As git distinguishes between author and committer of a patch, try to
94 extract author by detecting Signed-off-by lines.
96 This walks from the end of the log message towards the top skipping
97 empty lines. Upon the first non-empty line, it walks all Signed-off-by
98 lines upwards to find the first one. For that (if found), it extracts
99 authorship information the usual way (authors table, cleaning, etc.)
101 If no Signed-off-by line is found, this defaults to the committer.
103 This may sound stupid (and it somehow is), but in log messages we
104 accidentally may have lines in the middle starting with
105 "Signed-off-by: foo" and thus matching our detection regex. Prevent
108 loglines
=logmessage
.split('\n')
110 # from tail walk to top skipping empty lines
113 if len(loglines
[i
].strip())==0: continue
116 # walk further upwards to find first sob line, store in 'first'
119 m
=sob_re
.match(loglines
[i
])
123 # if the last non-empty line matches our Signed-Off-by regex: extract username
125 r
=fixup_user(first
.group(1),authors
)
129 def export_file_contents(ctx
,manifest
,files
,hgtags
):
133 # Skip .hgtags files. They only get us in trouble.
134 if not hgtags
and file == ".hgtags":
135 sys
.stderr
.write('Skip %s\n' % (file))
137 d
=ctx
.filectx(file).data()
138 wr('M %s inline %s' % (gitmode(manifest
.flags(file)),file))
139 wr('data %d' % len(d
)) # had some trouble with size()
142 if count
%cfg_export
_boundary
==0:
143 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
144 if max>cfg_export_boundary
:
145 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
147 def sanitize_name(name
,what
="branch",flatten
=False):
148 """Sanitize input roughly according to git-check-ref-format(1)"""
151 if len(name
) >= 1 and name
[0] == '.': return '_'+name
[1:]
155 # be paranoid just in case
159 n
= ref_crud_re
.sub('_', n
)
160 n
= ref_dotdot_re
.sub('_', n
)
161 n
= ref_atbrace_re
.sub('_{', n
)
162 if ref_dotlock_re
.match(n
):
163 n
= n
[:-5] + '_' + n
[-4:]
164 if n
[-1] in ('/', '.'): n
=n
[:-1]+'_'
166 if n
[0] == '.': n
='_'+n
[1:]
167 n
= ref_separators_re
.sub('_', n
)
169 n
= '/'.join(map(dot
, n
.split('/')))
170 if n
[0] == '/': n
='_'+n
[1:]
171 n
= ref_separators_re
.sub('/', n
)
172 n
= ref_collapse_re
.sub('_', n
)
175 sys
.stderr
.write('Warning: sanitized %s [%s] to [%s]\n' % (what
,name
,n
))
178 def export_commit(ui
,repo
,revision
,old_marks
,max,count
,authors
,sob
,brmap
,hgtags
,flatten
):
179 def get_branchname(name
):
180 if brmap
.has_key(name
):
182 n
=sanitize_name(name
,flatten
=flatten
)
186 (revnode
,_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
188 branch
=get_branchname(branch
)
190 parents
= [p
for p
in repo
.changelog
.parentrevs(revision
) if p
>= 0]
192 if len(parents
)==0 and revision
!= 0:
193 wr('reset refs/heads/%s' % branch
)
195 wr('commit refs/heads/%s' % branch
)
196 wr('mark :%d' % (revision
+1))
198 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
199 wr('committer %s %d %s' % (user
,time
,timezone
))
200 wr('data %d' % (len(desc
)+1)) # wtf?
204 ctx
=repo
.changectx(str(revision
))
206 added
,changed
,removed
,type=[],[],[],''
208 if len(parents
) == 0:
209 # first revision: feed in full manifest
214 wr('from %s' % revnum_to_revref(parents
[0], old_marks
))
215 if len(parents
) == 1:
216 # later non-merge revision: feed in changed manifest
217 # if we have exactly one parent, just take the changes from the
218 # manifest without expensively comparing checksums
219 f
=repo
.status(repo
.lookup(parents
[0]),revnode
)[:3]
220 added
,changed
,removed
=f
[1],f
[0],f
[2]
222 else: # a merge with two parents
223 wr('merge %s' % revnum_to_revref(parents
[1], old_marks
))
224 # later merge revision: feed in changed manifest
225 # for many files comparing checksums is expensive so only do it for
226 # merges where we really need it due to hg's revlog logic
227 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
228 type='thorough delta'
230 sys
.stderr
.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
231 (branch
,type,revision
+1,max,len(added
),len(changed
),len(removed
)))
233 map(lambda r
: wr('D %s' % r
),removed
)
234 export_file_contents(ctx
,man
,added
,hgtags
)
235 export_file_contents(ctx
,man
,changed
,hgtags
)
238 return checkpoint(count
)
240 def export_tags(ui
,repo
,old_marks
,mapping_cache
,count
,authors
,flatten
):
243 tag
=sanitize_name(tag
,"tag",flatten
=flatten
)
244 # ignore latest revision
245 if tag
=='tip': continue
246 # ignore tags to nodes that are missing (ie, 'in the future')
247 if node
.encode('hex_codec') not in mapping_cache
:
248 sys
.stderr
.write('Tag %s refers to unseen node %s\n' % (tag
, node
.encode('hex_codec')))
251 rev
=int(mapping_cache
[node
.encode('hex_codec')])
253 ref
=revnum_to_revref(rev
, old_marks
)
255 sys
.stderr
.write('Failed to find reference for creating tag'
256 ' %s at r%d\n' % (tag
,rev
))
258 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
259 wr('reset refs/tags/%s' % tag
)
262 count
=checkpoint(count
)
265 def load_authors(filename
):
267 if not os
.path
.exists(filename
):
272 lre
=re
.compile('^([^=]+)[ ]*=[ ]*(.+)$')
273 for line
in f
.readlines():
276 if line
=='' or line
[0]=='#':
280 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
282 # put key:value in cache, key without ^:
283 cache
[m
.group(1).strip()]=m
.group(2).strip()
286 sys
.stderr
.write('Loaded %d authors\n' % a
)
289 def branchtip(repo
, heads
):
290 '''return the tipmost branch head in heads'''
292 for h
in reversed(heads
):
293 if 'close' not in repo
.changelog
.read(h
)[5]:
298 def verify_heads(ui
,repo
,cache
,force
):
300 for bn
, heads
in repo
.branchmap().iteritems():
301 branches
[bn
] = branchtip(repo
, heads
)
302 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
305 # get list of hg's branches to verify, don't take all git has
311 sys
.stderr
.write('Error: Branch [%s] modified outside hg-fast-export:'
312 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
313 if not force
: return False
315 # verify that branch has exactly one head
317 for h
in repo
.heads():
318 (_
,_
,_
,_
,_
,_
,branch
,_
)=get_changeset(ui
,repo
,h
)
319 if t
.get(branch
,False):
320 sys
.stderr
.write('Error: repository has at least one unnamed head: hg r%s\n' %
321 repo
.changelog
.rev(h
))
322 if not force
: return False
327 def hg2git(repourl
,m
,marksfile
,mappingfile
,headsfile
,tipfile
,authors
={},sob
=False,force
=False,hgtags
=False,flatten
=False):
330 old_marks
=load_cache(marksfile
,lambda s
: int(s
)-1)
331 mapping_cache
=load_cache(mappingfile
)
332 heads_cache
=load_cache(headsfile
)
333 state_cache
=load_cache(tipfile
)
335 ui
,repo
=setup_repo(repourl
)
337 if not verify_heads(ui
,repo
,heads_cache
,force
):
341 tip
=repo
.changelog
.count()
342 except AttributeError:
345 min=int(state_cache
.get('tip',0))
347 if _max
<0 or max>tip
:
350 for rev
in range(0,max):
351 (revnode
,_
,_
,_
,_
,_
,_
,_
)=get_changeset(ui
,repo
,rev
,authors
)
352 mapping_cache
[revnode
.encode('hex_codec')] = str(rev
)
357 for rev
in range(min,max):
358 c
=export_commit(ui
,repo
,rev
,old_marks
,max,c
,authors
,sob
,brmap
,hgtags
,flatten
)
360 state_cache
['tip']=max
361 state_cache
['repo']=repourl
362 save_cache(tipfile
,state_cache
)
363 save_cache(mappingfile
,mapping_cache
)
365 c
=export_tags(ui
,repo
,old_marks
,mapping_cache
,c
,authors
,flatten
)
367 sys
.stderr
.write('Issued %d commands\n' % c
)
371 if __name__
=='__main__':
372 def bail(parser
,opt
):
373 sys
.stderr
.write('Error: No %s option given\n' % opt
)
377 parser
=OptionParser()
379 parser
.add_option("-m","--max",type="int",dest
="max",
380 help="Maximum hg revision to import")
381 parser
.add_option("--mapping",dest
="mappingfile",
382 help="File to read last run's hg-to-git SHA1 mapping")
383 parser
.add_option("--marks",dest
="marksfile",
384 help="File to read git-fast-import's marks from")
385 parser
.add_option("--heads",dest
="headsfile",
386 help="File to read last run's git heads from")
387 parser
.add_option("--status",dest
="statusfile",
388 help="File to read status from")
389 parser
.add_option("-r","--repo",dest
="repourl",
390 help="URL of repo to import")
391 parser
.add_option("-s",action
="store_true",dest
="sob",
392 default
=False,help="Enable parsing Signed-off-by lines")
393 parser
.add_option("--hgtags",action
="store_true",dest
="hgtags",
394 default
=False,help="Enable exporting .hgtags files")
395 parser
.add_option("--flatten",action
="store_true",dest
="flatten",
396 default
=False,help="Create one-level ref names (convert '/' to '_')")
397 parser
.add_option("-A","--authors",dest
="authorfile",
398 help="Read authormap from AUTHORFILE")
399 parser
.add_option("-U",dest
="unknown",
400 help="Email address to use for unknown instead of 'devnull@localhost'")
401 parser
.add_option("-f","--force",action
="store_true",dest
="force",
402 default
=False,help="Ignore validation errors by force")
403 parser
.add_option("-M","--default-branch",dest
="default_branch",
404 help="Set the default branch")
405 parser
.add_option("-o","--origin",dest
="origin_name",
406 help="use <name> as namespace to track upstream")
408 (options
,args
)=parser
.parse_args()
411 if options
.max!=None: m
=options
.max
413 if options
.marksfile
==None: bail(parser
,'--marks')
414 if options
.mappingfile
==None: bail(parser
,'--mapping')
415 if options
.headsfile
==None: bail(parser
,'--heads')
416 if options
.statusfile
==None: bail(parser
,'--status')
417 if options
.repourl
==None: bail(parser
,'--repo')
420 if options
.authorfile
!=None:
421 a
=load_authors(options
.authorfile
)
423 if options
.unknown
!=None:
424 if not set_unknown_addr(options
.unknown
):
425 sys
.stderr
.write("Error: Invalid email address '%s'\n" % options
.unknown
)
428 if options
.default_branch
!=None:
429 set_default_branch(options
.default_branch
)
431 if options
.origin_name
!=None:
432 set_origin_name(options
.origin_name
)
434 sys
.exit(hg2git(options
.repourl
,m
,options
.marksfile
,options
.mappingfile
,options
.headsfile
,
435 options
.statusfile
,authors
=a
,sob
=options
.sob
,force
=options
.force
,hgtags
=options
.hgtags
,flatten
=options
.flatten
))