3 # Copyright (c) 2007, 2008 Rocco Rutte <pdmef@gmx.net> and others.
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial
import node
7 from hg2git
import setup_repo
,fixup_user
,get_branch
,get_changeset
8 from hg2git
import load_cache
,save_cache
,get_git_sha1
,set_default_branch
,set_origin_name
9 from optparse
import OptionParser
if sys.platform == "win32":
    # On Windows, sys.stdout is initially opened in text mode, which means
    # that every LF (\n) written to it is converted into CRLF (\r\n). That
    # makes git blow up, so switch sys.stdout to binary mode here.
    # msvcrt only exists on Windows, so it is imported inside the platform
    # guard; a repeated import is harmless if it is already loaded.
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
# Matches a "Signed-off-by: <who>" line of a log message (the "o" of "off"
# and the "b" of "by" may be upper or lower case); group 1 captures
# everything after the colon and the single space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Insert a 'checkpoint' command after this many commits; 0 means never.
cfg_checkpoint_count = 0

# Write a progress message every time this many file contents were written.
cfg_export_boundary = 1000
30 return 'l' in flags
and '120000' or 'x' in flags
and '100755' or '100644'
38 sys
.stdout
.write('\n')
39 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
41 def checkpoint(count
):
43 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
44 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def revnum_to_revref(rev, old_marks):
    """Return the git-fast-import reference to use for hg revision *rev*.

    rev       -- hg revision number (an integer).
    old_marks -- mapping with a .get() method, looked up by revision number.

    If old_marks holds a truthy entry for rev, that entry is returned
    unchanged. Otherwise the mark string ':<rev+1>' is returned.
    """
    known = old_marks.get(rev)
    if known:
        return known
    # No usable stored entry: refer to the revision by its mark number.
    return ':%d' % (rev + 1)
def file_mismatch(f1, f2):
    """Return True when node.hex() of the two file revisions differs."""
    hex_left = node.hex(f1)
    hex_right = node.hex(f2)
    return hex_left != hex_right
def split_dict(dleft, dright, l=None, c=None, r=None, match=file_mismatch):
    """Loop over our repository and find all changed and missing files.

    dleft, dright -- mapping-like objects (keys(), get(), item access) that
                     also provide flags(name); dleft is "ours", dright is
                     the parent's.
    l, c, r       -- optional lists to append to: names only in dleft (l),
                     names present in both but differing (c), names only
                     in dright (r). A fresh list is created for each one
                     that is omitted.
    match         -- callable(left_entry, right_entry) returning True when
                     the two entries differ.

    Returns the tuple (l, c, r).
    """
    # The former [] defaults were created once and shared by every call
    # that relied on them, so results leaked from one call into the next.
    l = [] if l is None else l
    c = [] if c is None else c
    r = [] if r is None else r

    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif (match(dleft[left], right)
              or gitmode(dleft.flags(left)) != gitmode(dright.flags(left))):
            # we have it but checksums (or git file modes) mismatch:
            # add to center set
            c.append(left)

    for right in dright.keys():
        left = dleft.get(right, None)
        if left is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent

    return l, c, r
76 def get_filechanges(repo
,revision
,parents
,mleft
):
77 """Given some repository and revision, find all changed/deleted files."""
81 mright
=repo
.changectx(p
).manifest()
82 l
,c
,r
=split_dict(mleft
,mright
,l
,c
,r
)
88 def get_author(logmessage
,committer
,authors
):
89 """As git distincts between author and committer of a patch, try to
90 extract author by detecting Signed-off-by lines.
92 This walks from the end of the log message towards the top skipping
93 empty lines. Upon the first non-empty line, it walks all Signed-off-by
94 lines upwards to find the first one. For that (if found), it extracts
95 authorship information the usual way (authors table, cleaning, etc.)
97 If no Signed-off-by line is found, this defaults to the committer.
99 This may sound stupid (and it somehow is), but in log messages we
  accidentally may have lines in the middle starting with
101 "Signed-off-by: foo" and thus matching our detection regex. Prevent
104 loglines
=logmessage
.split('\n')
106 # from tail walk to top skipping empty lines
109 if len(loglines
[i
].strip())==0: continue
112 # walk further upwards to find first sob line, store in 'first'
115 m
=sob_re
.match(loglines
[i
])
119 # if the last non-empty line matches our Signed-Off-by regex: extract username
121 r
=fixup_user(first
.group(1),authors
)
125 def export_file_contents(ctx
,manifest
,files
,hgtags
,encoding
=''):
129 # Skip .hgtags files. They only get us in trouble.
130 if not hgtags
and file == ".hgtags":
131 sys
.stderr
.write('Skip %s\n' % (file))
133 d
=ctx
.filectx(file).data()
135 filename
=file.decode(encoding
).encode('utf8')
138 wr('M %s inline %s' % (gitmode(manifest
.flags(file)),
139 strip_leading_slash(filename
)))
140 wr('data %d' % len(d
)) # had some trouble with size()
143 if count
%cfg_export
_boundary
==0:
144 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
145 if max>cfg_export_boundary
:
146 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
148 def sanitize_name(name
,what
="branch", mapping
={}):
149 """Sanitize input roughly according to git-check-ref-format(1)"""
151 # NOTE: Do not update this transform to work around
152 # incompatibilities on your platform. If you change it and it starts
153 # modifying names which previously were not touched it will break
154 # preexisting setups which are doing incremental imports.
156 # Use the -B and -T options to mangle branch and tag names
157 # instead. If you have a source repository where this is too much
158 # work to do manually, write a tool that does it for you.
161 if not name
: return name
162 if name
[0] == '.': return '_'+name
[1:]
165 n
=mapping
.get(name
,name
)
166 p
=re
.compile('([[ ~^:?\\\\*]|\.\.)')
168 if n
[-1] in ('/', '.'): n
=n
[:-1]+'_'
169 n
='/'.join(map(dot
,n
.split('/')))
174 sys
.stderr
.write('Warning: sanitized %s [%s] to [%s]\n' % (what
,name
,n
))
177 def strip_leading_slash(filename
):
178 if filename
[0] == '/':
182 def export_commit(ui
,repo
,revision
,old_marks
,max,count
,authors
,
183 branchesmap
,sob
,brmap
,hgtags
,encoding
='',fn_encoding
=''):
184 def get_branchname(name
):
185 if brmap
.has_key(name
):
187 n
=sanitize_name(name
, "branch", branchesmap
)
191 (revnode
,_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
,encoding
)
193 branch
=get_branchname(branch
)
195 parents
= [p
for p
in repo
.changelog
.parentrevs(revision
) if p
>= 0]
197 if len(parents
)==0 and revision
!= 0:
198 wr('reset refs/heads/%s' % branch
)
200 wr('commit refs/heads/%s' % branch
)
201 wr('mark :%d' % (revision
+1))
203 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
204 wr('committer %s %d %s' % (user
,time
,timezone
))
205 wr('data %d' % (len(desc
)+1)) # wtf?
209 ctx
=repo
.changectx(str(revision
))
211 added
,changed
,removed
,type=[],[],[],''
213 if len(parents
) == 0:
214 # first revision: feed in full manifest
219 wr('from %s' % revnum_to_revref(parents
[0], old_marks
))
220 if len(parents
) == 1:
221 # later non-merge revision: feed in changed manifest
222 # if we have exactly one parent, just take the changes from the
223 # manifest without expensively comparing checksums
224 f
=repo
.status(repo
.lookup(parents
[0]),revnode
)[:3]
225 added
,changed
,removed
=f
[1],f
[0],f
[2]
227 else: # a merge with two parents
228 wr('merge %s' % revnum_to_revref(parents
[1], old_marks
))
229 # later merge revision: feed in changed manifest
230 # for many files comparing checksums is expensive so only do it for
231 # merges where we really need it due to hg's revlog logic
232 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
233 type='thorough delta'
235 sys
.stderr
.write('%s: Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
236 (branch
,type,revision
+1,max,len(added
),len(changed
),len(removed
)))
239 removed
=[r
.decode(fn_encoding
).encode('utf8') for r
in removed
]
241 removed
=[strip_leading_slash(x
) for x
in removed
]
243 map(lambda r
: wr('D %s' % r
),removed
)
244 export_file_contents(ctx
,man
,added
,hgtags
,fn_encoding
)
245 export_file_contents(ctx
,man
,changed
,hgtags
,fn_encoding
)
248 return checkpoint(count
)
250 def export_note(ui
,repo
,revision
,count
,authors
,encoding
,is_first
):
251 (revnode
,_
,user
,(time
,timezone
),_
,_
,_
,_
)=get_changeset(ui
,repo
,revision
,authors
,encoding
)
253 parents
= [p
for p
in repo
.changelog
.parentrevs(revision
) if p
>= 0]
255 wr('commit refs/notes/hg')
256 wr('committer %s %d %s' % (user
,time
,timezone
))
259 wr('from refs/notes/hg^0')
260 wr('N inline :%d' % (revision
+1))
261 hg_hash
=repo
.changectx(str(revision
)).hex()
262 wr('data %d' % (len(hg_hash
)))
265 return checkpoint(count
)
267 wr('data %d' % (len(desc
)+1)) # wtf?
271 def export_tags(ui
,repo
,old_marks
,mapping_cache
,count
,authors
,tagsmap
):
    # Remap the tag name
275 tag
=sanitize_name(tag
,"tag",tagsmap
)
276 # ignore latest revision
277 if tag
=='tip': continue
278 # ignore tags to nodes that are missing (ie, 'in the future')
279 if node
.encode('hex_codec') not in mapping_cache
:
280 sys
.stderr
.write('Tag %s refers to unseen node %s\n' % (tag
, node
.encode('hex_codec')))
283 rev
=int(mapping_cache
[node
.encode('hex_codec')])
285 ref
=revnum_to_revref(rev
, old_marks
)
287 sys
.stderr
.write('Failed to find reference for creating tag'
288 ' %s at r%d\n' % (tag
,rev
))
290 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
291 wr('reset refs/tags/%s' % tag
)
294 count
=checkpoint(count
)
297 def load_mapping(name
, filename
):
298 raw_regexp
=re
.compile('^([^=]+)[ ]*=[ ]*(.+)$')
300 def parse_raw_line(line
):
301 m
=raw_regexp
.match(line
)
304 return (m
.group(1).strip(), m
.group(2).strip())
307 if not os
.path
.exists(filename
):
308 sys
.stderr
.write('Could not open mapping file [%s]\n' % (filename
))
313 for line
in f
.readlines():
316 if l
==1 and line
[0]=='#' and line
=='# quoted-escaped-strings':
318 elif line
=='' or line
[0]=='#':
320 m
=parse_raw_line(line
)
322 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
324 # put key:value in cache, key without ^:
328 sys
.stderr
.write('Loaded %d %s\n' % (a
, name
))
331 def branchtip(repo
, heads
):
332 '''return the tipmost branch head in heads'''
334 for h
in reversed(heads
):
335 if 'close' not in repo
.changelog
.read(h
)[5]:
340 def verify_heads(ui
,repo
,cache
,force
,branchesmap
):
342 for bn
, heads
in repo
.branchmap().iteritems():
343 branches
[bn
] = branchtip(repo
, heads
)
344 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
347 # get list of hg's branches to verify, don't take all git has
350 sanitized_name
=sanitize_name(b
,"branch",branchesmap
)
351 sha1
=get_git_sha1(sanitized_name
)
352 c
=cache
.get(sanitized_name
)
354 sys
.stderr
.write('Error: Branch [%s] modified outside hg-fast-export:'
355 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
356 if not force
: return False
358 # verify that branch has exactly one head
360 for h
in repo
.heads():
361 (_
,_
,_
,_
,_
,_
,branch
,_
)=get_changeset(ui
,repo
,h
)
362 if t
.get(branch
,False):
363 sys
.stderr
.write('Error: repository has at least one unnamed head: hg r%s\n' %
364 repo
.changelog
.rev(h
))
365 if not force
: return False
370 def hg2git(repourl
,m
,marksfile
,mappingfile
,headsfile
,tipfile
,
371 authors
={},branchesmap
={},tagsmap
={},
372 sob
=False,force
=False,hgtags
=False,notes
=False,encoding
='',fn_encoding
=''):
373 def check_cache(filename
, contents
):
374 if len(contents
) == 0:
375 sys
.stderr
.write('Warning: %s does not contain any data, this will probably make an incremental import fail\n' % filename
)
379 old_marks
=load_cache(marksfile
,lambda s
: int(s
)-1)
380 mapping_cache
=load_cache(mappingfile
)
381 heads_cache
=load_cache(headsfile
)
382 state_cache
=load_cache(tipfile
)
384 if len(state_cache
) != 0:
385 for (name
, data
) in [(marksfile
, old_marks
),
386 (mappingfile
, mapping_cache
),
387 (headsfile
, state_cache
)]:
388 check_cache(name
, data
)
390 ui
,repo
=setup_repo(repourl
)
392 if not verify_heads(ui
,repo
,heads_cache
,force
,branchesmap
):
396 tip
=repo
.changelog
.count()
397 except AttributeError:
400 min=int(state_cache
.get('tip',0))
402 if _max
<0 or max>tip
:
405 for rev
in range(0,max):
406 (revnode
,_
,_
,_
,_
,_
,_
,_
)=get_changeset(ui
,repo
,rev
,authors
)
407 mapping_cache
[revnode
.encode('hex_codec')] = str(rev
)
412 for rev
in range(min,max):
413 c
=export_commit(ui
,repo
,rev
,old_marks
,max,c
,authors
,branchesmap
,
414 sob
,brmap
,hgtags
,encoding
,fn_encoding
)
416 for rev
in range(min,max):
417 c
=export_note(ui
,repo
,rev
,c
,authors
, encoding
, rev
== min and min != 0)
419 state_cache
['tip']=max
420 state_cache
['repo']=repourl
421 save_cache(tipfile
,state_cache
)
422 save_cache(mappingfile
,mapping_cache
)
424 c
=export_tags(ui
,repo
,old_marks
,mapping_cache
,c
,authors
,tagsmap
)
426 sys
.stderr
.write('Issued %d commands\n' % c
)
430 if __name__
=='__main__':
431 def bail(parser
,opt
):
432 sys
.stderr
.write('Error: No %s option given\n' % opt
)
436 parser
=OptionParser()
438 parser
.add_option("-m","--max",type="int",dest
="max",
439 help="Maximum hg revision to import")
440 parser
.add_option("--mapping",dest
="mappingfile",
441 help="File to read last run's hg-to-git SHA1 mapping")
442 parser
.add_option("--marks",dest
="marksfile",
443 help="File to read git-fast-import's marks from")
444 parser
.add_option("--heads",dest
="headsfile",
445 help="File to read last run's git heads from")
446 parser
.add_option("--status",dest
="statusfile",
447 help="File to read status from")
448 parser
.add_option("-r","--repo",dest
="repourl",
449 help="URL of repo to import")
450 parser
.add_option("-s",action
="store_true",dest
="sob",
451 default
=False,help="Enable parsing Signed-off-by lines")
452 parser
.add_option("--hgtags",action
="store_true",dest
="hgtags",
453 default
=False,help="Enable exporting .hgtags files")
454 parser
.add_option("-A","--authors",dest
="authorfile",
455 help="Read authormap from AUTHORFILE")
456 parser
.add_option("-B","--branches",dest
="branchesfile",
457 help="Read branch map from BRANCHESFILE")
458 parser
.add_option("-T","--tags",dest
="tagsfile",
459 help="Read tags map from TAGSFILE")
460 parser
.add_option("-f","--force",action
="store_true",dest
="force",
461 default
=False,help="Ignore validation errors by force")
462 parser
.add_option("-M","--default-branch",dest
="default_branch",
463 help="Set the default branch")
464 parser
.add_option("-o","--origin",dest
="origin_name",
465 help="use <name> as namespace to track upstream")
466 parser
.add_option("--hg-hash",action
="store_true",dest
="notes",
467 default
=False,help="Annotate commits with the hg hash as git notes in the hg namespace")
468 parser
.add_option("-e",dest
="encoding",
469 help="Assume commit and author strings retrieved from Mercurial are encoded in <encoding>")
470 parser
.add_option("--fe",dest
="fn_encoding",
471 help="Assume file names from Mercurial are encoded in <filename_encoding>")
473 (options
,args
)=parser
.parse_args()
476 if options
.max!=None: m
=options
.max
478 if options
.marksfile
==None: bail(parser
,'--marks')
479 if options
.mappingfile
==None: bail(parser
,'--mapping')
480 if options
.headsfile
==None: bail(parser
,'--heads')
481 if options
.statusfile
==None: bail(parser
,'--status')
482 if options
.repourl
==None: bail(parser
,'--repo')
485 if options
.authorfile
!=None:
486 a
=load_mapping('authors', options
.authorfile
)
489 if options
.branchesfile
!=None:
490 b
=load_mapping('branches', options
.branchesfile
)
493 if options
.tagsfile
!=None:
494 t
=load_mapping('tags', options
.tagsfile
)
496 if options
.default_branch
!=None:
497 set_default_branch(options
.default_branch
)
499 if options
.origin_name
!=None:
500 set_origin_name(options
.origin_name
)
503 if options
.encoding
!=None:
504 encoding
=options
.encoding
507 if options
.fn_encoding
!=None:
508 fn_encoding
=options
.fn_encoding
510 sys
.exit(hg2git(options
.repourl
,m
,options
.marksfile
,options
.mappingfile
,
511 options
.headsfile
, options
.statusfile
,
512 authors
=a
,branchesmap
=b
,tagsmap
=t
,
513 sob
=options
.sob
,force
=options
.force
,hgtags
=options
.hgtags
,
514 notes
=options
.notes
,encoding
=encoding
,fn_encoding
=fn_encoding
))