3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
10 from mercurial
import repo
,hg
,cmdutil
,util
,ui
,revlog
,node
11 from tempfile
import mkstemp
12 from optparse
import OptionParser
17 # silly regex to catch Signed-off-by lines in log message
18 sob_re
=re
.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
19 # silly regex to see if user field has email address
20 user_re
=re
.compile('([^<]+) (<[^>]+>)$')
21 # silly regex to clean out user names
22 user_clean_re
=re
.compile('^["]([^"]+)["]$')
23 # git branch for hg's default 'HEAD' branch
25 # insert 'checkpoint' command after this many commits or none at all if 0
26 cfg_checkpoint_count
=0
27 # write some progress message every this many file contents written
28 cfg_export_boundary
=1000
31 sys
.stderr
.write(__doc__
)
36 return myui
,hg
.repository(myui
,url
)
38 def fixup_user(user
,authors
):
40 # if we have an authors table, try to get mapping
41 # by defaulting to the current value of 'user'
42 user
=authors
.get(user
,user
)
43 name
,mail
,m
='','',user_re
.match(user
)
45 # if we don't have 'Name <mail>' syntax, use 'user
46 # <devnull@localhost>' if user contains no '@' and
47 # 'user <user>' otherwise
50 mail
='<devnull@localhost>'
54 # if we have 'Name <mail>' syntax, everything is fine :)
55 name
,mail
=m
.group(1),m
.group(2)
57 # remove any silly quoting from username
58 m2
=user_clean_re
.match(name
)
61 return '%s %s' % (name
,mail
)
def get_changeset(ui,repo,revision,authors=None):
    """Read one changeset from the changelog and prepare its fields for export.

    Args:
        ui: not used by this function; kept so existing callers keep working.
        repo: repository object; must provide lookup() and changelog.read().
        revision: revision identifier understood by repo.lookup().
        authors: optional mapping handed to fixup_user(); an empty mapping
            is used when omitted.

    Returns:
        A tuple (node, manifest, user, (time, tz), files, desc, branch, extra)
        where user has been passed through fixup_user(), tz is a signed
        'HHMM' string such as '+0530' or '-0330', and branch is the result
        of get_branch() applied to extra['branch'] (default 'master').
    """
    # avoid a shared mutable default argument
    if authors is None:
        authors={}
    # named 'revnode' so the imported 'node' module is not shadowed locally
    revnode=repo.lookup(revision)
    (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(revnode)
    # The stored offset is negated before formatting, as before.
    # NOTE(review): presumably the changelog stores seconds west of UTC -- confirm.
    offset=-timezone
    if offset<0:
        sign='-'
    else:
        sign='+'
    # Split the magnitude, not the signed value: floor division plus a
    # non-negative modulo on a negative offset turned e.g. -3:30 into '-0430'.
    hours,remainder=divmod(abs(offset),3600)
    tz='%s%02d%02d' % (sign,hours,remainder//60)
    branch=get_branch(extra.get('branch','master'))
    return (revnode,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)
76 return x
and '100755' or '100644'
80 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
82 def checkpoint(count
):
84 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
85 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def get_parent_mark(parent,marks):
    """Return the reference to use for the parent revision `parent`.

    Marks are numbered revision+1. If `marks` has an entry under that
    number (as a string key), the stored value is returned -- per the
    original note, the SHA1 from the cache. Otherwise the ':<number>'
    mark syntax for a commit seen in the current session is returned.
    """
    mark_id=parent+1
    session_ref=':%d' % mark_id
    return marks.get(str(mark_id),session_ref)
97 """See if two revisions of a file are not equal."""
98 return node
.hex(f1
)!=node
.hex(f2
)
100 def outer_set(dleft
,dright
,l
,c
,r
):
101 """Loop over our repository and find all changed and missing files."""
102 for left
in dleft
.keys():
103 right
=dright
.get(left
,None)
105 # we have the file but our parent hasn't: add to left set
107 elif mismatch(dleft
[left
],right
):
108 # we have it but checksums mismatch: add to center set
110 for right
in dright
.keys():
111 left
=dleft
.get(right
,None)
113 # if parent has file but we don't: add to right set
115 # change is already handled when comparing child against parent
118 def get_filechanges(repo
,revision
,parents
,mleft
):
119 """Given some repository and revision, find all changed/deleted files."""
123 mright
=repo
.changectx(p
).manifest()
128 l
,c
,r
=outer_set(mleft
,mright
,l
,c
,r
)
131 def get_author(logmessage
,committer
,authors
):
132 """As git distinguishes between author and committer of a patch, try to
133 extract author by detecting Signed-off-by lines.
135 This walks from the end of the log message towards the top skipping
136 empty lines. Upon the first non-empty line, it walks all Signed-off-by
137 lines upwards to find the first one. For that (if found), it extracts
138 authorship information the usual way (authors table, cleaning, etc.)
140 If no Signed-off-by line is found, this defaults to the committer.
142 This may sound stupid (and it somehow is), but in log messages we
143 accidentally may have lines in the middle starting with
144 "Signed-off-by: foo" and thus matching our detection regex. Prevent
147 loglines
=logmessage
.split('\n')
149 # from tail walk to top skipping empty lines
152 if len(loglines
[i
].strip())==0: continue
155 # walk further upwards to find first sob line, store in 'first'
158 m
=sob_re
.match(loglines
[i
])
162 # if the last non-empty line matches our Signed-Off-by regex: extract username
164 r
=fixup_user(first
.group(1),authors
)
168 def export_file_contents(ctx
,manifest
,files
):
173 fctx
=ctx
.filectx(file)
175 wr('M %s inline %s' % (gitmode(manifest
.execf(file)),file))
176 wr('data %d' % len(d
)) # had some trouble with size()
179 if count
%cfg_export
_boundary
==0:
180 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
181 if max>cfg_export_boundary
:
182 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
184 def is_merge(parents
):
186 for parent
in parents
:
191 def export_commit(ui
,repo
,revision
,marks
,heads
,last
,max,count
,authors
,sob
):
192 (revnode
,_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
193 parents
=repo
.changelog
.parentrevs(revision
)
195 wr('commit refs/heads/%s' % branch
)
196 wr('mark :%d' % (revision
+1))
198 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
199 wr('committer %s %d %s' % (user
,time
,timezone
))
200 wr('data %d' % (len(desc
)+1)) # wtf?
204 src
=heads
.get(branch
,'')
207 # if we have a cached head, this is an incremental import: initialize it
208 # and kill reference so we won't init it again
211 sys
.stderr
.write('Initializing branch [%s] to parent [%s]\n' %
213 link
=src
# avoid making a merge commit for incremental import
214 elif link
=='' and not heads
.has_key(branch
) and revision
>0:
215 # newly created branch and not the first one: connect to parent
216 tmp
=get_parent_mark(parents
[0],marks
)
218 sys
.stderr
.write('Link new branch [%s] to parent [%s]\n' %
220 link
=tmp
# avoid making a merge commit for branch fork
223 l
=last
.get(branch
,revision
)
225 # 1) as this commit implicitly is the child of the most recent
226 # commit of this branch, ignore this parent
227 # 2) ignore nonexistent parents
229 if p
==l
or p
==revision
or p
<0:
231 tmp
=get_parent_mark(p
,marks
)
232 # if we fork off a branch, don't merge with our parent via 'merge'
233 # as we have 'from' already above
236 sys
.stderr
.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
240 last
[branch
]=revision
242 # we need this later to write out tags
243 marks
[str(revision
)]=':%d'%(revision
+1)
245 ctx
=repo
.changectx(str(revision
))
247 added
,changed
,removed
,type=[],[],[],''
250 # first revision: feed in full manifest
253 elif is_merge(parents
):
254 # later merge revision: feed in changed manifest
255 # for many files comparing checksums is expensive so only do it for
256 # merges where we really need it due to hg's revlog logic
257 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
258 type='thorough delta'
260 # later non-merge revision: feed in changed manifest
261 # if we have exactly one parent, just take the changes from the
262 # manifest without expensively comparing checksums
263 f
=repo
.status(repo
.lookup(parents
[0]),revnode
)[:3]
264 added
,changed
,removed
=f
[1],f
[0],f
[2]
267 sys
.stderr
.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
268 (type,revision
+1,max,len(added
),len(changed
),len(removed
)))
270 map(lambda r
: wr('D %s' % r
),removed
)
271 export_file_contents(ctx
,man
,added
+changed
)
274 return checkpoint(count
)
276 def export_tags(ui
,repo
,marks_cache
,start
,end
,count
,authors
):
279 # ignore latest revision
280 if tag
=='tip': continue
281 rev
=repo
.changelog
.rev(node
)
282 # ignore those tags not in our import range
283 if rev
<start
or rev
>=end
: continue
285 ref
=get_parent_mark(rev
,marks_cache
)
287 sys
.stderr
.write('Failed to find reference for creating tag'
288 ' %s at r%d\n' % (tag
,rev
))
290 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
291 wr('reset refs/tags/%s' % tag
)
294 count
=checkpoint(count
)
297 def load_authors(filename
):
299 if not os
.path
.exists(filename
):
303 lre
=re
.compile('^([^=]+)[ ]*=[ ]*(.+)$')
304 for line
in f
.readlines():
308 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
310 # put key:value in cache, both stripped of surrounding whitespace
311 cache
[m
.group(1).strip()]=m
.group(2).strip()
313 sys
.stderr
.write('Loaded %d authors\n' % l
)
316 def load_cache(filename
):
318 if not os
.path
.exists(filename
):
322 for line
in f
.readlines():
324 fields
=line
.split(' ')
325 if fields
==None or not len(fields
)==2 or fields
[0][0]!=':':
326 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
328 # put key:value in cache, key without ^:
329 cache
[fields
[0][1:]]=fields
[1].split('\n')[0]
333 def save_cache(filename
,cache
):
334 f
=open(filename
,'w+')
335 map(lambda x
: f
.write(':%s %s\n' % (str(x
),str(cache
.get(x
)))),cache
.keys())
338 def verify_heads(ui
,repo
,cache
,force
):
341 f
=open(os
.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
)
342 sha1
=f
.readlines()[0].split('\n')[0]
348 branches
=repo
.branchtags()
349 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
352 # get list of hg's branches to verify, don't take all git has
355 sys
.stderr
.write('Verifying branch [%s]\n' % b
)
359 sys
.stderr
.write('Error: Branch [%s] modified outside hg2git:'
360 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
361 if not force
: return False
363 # verify that branch has exactly one head
365 for h
in repo
.heads():
366 (_
,_
,_
,_
,_
,_
,branch
,_
)=get_changeset(ui
,repo
,h
)
367 if t
.get(branch
,False):
368 sys
.stderr
.write('Error: repository has at least one unnamed head: hg r%s\n' %
369 repo
.changelog
.rev(h
))
370 if not force
: return False
375 def hg2git(repourl
,m
,marksfile
,headsfile
,tipfile
,authors
={},sob
=False,force
=False):
378 marks_cache
=load_cache(marksfile
)
379 heads_cache
=load_cache(headsfile
)
380 state_cache
=load_cache(tipfile
)
382 ui
,repo
=setup_repo(repourl
)
384 if not verify_heads(ui
,repo
,heads_cache
,force
):
387 tip
=repo
.changelog
.count()
389 min=int(state_cache
.get('tip',0))
396 for rev
in range(min,max):
397 c
=export_commit(ui
,repo
,rev
,marks_cache
,heads_cache
,last
,max,c
,authors
,sob
)
399 c
=export_tags(ui
,repo
,marks_cache
,min,max,c
,authors
)
401 sys
.stderr
.write('Issued %d commands\n' % c
)
403 state_cache
['tip']=max
404 state_cache
['repo']=repourl
405 save_cache(tipfile
,state_cache
)
409 if __name__
=='__main__':
410 def bail(parser
,opt
):
411 sys
.stderr
.write('Error: No %s option given\n' % opt
)
415 parser
=OptionParser()
417 parser
.add_option("-m","--max",type="int",dest
="max",
418 help="Maximum hg revision to import")
419 parser
.add_option("--marks",dest
="marksfile",
420 help="File to read git-fast-import's marks from")
421 parser
.add_option("--heads",dest
="headsfile",
422 help="File to read last run's git heads from")
423 parser
.add_option("--status",dest
="statusfile",
424 help="File to read status from")
425 parser
.add_option("-r","--repo",dest
="repourl",
426 help="URL of repo to import")
427 parser
.add_option("-s",action
="store_true",dest
="sob",
428 default
=False,help="Enable parsing Signed-off-by lines")
429 parser
.add_option("-A","--authors",dest
="authorfile",
430 help="Read authormap from AUTHORFILE")
431 parser
.add_option("-f","--force",action
="store_true",dest
="force",
432 default
=False,help="Ignore validation errors by force")
434 (options
,args
)=parser
.parse_args()
437 if options
.max!=None: m
=options
.max
# Every required option is checked against its own optparse destination;
# previously all four tests looked at options.marksfile, so a missing
# --heads, --status or --repo was never reported.
if options.marksfile is None: bail(parser,'--marks')
if options.headsfile is None: bail(parser,'--heads')
if options.statusfile is None: bail(parser,'--status')
if options.repourl is None: bail(parser,'--repo')
445 if options
.authorfile
!=None:
446 a
=load_authors(options
.authorfile
)
448 sys
.exit(hg2git(options
.repourl
,m
,options
.marksfile
,options
.headsfile
,
449 options
.statusfile
,authors
=a
,sob
=options
.sob
,force
=options
.force
))