3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
10 from mercurial
import repo
,hg
,cmdutil
,util
,ui
,revlog
,node
11 from tempfile
import mkstemp
12 from optparse
import OptionParser
# Matches a 'Signed-off-by: <signer>' log line in any of the usual
# capitalisations; group 1 is everything after the colon and space.
sob_re = re.compile(r'^Signed-[Oo]ff-[Bb]y: (.+)$')
# Matches a user field of the form 'Name <mail>'; group 1 is the name and
# group 2 the address including its angle brackets.
user_re = re.compile(r'([^<]+) (<[^>]+>)$')
# Matches a user name wrapped in double quotes; group 1 is the bare name.
user_clean_re = re.compile(r'^["]([^"]+)["]$')
23 # git branch for hg's default 'HEAD' branch
# insert 'checkpoint' command after this many commits or none at all if 0
cfg_checkpoint_count = 0
# write some progress message every this many file contents written
cfg_export_boundary = 1000
31 sys
.stderr
.write(__doc__
)
36 return myui
,hg
.repository(myui
,url
)
def fixup_user(user, authors):
    """Turn a Mercurial user string into git's 'Name <mail>' form.

    user    -- raw user field of a changeset
    authors -- optional mapping of raw user strings to replacements, or
               None; an unmapped user is kept as is

    Returns the string 'Name <mail>'.
    """
    # NOTE(review): the branch conditions below were reconstructed from the
    # surviving comments and statements of this function -- confirm against
    # the pristine file.
    if authors is not None:
        # if we have an authors table, try to get mapping
        # by defaulting to the current value of 'user'
        user = authors.get(user, user)

    m = user_re.match(user)
    if m is None:
        # if we don't have 'Name <mail>' syntax, use 'user
        # <devnull@localhost>' if user contains no '@' and
        # 'user <user>' otherwise
        name = user
        if '@' not in user:
            mail = '<devnull@localhost>'
        else:
            mail = '<%s>' % user
    else:
        # if we have 'Name <mail>' syntax, everything is fine :)
        name, mail = m.group(1), m.group(2)

    # remove any silly quoting from username
    m2 = user_clean_re.match(name)
    if m2 is not None:
        name = m2.group(1)
    return '%s %s' % (name, mail)
def get_changeset(ui, repo, revision, authors):
    """Read one changeset and return its fields prepared for export.

    ui       -- unused here; kept for a uniform call signature
    repo     -- repository object providing lookup() and changelog.read()
    revision -- revision to look up
    authors  -- author mapping handed on to fixup_user()

    Returns the tuple
    (manifest, user, (time, tz), files, desc, branch, extra)
    where user has been passed through fixup_user(), tz is a '+HHMM' /
    '-HHMM' string and branch is get_branch() applied to the changeset's
    'branch' extra (defaulting to 'master').
    """
    rev_node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(rev_node)

    # The stored offset is negated before formatting (presumably Mercurial
    # keeps seconds *west* of UTC -- confirm).  The sign is handled apart
    # from the magnitude: floor division and modulo on a negative number
    # would otherwise turn e.g. -12600 seconds into '-0430' instead of
    # '-0330'.
    offset = -timezone
    if offset < 0:
        sign = '-'
    else:
        sign = '+'
    hours, remainder = divmod(abs(offset), 3600)
    tz = '%s%02d%02d' % (sign, hours, remainder // 60)

    branch = get_branch(extra.get('branch', 'master'))
    return (manifest, fixup_user(user, authors), (time, tz), files, desc,
            branch, extra)
76 return x
and '100755' or '100644'
80 map(lambda x
: sys
.stderr
.write('\t[%s]\n' % x
),msg
.split('\n'))
82 def checkpoint(count
):
84 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
85 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def get_parent_mark(parent, marks):
    """Get the mark for some parent.

    If we saw it in the current session, return :%d syntax and
    otherwise the SHA1 from the cache.

    parent -- revision number of the parent
    marks  -- mapping of mark number (as string) to a cached reference
    """
    # Commits are written with 'mark :<revision+1>', so the mark number of
    # a revision is always one above the revision number.
    # NOTE(review): export_commit stores session marks under str(revision)
    # while this lookup uses str(parent+1) -- confirm the intended key
    # convention of the marks cache.
    mark = parent + 1
    return marks.get(str(mark), ':%d' % mark)
97 """See if two revisions of a file are not equal."""
98 return node
.hex(f1
)!=node
.hex(f2
)
def outer_set(dleft, dright, l, c, r):
    """Loop over our repository and find all changed and missing files.

    dleft  -- mapping of file name to file revision for the child
    dright -- the same mapping for the parent
    l      -- list collecting files only present in dleft
    c      -- list collecting files present in both but differing
    r      -- list collecting files only present in dright

    The three lists are extended in place and also returned as (l, c, r).
    """
    # NOTE(review): the conditions and append statements were reconstructed
    # from this function's inline comments and its call site -- confirm
    # against the pristine file.
    for left in dleft.keys():
        right = dright.get(left, None)
        if right is None:
            # we have the file but our parent hasn't: add to left set
            l.append(left)
        elif mismatch(dleft[left], right):
            # we have it but checksums mismatch: add to center set
            c.append(left)
    for right in dright.keys():
        left = dleft.get(right, None)
        if left is None:
            # if parent has file but we don't: add to right set
            r.append(right)
        # change is already handled when comparing child against parent
    return l, c, r
118 def get_filechanges(repo
,revision
,parents
,mleft
):
119 """Given some repository and revision, find all changed/deleted files."""
123 mright
=repo
.changectx(p
).manifest()
128 l
,c
,r
=outer_set(mleft
,mright
,l
,c
,r
)
131 def get_author(logmessage
,committer
,authors
):
132 """As git distinguishes between author and committer of a patch, try to
133 extract author by detecting Signed-off-by lines.
135 This walks from the end of the log message towards the top skipping
136 empty lines. Upon the first non-empty line, it walks all Signed-off-by
137 lines upwards to find the first one. For that (if found), it extracts
138 authorship information the usual way (authors table, cleaning, etc.)
140 If no Signed-off-by line is found, this defaults to the committer.
142 This may sound stupid (and it somehow is), but in log messages we
143 accidentally may have lines in the middle starting with
144 "Signed-off-by: foo" and thus matching our detection regex. Prevent
147 loglines
=logmessage
.split('\n')
149 # from tail walk to top skipping empty lines
152 if len(loglines
[i
].strip())==0: continue
155 # walk further upwards to find first sob line, store in 'first'
158 m
=sob_re
.match(loglines
[i
])
162 # if the last non-empty line matches our Signed-Off-by regex: extract username
164 r
=fixup_user(first
.group(1),authors
)
168 def export_file_contents(ctx
,manifest
,files
):
172 fctx
=ctx
.filectx(file)
174 wr('M %s inline %s' % (gitmode(manifest
.execf(file)),file))
175 wr('data %d' % len(d
)) # had some trouble with size()
178 if count
%cfg_export
_boundary
==0:
179 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
180 if max>cfg_export_boundary
:
181 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
183 def export_commit(ui
,repo
,revision
,marks
,heads
,last
,max,count
,authors
,sob
):
184 (_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
185 parents
=repo
.changelog
.parentrevs(revision
)
187 wr('commit refs/heads/%s' % branch
)
188 wr('mark :%d' % (revision
+1))
190 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
191 wr('committer %s %d %s' % (user
,time
,timezone
))
192 wr('data %d' % (len(desc
)+1)) # wtf?
196 src
=heads
.get(branch
,'')
199 # if we have a cached head, this is an incremental import: initialize it
200 # and kill reference so we won't init it again
203 sys
.stderr
.write('Initializing branch [%s] to parent [%s]\n' %
205 link
=src
# avoid making a merge commit for incremental import
206 elif link
=='' and not heads
.has_key(branch
) and revision
>0:
207 # newly created branch and not the first one: connect to parent
208 tmp
=get_parent_mark(parents
[0],marks
)
210 sys
.stderr
.write('Link new branch [%s] to parent [%s]\n' %
212 link
=tmp
# avoid making a merge commit for branch fork
215 l
=last
.get(branch
,revision
)
217 # 1) as this commit implicitly is the child of the most recent
218 # commit of this branch, ignore this parent
219 # 2) ignore nonexistent parents
221 if p
==l
or p
==revision
or p
<0:
223 tmp
=get_parent_mark(p
,marks
)
224 # if we fork off a branch, don't merge with our parent via 'merge'
225 # as we have 'from' already above
228 sys
.stderr
.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
232 last
[branch
]=revision
234 # we need this later to write out tags
235 marks
[str(revision
)]=':%d'%(revision
+1)
237 ctx
=repo
.changectx(str(revision
))
241 # first revision: feed in full manifest
242 sys
.stderr
.write('Exporting full revision %d/%d with %d added files\n' %
243 (revision
,max,len(man
.keys())))
244 export_file_contents(ctx
,man
,man
.keys())
246 # later revision: feed in changed manifest
247 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
248 sys
.stderr
.write('Exporting delta revision %d/%d with %d/%d/%d added/changed/removed files\n' %
249 (revision
,max,len(added
),len(changed
),len(removed
)))
250 export_file_contents(ctx
,man
,added
+changed
)
255 return checkpoint(count
)
257 def export_tags(ui
,repo
,marks_cache
,start
,end
,count
,authors
):
260 # ignore latest revision
261 if tag
=='tip': continue
262 rev
=repo
.changelog
.rev(node
)
263 # ignore those tags not in our import range
264 if rev
<start
or rev
>=end
: continue
266 ref
=marks_cache
.get(str(rev
),None)
268 sys
.stderr
.write('Failed to find reference for creating tag'
269 ' %s at r%d\n' % (tag
,rev
))
271 (_
,user
,(time
,timezone
),_
,desc
,branch
,_
)=get_changeset(ui
,repo
,rev
,authors
)
272 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
275 wr('tagger %s %d %s' % (user
,time
,timezone
))
276 msg
='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag
,
277 rev
,branch
,desc
.split('\n')[0])
278 wr('data %d' % (len(msg
)+1))
281 count
=checkpoint(count
)
def load_authors(filename):
    """Load an author map from a file of 'key = value' lines.

    filename -- path of the author map

    Returns a dict mapping the stripped text before the first '=' to the
    stripped text after it.  A missing file yields an empty dict.  Lines
    not matching 'key = value' are reported on stderr and skipped.
    """
    # NOTE(review): file handling and the skip-on-mismatch flow were
    # reconstructed around the statements that survived in this function --
    # confirm against the pristine file.
    cache = {}
    if not os.path.exists(filename):
        return cache

    lre = re.compile(r'^([^=]+)[ ]*=[ ]*(.+)$')
    l = 0
    f = open(filename, 'r')
    try:
        for line in f:
            l += 1
            m = lre.match(line)
            if m is None:
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # put key:value in cache with surrounding whitespace removed
            cache[m.group(1).strip()] = m.group(2).strip()
    finally:
        f.close()

    # report the number of mappings actually loaded, not the lines read
    sys.stderr.write('Loaded %d authors\n' % len(cache))
    return cache
def load_cache(filename):
    """Load a ':key value' cache file into a dict.

    filename -- path of the cache file

    Each valid line consists of exactly two space-separated fields, the
    first starting with ':'.  The key is stored without the leading ':'
    and the value without its trailing newline.  A missing file yields an
    empty dict.  Invalid lines are reported on stderr and skipped.
    """
    # NOTE(review): file handling and the skip-on-invalid flow were
    # reconstructed around the statements that survived in this function --
    # confirm against the pristine file.
    cache = {}
    if not os.path.exists(filename):
        return cache

    l = 0
    f = open(filename, 'r')
    try:
        for line in f:
            l += 1
            fields = line.split(' ')
            # startswith() instead of fields[0][0]: a line beginning with a
            # space yields an empty first field, which must be rejected
            # rather than raise IndexError
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename, l))
                continue
            # put key:value in cache, key without ^:
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        f.close()
    return cache
def save_cache(filename, cache):
    """Write a cache dict to a file as ':key value' lines.

    filename -- path of the file to (over)write
    cache    -- mapping to store; keys and values are written via str()
    """
    f = open(filename, 'w+')
    try:
        # explicit loop instead of map(): map() used only for its side
        # effects writes nothing where map() is lazy
        for key, value in cache.items():
            f.write(':%s %s\n' % (str(key), str(value)))
    finally:
        # always release the handle so the data is flushed to disk
        f.close()
325 def verify_heads(ui
,repo
,cache
):
328 f
=open(os
.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
)
329 sha1
=f
.readlines()[0].split('\n')[0]
335 # get list of hg's branches to verify, don't take all git has
336 branches
=repo
.branchtags()
337 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
342 sys
.stderr
.write('Verifying branch [%s]\n' % b
)
346 sys
.stderr
.write('Warning: Branch [%s] modified outside hg2git:'
347 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
350 def hg2git(repourl
,m
,marksfile
,headsfile
,tipfile
,authors
={},sob
=False):
353 marks_cache
=load_cache(marksfile
)
354 heads_cache
=load_cache(headsfile
)
355 state_cache
=load_cache(tipfile
)
357 ui
,repo
=setup_repo(repourl
)
359 if not verify_heads(ui
,repo
,heads_cache
):
362 tip
=repo
.changelog
.count()
364 min=int(state_cache
.get('tip',0))
371 for rev
in range(min,max):
372 c
=export_commit(ui
,repo
,rev
,marks_cache
,heads_cache
,last
,max,c
,authors
,sob
)
374 c
=export_tags(ui
,repo
,marks_cache
,min,max,c
,authors
)
376 sys
.stderr
.write('Issued %d commands\n' % c
)
378 state_cache
['tip']=max
379 state_cache
['repo']=repourl
380 save_cache(tipfile
,state_cache
)
384 if __name__
=='__main__':
385 def bail(parser
,opt
):
386 sys
.stderr
.write('Error: No %s option given\n' % opt
)
390 parser
=OptionParser()
392 parser
.add_option("-m","--max",type="int",dest
="max",
393 help="Maximum hg revision to import")
394 parser
.add_option("--marks",dest
="marksfile",
395 help="File to read git-fast-import's marks from")
396 parser
.add_option("--heads",dest
="headsfile",
397 help="File to read last run's git heads from")
398 parser
.add_option("--status",dest
="statusfile",
399 help="File to read status from")
400 parser
.add_option("-r","--repo",dest
="repourl",
401 help="URL of repo to import")
402 parser
.add_option("-s",action
="store_true",dest
="sob",
403 default
=False,help="Enable parsing Signed-off-by lines")
404 parser
.add_option("-A","--authors",dest
="authorfile",
405 help="Read authormap from AUTHORFILE")
407 (options
,args
)=parser
.parse_args()
# all four options are mandatory; each check tests its own option's dest
# (previously every check tested marksfile, so a missing --heads, --status
# or --repo went undetected)
if options.marksfile is None: bail(parser, '--marks')
if options.headsfile is None: bail(parser, '--heads')
if options.statusfile is None: bail(parser, '--status')
if options.repourl is None: bail(parser, '--repo')
418 if options
.authorfile
!=None:
419 a
=load_authors(options
.authorfile
)
421 sys
.exit(hg2git(options
.repourl
,m
,options
.marksfile
,options
.headsfile
,
422 options
.statusfile
,authors
=a
,sob
=options
.sob
))