3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
10 from mercurial
import repo
,hg
,cmdutil
,util
,ui
,revlog
,node
11 from tempfile
import mkstemp
12 from optparse
import OptionParser
17 # silly regex to catch Signed-off-by lines in log message
18 sob_re
=re
.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
19 # silly regex to see if user field has email address
20 user_re
=re
.compile('([^<]+) (<[^>]+>)$')
21 # silly regex to clean out user names
22 user_clean_re
=re
.compile('^["]([^"]+)["]$')
23 # git branch for hg's default 'HEAD' branch
25 # insert 'checkpoint' command after this many commits or none at all if 0
26 cfg_checkpoint_count
=0
29 sys
.stderr
.write(__doc__
)
34 return myui
,hg
.repository(myui
,url
)
36 def fixup_user(user
,authors
):
38 # if we have an authors table, try to get mapping
39 # by defaulting to the current value of 'user'
40 user
=authors
.get(user
,user
)
41 name
,mail
,m
='','',user_re
.match(user
)
43 # if we don't have 'Name <mail>' syntax, use 'user
44 # <devnull@localhost>' if use contains no at and
45 # 'user <user>' otherwise
48 mail
='<devnull@localhost>'
52 # if we have 'Name <mail>' syntax, everything is fine :)
53 name
,mail
=m
.group(1),m
.group(2)
55 # remove any silly quoting from username
56 m2
=user_clean_re
.match(name
)
59 return '%s %s' % (name
,mail
)
def get_changeset(ui,repo,revision,authors):
    """Read one changeset and return its fields in a git-friendly form.

    ui is not used by this function.
    repo must provide lookup() and changelog.read(); revision is whatever
    repo.lookup() accepts.  authors is passed to fixup_user() unchanged.

    Returns the tuple
      (manifest, user, (time, tz), files, desc, branch, extra)
    where user has been run through fixup_user(), branch through
    get_branch() (using 'master' when extra has no 'branch' entry) and
    tz is the UTC offset rendered as '+HHMM' or '-HHMM'.
    """
    changeset_id=repo.lookup(revision)
    (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(changeset_id)
    # The stored offset is negated before it is formatted (presumably hg
    # counts seconds west of UTC while git wants east of UTC -- TODO confirm).
    # Sign and magnitude are formatted separately: flooring a negative
    # offset such as -03:30 would otherwise yield '-0430'.
    if timezone>0:
        sign='-'
    else:
        sign='+'
    hours,remainder=divmod(abs(timezone),3600)
    tz='%s%02d%02d' % (sign,hours,remainder//60)
    branch=get_branch(extra.get('branch','master'))
    return (manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)
74 return x
and '100755' or '100644'
78 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
80 def checkpoint(count
):
82 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
83 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def get_parent_mark(parent,marks):
    """Return the reference to use for the parent revision `parent`.

    The lookup key is the string form of parent+1.  If marks holds an
    entry under that key, the stored value is returned (presumably a
    SHA1 loaded from the marks cache); otherwise the mark is returned
    in ':<parent+1>' syntax.
    """
    mark_id=parent+1
    key=str(mark_id)
    if key in marks:
        return marks[key]
    return ':%d' % mark_id
95 """See if two revisions of a file are not equal."""
96 return node
.hex(f1
)!=node
.hex(f2
)
98 def outer_set(dleft
,dright
,l
,c
,r
):
99 """Loop over our repository and find all changed and missing files."""
100 for left
in dleft
.keys():
101 right
=dright
.get(left
,None)
103 # we have the file but our parent hasn't: add to left set
105 elif mismatch(dleft
[left
],right
):
106 # we have it but checksums mismatch: add to center set
108 for right
in dright
.keys():
109 left
=dleft
.get(right
,None)
111 # if parent has file but we don't: add to right set
113 # change is already handled when comparing child against parent
116 def get_filechanges(repo
,revision
,parents
,mleft
):
117 """Given some repository and revision, find all changed/deleted files."""
121 mright
=repo
.changectx(p
).manifest()
126 l
,c
,r
=outer_set(mleft
,mright
,l
,c
,r
)
129 def get_author(logmessage
,committer
,authors
):
130 """As git distincts between author and committer of a patch, try to
131 extract author by detecting Signed-off-by lines.
133 This walks from the end of the log message towards the top skipping
134 empty lines. Upon the first non-empty line, it walks all Signed-off-by
135 lines upwards to find the first one. For that (if found), it extracts
136 authorship information the usual way (authors table, cleaning, etc.)
138 If no Signed-off-by line is found, this defaults to the committer.
140 This may sound stupid (and it somehow is), but in log messages we
141 accidentially may have lines in the middle starting with
142 "Signed-off-by: foo" and thus matching our detection regex. Prevent
145 loglines
=logmessage
.split('\n')
147 # from tail walk to top skipping empty lines
150 if len(loglines
[i
].strip())==0: continue
153 # walk further upwards to find first sob line, store in 'first'
156 m
=sob_re
.match(loglines
[i
])
160 # if the last non-empty line matches our Signed-Off-by regex: extract username
162 r
=fixup_user(first
.group(1),authors
)
166 def export_commit(ui
,repo
,revision
,marks
,heads
,last
,max,count
,authors
,sob
):
167 (_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
168 parents
=repo
.changelog
.parentrevs(revision
)
170 wr('commit refs/heads/%s' % branch
)
171 wr('mark :%d' % (revision
+1))
173 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
174 wr('committer %s %d %s' % (user
,time
,timezone
))
175 wr('data %d' % (len(desc
)+1)) # wtf?
179 src
=heads
.get(branch
,'')
182 # if we have a cached head, this is an incremental import: initialize it
183 # and kill reference so we won't init it again
186 sys
.stderr
.write('Initializing branch [%s] to parent [%s]\n' %
188 link
=src
# avoid making a merge commit for incremental import
189 elif link
=='' and not heads
.has_key(branch
) and revision
>0:
190 # newly created branch and not the first one: connect to parent
191 tmp
=get_parent_mark(parents
[0],marks
)
193 sys
.stderr
.write('Link new branch [%s] to parent [%s]\n' %
195 link
=tmp
# avoid making a merge commit for branch fork
198 l
=last
.get(branch
,revision
)
200 # 1) as this commit implicitely is the child of the most recent
201 # commit of this branch, ignore this parent
202 # 2) ignore nonexistent parents
204 if p
==l
or p
==revision
or p
<0:
206 tmp
=get_parent_mark(p
,marks
)
207 # if we fork off a branch, don't merge with our parent via 'merge'
208 # as we have 'from' already above
211 sys
.stderr
.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
215 last
[branch
]=revision
217 # we need this later to write out tags
218 marks
[str(revision
)]=':%d'%(revision
+1)
220 ctx
=repo
.changectx(str(revision
))
222 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
224 sys
.stderr
.write('Exporting revision %d/%d with %d/%d/%d added/changed/removed files\n' %
225 (revision
,max,len(added
),len(changed
),len(removed
)))
227 for a
in added
+changed
:
230 wr('M %s inline %s' % (gitmode(man
.execf(a
)),a
))
231 wr('data %d' % len(d
)) # had some trouble with size()
238 return checkpoint(count
)
240 def export_tags(ui
,repo
,marks_cache
,start
,end
,count
,authors
):
243 # ignore latest revision
244 if tag
=='tip': continue
245 rev
=repo
.changelog
.rev(node
)
246 # ignore those tags not in our import range
247 if rev
<start
or rev
>=end
: continue
249 ref
=marks_cache
.get(str(rev
),None)
251 sys
.stderr
.write('Failed to find reference for creating tag'
252 ' %s at r%d\n' % (tag
,rev
))
254 (_
,user
,(time
,timezone
),_
,desc
,branch
,_
)=get_changeset(ui
,repo
,rev
,authors
)
255 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
258 wr('tagger %s %d %s' % (user
,time
,timezone
))
259 msg
='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag
,
260 rev
,branch
,desc
.split('\n')[0])
261 wr('data %d' % (len(msg
)+1))
264 count
=checkpoint(count
)
267 def load_cache(filename
):
269 if not os
.path
.exists(filename
):
273 for line
in f
.readlines():
275 fields
=line
.split(' ')
276 if fields
==None or not len(fields
)==2 or fields
[0][0]!=':':
277 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
279 # put key:value in cache, key without ^:
280 cache
[fields
[0][1:]]=fields
[1].split('\n')[0]
def save_cache(filename,cache):
    """Write the mapping `cache` to `filename`, one entry per line.

    Each entry is written as ':<key> <value>' followed by a newline, with
    key and value converted via str().  The file is truncated first, so an
    empty cache produces an empty file.
    """
    f=open(filename,'w+')
    try:
        # An explicit loop is used instead of map(): map() is lazy on
        # Python 3 and would silently write nothing.
        for key,value in cache.items():
            f.write(':%s %s\n' % (str(key),str(value)))
    finally:
        # Close even if a write fails so the handle is not leaked.
        f.close()
289 def verify_heads(ui
,repo
,cache
):
291 f
=open(os
.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
)
292 sha1
=f
.readlines()[0].split('\n')[0]
296 # get list of hg's branches to verify, don't take all git has
297 branches
=repo
.branchtags()
298 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
303 sys
.stderr
.write('Verifying branch [%s]\n' % b
)
307 sys
.stderr
.write('Warning: Branch [%s] modified outside hg2git:'
308 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
311 def hg2git(repourl
,m
,marksfile
,headsfile
,tipfile
,authors
={},sob
=False):
314 marks_cache
=load_cache(marksfile
)
315 heads_cache
=load_cache(headsfile
)
316 state_cache
=load_cache(tipfile
)
318 ui
,repo
=setup_repo(repourl
)
320 if not verify_heads(ui
,repo
,heads_cache
):
323 tip
=repo
.changelog
.count()
325 min=int(state_cache
.get('tip',0))
332 for rev
in range(min,max):
333 c
=export_commit(ui
,repo
,rev
,marks_cache
,heads_cache
,last
,tip
,c
,authors
,sob
)
335 c
=export_tags(ui
,repo
,marks_cache
,min,max,c
,authors
)
337 sys
.stderr
.write('Issued %d commands\n' % c
)
339 state_cache
['tip']=max
340 state_cache
['repo']=repourl
341 save_cache(tipfile
,state_cache
)
345 if __name__
=='__main__':
346 def bail(parser
,opt
):
347 sys
.stderr
.write('Error: No %s option given\n' % opt
)
351 parser
=OptionParser()
353 parser
.add_option("-m","--max",type="int",dest
="max",
354 help="Maximum hg revision to import")
355 parser
.add_option("--marks",dest
="marksfile",
356 help="File to read git-fast-import's marks from")
357 parser
.add_option("--heads",dest
="headsfile",
358 help="File to read last run's git heads from")
359 parser
.add_option("--status",dest
="statusfile",
360 help="File to read status from")
361 parser
.add_option("-r","--repo",dest
="repourl",
362 help="URL of repo to import")
363 parser
.add_option("-s",action
="store_true",dest
="sob",
364 default
=False,help="Enable parsing Signed-off-by lines")
366 (options
,args
)=parser
.parse_args()
369 if options
.max!=None: m
=options
.max
# All four options are mandatory.  Each check must test its own option;
# testing options.marksfile every time left --heads, --status and --repo
# unvalidated.
if options.marksfile is None: bail(parser,'--marks')
if options.headsfile is None: bail(parser,'--heads')
if options.statusfile is None: bail(parser,'--status')
if options.repourl is None: bail(parser,'--repo')

# hg2git() takes (repourl, m, marksfile, headsfile, tipfile, ...); the
# file given via --status is the tip/state file, not the heads file.
sys.exit(hg2git(options.repourl,m,options.marksfile,options.headsfile,
                options.statusfile,sob=options.sob))