3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
6 """hg2git.py - A mercurial-to-git filter for git-fast-import(1)
7 Usage: hg2git.py <hg repo url> <marks file> <heads file> <tip file>
10 from mercurial
import repo
,hg
,cmdutil
,util
,ui
,revlog
,node
11 from tempfile
import mkstemp
# Matches a "Signed-off-by:" line of a log message (the O and B may be
# upper or lower case); group 1 is the text after the colon and space.
sob_re = re.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')

# Matches a user field that ends in 'Name <mail>'; group 1 is the name
# part, group 2 the address including its angle brackets.
user_re = re.compile('([^<]+) (<[^>]+>)$')

# Matches a name that is wrapped entirely in double quotes; group 1 is
# the name without the quotes.
user_clean_re = re.compile('^["]([^"]+)["]$')
22 # git branch for hg's default 'HEAD' branch
24 # insert 'checkpoint' command after this many commits or none at all if 0
25 cfg_checkpoint_count
=0
28 sys
.stderr
.write(__doc__
)
33 return myui
,hg
.repository(myui
,url
)
35 def fixup_user(user
,authors
):
37 # if we have an authors table, try to get mapping
38 # by defaulting to the current value of 'user'
39 user
=authors
.get(user
,user
)
40 name
,mail
,m
='','',user_re
.match(user
)
42 # if we don't have 'Name <mail>' syntax, use 'user
43 # <devnull@localhost>' if user contains no '@' and
44 # 'user <user>' otherwise
47 mail
='<devnull@localhost>'
51 # if we have 'Name <mail>' syntax, everything is fine :)
52 name
,mail
=m
.group(1),m
.group(2)
54 # remove any silly quoting from username
55 m2
=user_clean_re
.match(name
)
58 return '%s %s' % (name
,mail
)
60 def get_changeset(ui
,repo
,revision
,authors
):
65 node
=repo
.lookup(revision
)
66 (manifest
,user
,(time
,timezone
),files
,desc
,extra
)=repo
.changelog
.read(node
)
67 tz
="%+03d%02d" % (-timezone
/ 3600, ((-timezone
% 3600) / 60))
68 branch
=get_branch(extra
.get('branch','master'))
69 return (manifest
,fixup_user(user
,authors
),(time
,tz
),files
,desc
,branch
,extra
)
72 return x
and '100755' or '100644'
76 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
78 def checkpoint(count
):
80 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
81 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def get_parent_mark(parent, marks):
    """Return the reference to use for the parent revision `parent`.

    The mark number is `parent + 1`.  If `marks` has an entry stored under
    that number (as a string key), its value is returned; otherwise the
    symbolic ':<mark number>' form is returned.
    """
    mark = parent + 1
    fallback = ':%d' % mark
    return marks.get(str(mark), fallback)
93 """See if two revisions of a file are not equal."""
94 return node
.hex(f1
)!=node
.hex(f2
)
96 def outer_set(dleft
,dright
,l
,c
,r
):
97 """Loop over our repository and find all changed and missing files."""
98 for left
in dleft
.keys():
99 right
=dright
.get(left
,None)
101 # we have the file but our parent hasn't: add to left set
103 elif mismatch(dleft
[left
],right
):
104 # we have it but checksums mismatch: add to center set
106 for right
in dright
.keys():
107 left
=dleft
.get(right
,None)
109 # if parent has file but we don't: add to right set
111 # change is already handled when comparing child against parent
114 def get_filechanges(repo
,revision
,parents
,mleft
):
115 """Given some repository and revision, find all changed/deleted files."""
119 mright
=repo
.changectx(p
).manifest()
124 l
,c
,r
=outer_set(mleft
,mright
,l
,c
,r
)
127 def get_author(logmessage
,committer
,authors
):
128 """As git distinguishes between the author and the committer of a patch, try to
129 extract author by detecting Signed-off-by lines.
131 This walks from the end of the log message towards the top skipping
132 empty lines. Upon the first non-empty line, it walks all Signed-off-by
133 lines upwards to find the first one. For that (if found), it extracts
134 authorship information the usual way (authors table, cleaning, etc.)
136 If no Signed-off-by line is found, this defaults to the committer.
138 This may sound stupid (and it somehow is), but in log messages we
139 may accidentally have lines in the middle starting with
140 "Signed-off-by: foo" and thus matching our detection regex. Prevent
143 loglines
=logmessage
.split('\n')
145 # from tail walk to top skipping empty lines
148 if len(loglines
[i
].strip())==0: continue
151 # walk further upwards to find first sob line, store in 'first'
154 m
=sob_re
.match(loglines
[i
])
158 # if the last non-empty line matches our Signed-Off-by regex: extract username
160 r
=fixup_user(first
.group(1),authors
)
164 def export_commit(ui
,repo
,revision
,marks
,heads
,last
,max,count
,authors
):
165 (_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
166 parents
=repo
.changelog
.parentrevs(revision
)
168 wr('commit refs/heads/%s' % branch
)
169 wr('mark :%d' % (revision
+1))
170 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
171 wr('committer %s %d %s' % (user
,time
,timezone
))
172 wr('data %d' % (len(desc
)+1)) # wtf?
176 src
=heads
.get(branch
,'')
179 # if we have a cached head, this is an incremental import: initialize it
180 # and kill reference so we won't init it again
183 sys
.stderr
.write('Initializing branch [%s] to parent [%s]\n' %
185 link
=src
# avoid making a merge commit for incremental import
186 elif link
=='' and not heads
.has_key(branch
) and revision
>0:
187 # newly created branch and not the first one: connect to parent
188 tmp
=get_parent_mark(parents
[0],marks
)
190 sys
.stderr
.write('Link new branch [%s] to parent [%s]\n' %
192 link
=tmp
# avoid making a merge commit for branch fork
195 l
=last
.get(branch
,revision
)
197 # 1) as this commit implicitly is the child of the most recent
198 # commit of this branch, ignore this parent
199 # 2) ignore nonexistent parents
201 if p
==l
or p
==revision
or p
<0:
203 tmp
=get_parent_mark(p
,marks
)
204 # if we fork off a branch, don't merge with our parent via 'merge'
205 # as we have 'from' already above
208 sys
.stderr
.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
212 last
[branch
]=revision
214 # we need this later to write out tags
215 marks
[str(revision
)]=':%d'%(revision
+1)
217 ctx
=repo
.changectx(str(revision
))
219 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
221 sys
.stderr
.write('Exporting revision %d with %d/%d/%d added/changed/removed files\n' %
222 (revision
,len(added
),len(changed
),len(removed
)))
224 for a
in added
+changed
:
227 wr('M %s inline %s' % (gitmode(man
.execf(a
)),a
))
228 wr('data %d' % len(d
)) # had some trouble with size()
235 return checkpoint(count
)
237 def export_tags(ui
,repo
,marks_cache
,start
,end
,count
,authors
):
240 # ignore latest revision
241 if tag
=='tip': continue
242 rev
=repo
.changelog
.rev(node
)
243 # ignore those tags not in our import range
244 if rev
<start
or rev
>=end
: continue
246 ref
=marks_cache
.get(str(rev
),None)
248 sys
.stderr
.write('Failed to find reference for creating tag'
249 ' %s at r%d\n' % (tag
,rev
))
251 (_
,user
,(time
,timezone
),_
,desc
,branch
,_
)=get_changeset(ui
,repo
,rev
,authors
)
252 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
255 wr('tagger %s %d %s' % (user
,time
,timezone
))
256 msg
='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag
,
257 rev
,branch
,desc
.split('\n')[0])
258 wr('data %d' % (len(msg
)+1))
261 count
=checkpoint(count
)
264 def load_cache(filename
):
266 if not os
.path
.exists(filename
):
270 for line
in f
.readlines():
272 fields
=line
.split(' ')
273 if fields
==None or not len(fields
)==2 or fields
[0][0]!=':':
274 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
276 # put key:value in cache, key without ^:
277 cache
[fields
[0][1:]]=fields
[1].split('\n')[0]
def save_cache(filename, cache):
    """Write `cache` to `filename`, one ':<key> <value>' line per entry.

    Keys and values are converted with str().  The file is opened in 'w+'
    mode, so any previous content is discarded.

    filename -- path of the file to (over)write
    cache    -- mapping whose entries are written out
    """
    f = open(filename, 'w+')
    try:
        # Use an explicit loop rather than map() for the writes: map() is
        # lazy on Python 3, so the side effects would never happen there.
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        # Flush and release the handle even if one of the writes fails.
        f.close()
286 def verify_heads(ui
,repo
,cache
):
288 f
=open(os
.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
)
289 sha1
=f
.readlines()[0].split('\n')[0]
293 for b
in cache
.keys():
294 sys
.stderr
.write('Verifying branch [%s]\n' % b
)
298 sys
.stderr
.write('Warning: Branch [%s] modified outside hg2git:'
299 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
302 def hg2git(repourl
,m
,marksfile
,headsfile
,tipfile
,authors
={}):
305 marks_cache
=load_cache(marksfile
)
306 heads_cache
=load_cache(headsfile
)
307 state_cache
=load_cache(tipfile
)
309 ui
,repo
=setup_repo(repourl
)
311 if not verify_heads(ui
,repo
,heads_cache
):
314 tip
=repo
.changelog
.count()
316 min=int(state_cache
.get('tip',0))
323 for rev
in range(min,max):
324 c
=export_commit(ui
,repo
,rev
,marks_cache
,heads_cache
,last
,tip
,c
,authors
)
326 c
=export_tags(ui
,repo
,marks_cache
,min,max,c
,authors
)
328 sys
.stderr
.write('Issued %d commands\n' % c
)
330 state_cache
['tip']=max
331 state_cache
['repo']=repourl
332 save_cache(tipfile
,state_cache
)
if __name__ == '__main__':
    # The script name plus exactly five positional arguments are required;
    # otherwise exit with whatever usage(1) returns.
    # NOTE(review): the usage line in the module docstring lists only four
    # arguments, while five are unpacked below -- confirm and update it.
    if len(sys.argv) != 6:
        sys.exit(usage(1))

    # Positional order: repository URL, 'm', marks file, heads file, tip file.
    repourl, m, marksfile, headsfile, tipfile = sys.argv[1:]

    # The return value of hg2git() becomes the process exit status.
    sys.exit(hg2git(repourl, m, marksfile, headsfile, tipfile))