3 # Copyright (c) 2007 Rocco Rutte <pdmef@gmx.net>
4 # License: MIT <http://www.opensource.org/licenses/mit-license.php>
6 from mercurial
import repo
,hg
,cmdutil
,util
,ui
,revlog
,node
7 from tempfile
import mkstemp
8 from optparse
import OptionParser
13 # silly regex to catch Signed-off-by lines in log message
14 sob_re
=re
.compile('^Signed-[Oo]ff-[Bb]y: (.+)$')
15 # silly regex to see if user field has email address
16 user_re
=re
.compile('([^<]+) (<[^>]+>)$')
17 # silly regex to clean out user names
18 user_clean_re
=re
.compile('^["]([^"]+)["]$')
19 # git branch for hg's default 'HEAD' branch
21 # insert 'checkpoint' command after this many commits or none at all if 0
22 cfg_checkpoint_count
=0
23 # write some progress message every this many file contents written
24 cfg_export_boundary
=1000
28 return myui
,hg
.repository(myui
,url
)
30 def fixup_user(user
,authors
):
32 # if we have an authors table, try to get mapping
33 # by defaulting to the current value of 'user'
34 user
=authors
.get(user
,user
)
35 name
,mail
,m
='','',user_re
.match(user
)
37 # if we don't have 'Name <mail>' syntax, use 'user
38 # <devnull@localhost>' if user contains no '@' and
39 # 'user <user>' otherwise
42 mail
='<devnull@localhost>'
46 # if we have 'Name <mail>' syntax, everything is fine :)
47 name
,mail
=m
.group(1),m
.group(2)
49 # remove any silly quoting from username
50 m2
=user_clean_re
.match(name
)
53 return '%s %s' % (name
,mail
)
def get_changeset(ui,repo,revision,authors={}):
  """Read one changeset and return its fields in export-ready form.

  Looks up `revision` in `repo`, reads the matching changelog entry and
  returns the tuple

    (node, manifest, user, (time, tz), files, desc, branch, extra)

  where `user` has been passed through fixup_user() with the `authors`
  table, `branch` is get_branch() applied to the 'branch' entry of `extra`
  (defaulting to 'master'), and `tz` is a '+HHMM' / '-HHMM' string.

  `ui` is accepted for signature symmetry with the other helpers and is
  not used here.
  """
  node=repo.lookup(revision)
  (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
  # The offset read from the changelog is negated before formatting.
  # NOTE(review): presumably hg stores seconds west of UTC while git wants
  # an east-positive offset -- confirm against the Mercurial docs.
  offset=-timezone
  sign='+'
  if offset<0:
    sign,offset='-',-offset
  # Format sign and magnitude separately: floor-dividing a negative offset
  # rounds the hour away from zero, turning e.g. -03:30 into -04:30.
  hours,remainder=divmod(offset,3600)
  tz='%s%02d%02d' % (sign,hours,remainder//60)
  branch=get_branch(extra.get('branch','master'))
  return (node,manifest,fixup_user(user,authors),(time,tz),files,desc,branch,extra)
68 return x
and '100755' or '100644'
72 #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))
74 def checkpoint(count
):
76 if cfg_checkpoint_count
>0 and count
%cfg_checkpoint
_count
==0:
77 sys
.stderr
.write("Checkpoint after %d commits\n" % count
)
def get_parent_mark(parent,marks):
  """Return the reference to use for the revision `parent`.

  The lookup key is the string form of parent+1. If `marks` has an entry
  under that key, that cached value is returned; otherwise the result is
  the mark syntax ':<parent+1>'.
  """
  mark=parent+1
  fallback=':%d' % mark
  key=str(mark)
  if key in marks:
    return marks[key]
  return fallback
89 """See if two revisions of a file are not equal."""
90 return node
.hex(f1
)!=node
.hex(f2
)
92 def outer_set(dleft
,dright
,l
,c
,r
):
93 """Loop over our repository and find all changed and missing files."""
94 for left
in dleft
.keys():
95 right
=dright
.get(left
,None)
97 # we have the file but our parent hasn't: add to left set
99 elif mismatch(dleft
[left
],right
):
100 # we have it but checksums mismatch: add to center set
102 for right
in dright
.keys():
103 left
=dleft
.get(right
,None)
105 # if parent has file but we don't: add to right set
107 # change is already handled when comparing child against parent
110 def get_filechanges(repo
,revision
,parents
,mleft
):
111 """Given some repository and revision, find all changed/deleted files."""
115 mright
=repo
.changectx(p
).manifest()
120 l
,c
,r
=outer_set(mleft
,mright
,l
,c
,r
)
123 def get_author(logmessage
,committer
,authors
):
124 """As git distinguishes between author and committer of a patch, try to
125 extract author by detecting Signed-off-by lines.
127 This walks from the end of the log message towards the top skipping
128 empty lines. Upon the first non-empty line, it walks all Signed-off-by
129 lines upwards to find the first one. For that (if found), it extracts
130 authorship information the usual way (authors table, cleaning, etc.)
132 If no Signed-off-by line is found, this defaults to the committer.
134 This may sound stupid (and it somehow is), but in log messages we
135 accidentally may have lines in the middle starting with
136 "Signed-off-by: foo" and thus matching our detection regex. Prevent
139 loglines
=logmessage
.split('\n')
141 # from tail walk to top skipping empty lines
144 if len(loglines
[i
].strip())==0: continue
147 # walk further upwards to find first sob line, store in 'first'
150 m
=sob_re
.match(loglines
[i
])
154 # if the last non-empty line matches our Signed-Off-by regex: extract username
156 r
=fixup_user(first
.group(1),authors
)
160 def export_file_contents(ctx
,manifest
,files
):
165 fctx
=ctx
.filectx(file)
167 wr('M %s inline %s' % (gitmode(manifest
.execf(file)),file))
168 wr('data %d' % len(d
)) # had some trouble with size()
171 if count
%cfg_export
_boundary
==0:
172 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
173 if max>cfg_export_boundary
:
174 sys
.stderr
.write('Exported %d/%d files\n' % (count
,max))
176 def is_merge(parents
):
178 for parent
in parents
:
183 def export_commit(ui
,repo
,revision
,marks
,heads
,last
,max,count
,authors
,sob
):
184 (revnode
,_
,user
,(time
,timezone
),files
,desc
,branch
,_
)=get_changeset(ui
,repo
,revision
,authors
)
185 parents
=repo
.changelog
.parentrevs(revision
)
187 wr('commit refs/heads/%s' % branch
)
188 wr('mark :%d' % (revision
+1))
190 wr('author %s %d %s' % (get_author(desc
,user
,authors
),time
,timezone
))
191 wr('committer %s %d %s' % (user
,time
,timezone
))
192 wr('data %d' % (len(desc
)+1)) # wtf?
196 src
=heads
.get(branch
,'')
199 # if we have a cached head, this is an incremental import: initialize it
200 # and kill reference so we won't init it again
203 sys
.stderr
.write('Initializing branch [%s] to parent [%s]\n' %
205 link
=src
# avoid making a merge commit for incremental import
206 elif link
=='' and not heads
.has_key(branch
) and revision
>0:
207 # newly created branch and not the first one: connect to parent
208 tmp
=get_parent_mark(parents
[0],marks
)
210 sys
.stderr
.write('Link new branch [%s] to parent [%s]\n' %
212 link
=tmp
# avoid making a merge commit for branch fork
215 l
=last
.get(branch
,revision
)
217 # 1) as this commit implicitly is the child of the most recent
218 # commit of this branch, ignore this parent
219 # 2) ignore nonexistent parents
221 if p
==l
or p
==revision
or p
<0:
223 tmp
=get_parent_mark(p
,marks
)
224 # if we fork off a branch, don't merge with our parent via 'merge'
225 # as we have 'from' already above
228 sys
.stderr
.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
232 last
[branch
]=revision
234 # we need this later to write out tags
235 marks
[str(revision
)]=':%d'%(revision
+1)
237 ctx
=repo
.changectx(str(revision
))
239 added
,changed
,removed
,type=[],[],[],''
242 # first revision: feed in full manifest
245 elif is_merge(parents
):
246 # later merge revision: feed in changed manifest
247 # for many files comparing checksums is expensive so only do it for
248 # merges where we really need it due to hg's revlog logic
249 added
,changed
,removed
=get_filechanges(repo
,revision
,parents
,man
)
250 type='thorough delta'
252 # later non-merge revision: feed in changed manifest
253 # if we have exactly one parent, just take the changes from the
254 # manifest without expensively comparing checksums
255 f
=repo
.status(repo
.lookup(parents
[0]),revnode
)[:3]
256 added
,changed
,removed
=f
[1],f
[0],f
[2]
259 sys
.stderr
.write('Exporting %s revision %d/%d with %d/%d/%d added/changed/removed files\n' %
260 (type,revision
+1,max,len(added
),len(changed
),len(removed
)))
262 map(lambda r
: wr('D %s' % r
),removed
)
263 export_file_contents(ctx
,man
,added
+changed
)
266 return checkpoint(count
)
268 def export_tags(ui
,repo
,marks_cache
,start
,end
,count
,authors
):
271 # ignore latest revision
272 if tag
=='tip': continue
273 rev
=repo
.changelog
.rev(node
)
274 # ignore those tags not in our import range
275 if rev
<start
or rev
>=end
: continue
277 ref
=get_parent_mark(rev
,marks_cache
)
279 sys
.stderr
.write('Failed to find reference for creating tag'
280 ' %s at r%d\n' % (tag
,rev
))
282 sys
.stderr
.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag
,rev
,ref
))
283 wr('reset refs/tags/%s' % tag
)
286 count
=checkpoint(count
)
289 def load_authors(filename
):
291 if not os
.path
.exists(filename
):
295 lre
=re
.compile('^([^=]+)[ ]*=[ ]*(.+)$')
296 for line
in f
.readlines():
300 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
302 # put key:value in cache, both stripped of surrounding whitespace
303 cache
[m
.group(1).strip()]=m
.group(2).strip()
305 sys
.stderr
.write('Loaded %d authors\n' % l
)
308 def load_cache(filename
):
310 if not os
.path
.exists(filename
):
314 for line
in f
.readlines():
316 fields
=line
.split(' ')
317 if fields
==None or not len(fields
)==2 or fields
[0][0]!=':':
318 sys
.stderr
.write('Invalid file format in [%s], line %d\n' % (filename
,l
))
320 # put key:value in cache, key without ^:
321 cache
[fields
[0][1:]]=fields
[1].split('\n')[0]
325 def save_cache(filename
,cache
):
326 f
=open(filename
,'w+')
327 map(lambda x
: f
.write(':%s %s\n' % (str(x
),str(cache
.get(x
)))),cache
.keys())
330 def verify_heads(ui
,repo
,cache
,force
):
333 f
=open(os
.getenv('GIT_DIR','/dev/null')+'/refs/heads/'+branch
)
334 sha1
=f
.readlines()[0].split('\n')[0]
340 branches
=repo
.branchtags()
341 l
=[(-repo
.changelog
.rev(n
), n
, t
) for t
, n
in branches
.items()]
344 # get list of hg's branches to verify, don't take all git has
349 if sha1
!=None and c
!=None:
350 sys
.stderr
.write('Verifying branch [%s]\n' % b
)
352 sys
.stderr
.write('Error: Branch [%s] modified outside hg-fast-export:'
353 '\n%s (repo) != %s (cache)\n' % (b
,sha1
,c
))
354 if not force
: return False
356 # verify that branch has exactly one head
358 for h
in repo
.heads():
359 (_
,_
,_
,_
,_
,_
,branch
,_
)=get_changeset(ui
,repo
,h
)
360 if t
.get(branch
,False):
361 sys
.stderr
.write('Error: repository has at least one unnamed head: hg r%s\n' %
362 repo
.changelog
.rev(h
))
363 if not force
: return False
368 def hg2git(repourl
,m
,marksfile
,headsfile
,tipfile
,authors
={},sob
=False,force
=False):
371 marks_cache
=load_cache(marksfile
)
372 heads_cache
=load_cache(headsfile
)
373 state_cache
=load_cache(tipfile
)
375 ui
,repo
=setup_repo(repourl
)
377 if not verify_heads(ui
,repo
,heads_cache
,force
):
380 tip
=repo
.changelog
.count()
382 min=int(state_cache
.get('tip',0))
389 for rev
in range(min,max):
390 c
=export_commit(ui
,repo
,rev
,marks_cache
,heads_cache
,last
,max,c
,authors
,sob
)
392 c
=export_tags(ui
,repo
,marks_cache
,min,max,c
,authors
)
394 sys
.stderr
.write('Issued %d commands\n' % c
)
396 state_cache
['tip']=max
397 state_cache
['repo']=repourl
398 save_cache(tipfile
,state_cache
)
402 if __name__
=='__main__':
403 def bail(parser
,opt
):
404 sys
.stderr
.write('Error: No %s option given\n' % opt
)
408 parser
=OptionParser()
410 parser
.add_option("-m","--max",type="int",dest
="max",
411 help="Maximum hg revision to import")
412 parser
.add_option("--marks",dest
="marksfile",
413 help="File to read git-fast-import's marks from")
414 parser
.add_option("--heads",dest
="headsfile",
415 help="File to read last run's git heads from")
416 parser
.add_option("--status",dest
="statusfile",
417 help="File to read status from")
418 parser
.add_option("-r","--repo",dest
="repourl",
419 help="URL of repo to import")
420 parser
.add_option("-s",action
="store_true",dest
="sob",
421 default
=False,help="Enable parsing Signed-off-by lines")
422 parser
.add_option("-A","--authors",dest
="authorfile",
423 help="Read authormap from AUTHORFILE")
424 parser
.add_option("-f","--force",action
="store_true",dest
="force",
425 default
=False,help="Ignore validation errors by force")
427 (options
,args
)=parser
.parse_args()
430 if options
.max!=None: m
=options
.max
# Each of these options must be given: report the first one that is missing.
# Every check tests the dest of the option it names (the earlier version
# tested options.marksfile four times, so a missing --heads, --status or
# --repo was never reported).
if options.marksfile is None: bail(parser,'--marks')
if options.headsfile is None: bail(parser,'--heads')
if options.statusfile is None: bail(parser,'--status')
if options.repourl is None: bail(parser,'--repo')
438 if options
.authorfile
!=None:
439 a
=load_authors(options
.authorfile
)
441 sys
.exit(hg2git(options
.repourl
,m
,options
.marksfile
,options
.headsfile
,
442 options
.statusfile
,authors
=a
,sob
=options
.sob
,force
=options
.force
))