From 8d433b85c23aff3b43aa2b48520048db9ce50752 Mon Sep 17 00:00:00 2001 From: Rocco Rutte Date: Wed, 7 Mar 2007 01:52:58 +0000 Subject: [PATCH] Speedup: Use manifest and compare checksums to detect changes Instead of feeding in everything or only something and getting merges wrong, build up a list of changed (incl. added) and deleted files by 1) comparing manifests (deleted, added) and 2) comparing checksums if a file is present in both parent and child (changed). The hg-crew and mutt imports now complete in under 15 minutes and md5 sums match. Thanks to Theodore Tso for the hint. While at it, fix a regression where, at the start of an incremental import, we always merged a branch in addition to initializing it. A single test showed that the new detection also gets starting off from a merge commit right. Signed-off-by: Rocco Rutte --- hg2git.py | 82 +++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/hg2git.py b/hg2git.py index b00a82e..4d67d9b 100644 --- a/hg2git.py +++ b/hg2git.py @@ -7,7 +7,7 @@ Usage: hg2git.py """ -from mercurial import repo,hg,cmdutil,util,ui,revlog +from mercurial import repo,hg,cmdutil,util,ui,revlog,node from tempfile import mkstemp import re import sys @@ -61,21 +61,48 @@ def checkpoint(count): return count def get_parent_mark(parent,marks): - p=marks.get(str(parent),None) - if p==None: - # if we didn't see parent previously, assume we saw it in this run - p=':%d' % (parent+1) - return p + """Get the mark for some parent. 
+ If we saw it in the current session, return :%d syntax and + otherwise the SHA1 from the cache.""" + return marks.get(str(parent+1),':%d' % (parent+1)) + +def mismatch(x,f1,f2): + """See if two revisions of a file are not equal.""" + return node.hex(f1)!=node.hex(f2) + +def outer_set(dleft,dright,l,r): + """Loop over our repository in and find all changed and missing files.""" + for left in dleft.keys(): + right=dright.get(left,None) + if right==None or mismatch('A',dleft[left],right): + # if either have the current file not in parent or the + # checksums differ: add it to changed files + l.append(left) + for right in dright.keys(): + left=dleft.get(right,None) + if left==None: + # if we have a file in the parent but not our manifest, + # add it to deleted files; checksums are checked earlier + r.append(right) + return l,r + +def get_filechanges(repo,revision,parents,mleft): + """Given some repository and revision, find all changed/deleted files.""" + l,r=[],[] + for p in parents: + if p<0: continue + mright=repo.changectx(p).manifest() + dleft=mleft.keys() + dleft.sort() + dright=mright.keys() + dright.sort() + l,r=outer_set(mleft,mright,l,r) + return l,r def export_commit(ui,repo,revision,marks,heads,last,max,count): - sys.stderr.write('Exporting revision %d (tip %d) as [:%d]\n' % (revision,max,revision+1)) - (_,user,(time,timezone),files,desc,branch,_)=get_changeset(ui,repo,revision) parents=repo.changelog.parentrevs(revision) - # we need this later to write out tags - marks[str(revision)]=':%d'%(revision+1) - wr('commit refs/heads/%s' % branch) wr('mark :%d' % (revision+1)) wr('committer %s %d %s' % (user,time,timezone)) @@ -93,7 +120,7 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): sys.stderr.write('Initializing branch [%s] to parent [%s]\n' % (branch,src)) link=src # avoid making a merge commit for incremental import - elif not heads.has_key(branch) and revision>0: + elif link=='' and not heads.has_key(branch) and revision>0: # newly 
created branch and not the first one: connect to parent tmp=get_parent_mark(parents[0],marks) wr('from %s' % tmp) @@ -111,8 +138,8 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): if p==l or p==revision or p<0: continue tmp=get_parent_mark(p,marks) - # if we fork off a branch, don't merge via 'merge' as we have - # 'from' already above + # if we fork off a branch, don't merge with our parent via 'merge' + # as we have 'from' already above if tmp==link: continue sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' % @@ -121,27 +148,26 @@ def export_commit(ui,repo,revision,marks,heads,last,max,count): last[branch]=revision heads[branch]='' - - # just wipe the branch clean, all full manifest contents - wr('deleteall') + # we need this later to write out tags + marks[str(revision)]=':%d'%(revision+1) ctx=repo.changectx(str(revision)) man=ctx.manifest() + added,removed=get_filechanges(repo,revision,parents,man) - #for f in man.keys(): - # fctx=ctx.filectx(f) - # d=fctx.data() - # wr('M %s inline %s' % (gitmode(man.execf(f)),f)) - # wr('data %d' % len(d)) # had some trouble with size() - # wr(d) + sys.stderr.write('Exporting revision %d with %d changed/%d removed files\n' % + (revision,len(added),len(removed))) - for fctx in ctx.filectxs(): - f=fctx.path() + for a in added: + fctx=ctx.filectx(a) d=fctx.data() - wr('M %s inline %s' % (gitmode(man.execf(f)),f)) + wr('M %s inline %s' % (gitmode(man.execf(a)),a)) wr('data %d' % len(d)) # had some trouble with size() wr(d) + for r in removed: + wr('D %s' % r) + wr() return checkpoint(count) @@ -153,8 +179,8 @@ def export_tags(ui,repo,cache,count): rev=repo.changelog.rev(node) ref=cache.get(str(rev),None) if ref==None: - sys.stderr.write('Failed to find reference for creating tag' - ' %s at r%d\n' % (tag,rev)) + #sys.stderr.write('Failed to find reference for creating tag' + # ' %s at r%d\n' % (tag,rev)) continue (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev) 
sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref)) -- 2.11.4.GIT