mirroring: reduce bloat caused by mirrors using git fast-import
author Kyle J. McKay <mackyle@gmail.com>
Thu, 17 Apr 2014 23:59:23 +0000 (17 16:59 -0700)
committer Kyle J. McKay <mackyle@gmail.com>
Thu, 17 Apr 2014 23:59:23 +0000 (17 16:59 -0700)
When mirroring is enabled, several of the foreign VCS mirror types
make use of git fast-import to import revisions from the foreign VCS.

Unfortunately, by design, git fast-import creates horrific packs.
They have a poor object order and have poor object deltas.  This
creates unnecessary bloat.  The bloat can be more than 4x!

While a standard git repack -a -d will correct the object order,
it will not correct the bloat since that's primarily caused by
poor object deltas.

To fix this, arrange for any git fast-import created packs to be
repacked with the --no-reuse-delta option before they participate
in the normal repack that combines everything but does not use the
--no-reuse-delta option.

Only do this at normal gc time.

jobd/gc.sh
jobd/update.sh
shlib.sh
taskd/clone.sh

index 698f3ff..8adca30 100755 (executable)
--- a/jobd/gc.sh
+++ b/jobd/gc.sh
@@ -5,6 +5,9 @@
 set -e
 trap 'if [ $? != 0 ]; then echo "gc failed dir: $PWD" >&2; fi; rm -f "$bang_log"' EXIT
 
+# packing options
+packopts='--window=50 --window-memory=1g --depth=50'
+
 umask 002
 [ "$cfg_permission_control" != "Hooks" ] || umask 000
 
@@ -34,6 +37,25 @@ createlock() {
        return 1
 }
 
+# if the current directory is_gfi_mirror then repack all packs listed in gfi-packs
+repack_gfi_packs() {
+       is_gfi_mirror || return 0
+       [ -s gfi-packs ] || return 0
+       while IFS=': ' read -r _pack _junk; do
+               if [ -s "$_pack" -a -s "${_pack%.pack}.idx" ]; then
+                       git show-index < "${_pack%.pack}.idx" | cut -d ' ' -f 2
+               fi
+       done < gfi-packs | \
+       git pack-objects $packopts --no-reuse-delta --threads=1 $quiet objects/pack/packtmp | \
+       while read -r _newpack; do
+               rm -f objects/pack/pack-$_newpack.*
+               ln objects/pack/packtmp-$_newpack.pack objects/pack/pack-$_newpack.pack
+               ln objects/pack/packtmp-$_newpack.idx objects/pack/pack-$_newpack.idx
+               rm -f objects/pack/packtmp-$_newpack.*
+       done
+       rm -f gfi-packs
+}
+
 proj="$1"
 cd "$cfg_reporoot/$proj.git"
 
@@ -113,7 +135,8 @@ fi
 
 quiet=; [ -n "$show_progress" ] || quiet=-q
 git pack-refs --all
-git repack -a -d --window=50 --window-memory=1G --depth=50 -l $quiet
+repack_gfi_packs
+git repack $packopts -a -d -l $quiet
 git prune
 git update-server-info
 # darcs:// mirrors have a xxx.log file that will grow endlessly
index 56c7bb5..d9dc4d5 100755 (executable)
--- a/jobd/update.sh
+++ b/jobd/update.sh
@@ -16,8 +16,22 @@ git_darcs_fetch() {
        { read -r _err1 || :; read -r _err2 || :; } <<-EOT
        $(
                exec 4>&3 3>&1 1>&4 4>&-
-               { _e1=0; "$cfg_basedir"/bin/darcs-fast-export --export-marks=$(pwd)/dfe-marks --import-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \
-               { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks --import-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; }
+               {
+                       _e1=0
+                       "$cfg_basedir"/bin/darcs-fast-export \
+                               --export-marks="$(pwd)/dfe-marks" \
+                               --import-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$?
+                       echo $_e1 >&3
+               } | \
+               {
+                       _e2=0
+                       git fast-import \
+                               --export-marks="$(pwd)/gfi-marks" \
+                               --export-pack-edges="$(pwd)/gfi-packs" \
+                               --import-marks="$(pwd)/gfi-marks" \
+                               --force 3>&- || _e2=$?
+                       echo $_e2 >&3
+               }
        )
        EOT
        exec 3>&-
@@ -33,8 +47,22 @@ git_bzr_fetch() {
        { read -r _err1 || :; read -r _err2 || :; } <<-EOT
        $(
                exec 4>&3 3>&1 1>&4 4>&-
-               { _e1=0; bzr fast-export --export-marks=$(pwd)/dfe-marks --import-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \
-               { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks --import-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; }
+               {
+                       _e1=0
+                       bzr fast-export \
+                               --export-marks="$(pwd)/dfe-marks" \
+                               --import-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$?
+                       echo $_e1 >&3
+               } | \
+               {
+                       _e2=0
+                       git fast-import \
+                               --export-marks="$(pwd)/gfi-marks" \
+                               --export-pack-edges="$(pwd)/gfi-packs" \
+                               --import-marks="$(pwd)/gfi-marks" \
+                               --force 3>&- || _e2=$?
+                       echo $_e2 >&3
+               }
        )
        EOT
        exec 3>&-
index 373e267..e19b8bf 100644 (file)
--- a/shlib.sh
+++ b/shlib.sh
@@ -222,6 +222,33 @@ get_repo_list() {
        done
 }
 
+# returns true if the passed in git dir (defaults to ".") is a mirror using git fast-import
+is_gfi_mirror() {
+       _gitdir="${1-.}"
+       # always return false for non-mirrors
+       [ ! -e "$_gitdir/.nofetch" ] || return 1
+       _url="$(GIT_DIR="$_gitdir" config_get baseurl 2>/dev/null || :)"
+       case "$_url" in
+               svn://* | svn+http://* | svn+https://*)
+                       # Don't think git-svn currently uses git fast-import
+                       return 1
+                       ;;
+               darcs://*)
+                       # darcs mirrors use git fast-import
+                       return 0
+                       ;;
+               bzr://*)
+                       # bzr mirrors use git fast-import
+                       return 0
+                       ;;
+               hg+http://* | hg+https://*)
+                       # hg mirrors use git fast-import
+                       return 0
+                       ;;
+       esac
+       # assume it does not use git fast-import
+       return 1
+}
 
 # hg-fast-export | git fast-import with error handling in current directory GIT_DIR
 git_hg_fetch() {
@@ -250,7 +277,10 @@ git_hg_fetch() {
                {
                        _e2=0
                        rm -f hg2git-marks.new
-                       git fast-import --force --export-marks="$(pwd)/hg2git-marks.new" 3>&- || _e2=$?
+                       git fast-import \
+                               --export-marks="$(pwd)/hg2git-marks.new" \
+                               --export-pack-edges="$(pwd)/gfi-packs" \
+                               --force 3>&- || _e2=$?
                        echo $_e2 >&3
                }
        )
index 9e2ec94..2facaf8 100755 (executable)
--- a/taskd/clone.sh
+++ b/taskd/clone.sh
@@ -12,8 +12,20 @@ git_darcs_fetch() {
        { read -r _err1 || :; read -r _err2 || :; } <<-EOT
        $(
                exec 4>&3 3>&1 1>&4 4>&-
-               { _e1=0; "$cfg_basedir"/bin/darcs-fast-export --export-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \
-               { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; }
+               {
+                       _e1=0
+                       "$cfg_basedir"/bin/darcs-fast-export \
+                               --export-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$?
+                       echo $_e1 >&3
+               } | \
+               {
+                       _e2=0
+                       git fast-import \
+                               --export-marks="$(pwd)/gfi-marks" \
+                               --export-pack-edges="$(pwd)/gfi-packs" \
+                               --force 3>&- || _e2=$?
+                       echo $_e2 >&3
+               }
        )
        EOT
        exec 3>&-
@@ -29,8 +41,20 @@ git_bzr_fetch() {
        { read -r _err1 || :; read -r _err2 || :; } <<-EOT
        $(
                exec 4>&3 3>&1 1>&4 4>&-
-               { _e1=0; bzr fast-export --export-marks=$(pwd)/bfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \
-               { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; }
+               {
+                       _e1=0
+                       bzr fast-export \
+                               --export-marks="$(pwd)/bfe-marks" "$1" 3>&- || _e1=$?
+                       echo $_e1 >&3
+               } | \
+               {
+                       _e2=0
+                       git fast-import \
+                               --export-marks="$(pwd)/gfi-marks" \
+                               --export-pack-edges="$(pwd)/gfi-packs" \
+                               --force 3>&- || _e2=$?
+                       echo $_e2 >&3
+               }
        )
        EOT
        exec 3>&-
@@ -145,6 +169,8 @@ case "$url" in
                httpurl="http://${url#darcs://}"
                # Remove any left-over .darcs dirs from a previous failed attempt
                rm -rf *.darcs
+               # Remove any left-over export files from a previous failed attempt
+               rm -f dfe-marks gfi-marks gfi-packs
                git_darcs_fetch "$httpurl"
                ;;
        bzr://*)
@@ -152,6 +178,8 @@ case "$url" in
                # we just remove bzr:// here, a typical bzr url is just
                # "lp:foo"
                bzrurl="${url#bzr://}"
+               # Remove any left-over export files from a previous failed attempt
+               rm -f bfe-marks gfi-marks gfi-packs
                git_bzr_fetch "$bzrurl"
                ;;
        hg+http://* | hg+https://*)
@@ -161,7 +189,7 @@ case "$url" in
                # Remove any left-over repo.hg dir from a previous failed attempt
                rm -rf repo.hg
                # Remove any left-over export files from a previous failed attempt
-               rm -f hg2git-heads hg2git-mapping hg2git-marks* hg2git-state
+               rm -f gfi-packs hg2git-heads hg2git-mapping hg2git-marks* hg2git-state
                # Perform the initial hg clone
                hg clone -U "$hgurl" "$(pwd)/repo.hg"
                # Do the fast-export | fast-import