From: Kyle J. McKay Date: Thu, 17 Apr 2014 23:59:23 +0000 (-0700) Subject: mirroring: reduce bloat caused by mirrors using git fast-import X-Git-Url: https://repo.or.cz/w/girocco.git/commitdiff_plain/5d8d5126524436be88adad78d0e05b240e5206a9 mirroring: reduce bloat caused by mirrors using git fast-import When mirroring is enabled, several of the foreign VCS mirror types make use of git fast-import to import revisions from the foreign VCS. Unfortunately, by design, git fast-import creates horrific packs. They have a poor object order and have poor object deltas. This creates unnecessary bloat. The bloat can be more than 4x! While a standard git repack -a -d will correct the object order, it will not correct the bloat since that's primarily caused by poor object deltas. To fix this, arrange for any git fast-import created packs to be repacked with the --no-reuse-delta option before they participate in the normal repack that combines everything but does not use the --no-reuse-delta option. Only do this at normal gc time. --- diff --git a/jobd/gc.sh b/jobd/gc.sh index 698f3ff..8adca30 100755 --- a/jobd/gc.sh +++ b/jobd/gc.sh @@ -5,6 +5,9 @@ set -e trap 'if [ $? != 0 ]; then echo "gc failed dir: $PWD" >&2; fi; rm -f "$bang_log"' EXIT +# packing options +packopts='--window=50 --window-memory=1g --depth=50' + umask 002 [ "$cfg_permission_control" != "Hooks" ] || umask 000 @@ -34,6 +37,25 @@ createlock() { return 1 } +# if the current directory is_gfi_mirror then repack all packs listed in gfi-packs +repack_gfi_packs() { + is_gfi_mirror || return 0 + [ -s gfi-packs ] || return 0 + while IFS=': ' read -r _pack _junk; do + if [ -s "$_pack" -a -s "${_pack%.pack}.idx" ]; then + git show-index < "${_pack%.pack}.idx" | cut -d ' ' -f 2 + fi + done < gfi-packs | \ + git pack-objects $packopts --no-reuse-delta --threads=1 $quiet objects/pack/packtmp | \ + while read -r _newpack; do + rm -f objects/pack/pack-$_newpack.* + ln objects/pack/packtmp-$_newpack.pack objects/pack/pack-$_newpack.pack + ln objects/pack/packtmp-$_newpack.idx objects/pack/pack-$_newpack.idx + rm -f objects/pack/packtmp-$_newpack.* + done + rm -f gfi-packs +} + proj="$1" cd "$cfg_reporoot/$proj.git" @@ -113,7 +135,8 @@ fi quiet=; [ -n "$show_progress" ] || quiet=-q git pack-refs --all -git repack -a -d --window=50 --window-memory=1G --depth=50 -l $quiet +repack_gfi_packs +git repack $packopts -a -d -l $quiet git prune git update-server-info # darcs:// mirrors have a xxx.log file that will grow endlessly diff --git a/jobd/update.sh b/jobd/update.sh index 56c7bb5..d9dc4d5 100755 --- a/jobd/update.sh +++ b/jobd/update.sh @@ -16,8 +16,22 @@ git_darcs_fetch() { { read -r _err1 || :; read -r _err2 || :; } <<-EOT $( exec 4>&3 3>&1 1>&4 4>&- - { _e1=0; "$cfg_basedir"/bin/darcs-fast-export --export-marks=$(pwd)/dfe-marks --import-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \ - { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks --import-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; } + { + _e1=0 + "$cfg_basedir"/bin/darcs-fast-export \ + --export-marks="$(pwd)/dfe-marks" \ + --import-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$? + echo $_e1 >&3 + } | \ + { + _e2=0 + git fast-import \ + --export-marks="$(pwd)/gfi-marks" \ + --export-pack-edges="$(pwd)/gfi-packs" \ + --import-marks="$(pwd)/gfi-marks" \ + --force 3>&- || _e2=$? + echo $_e2 >&3 + } ) EOT exec 3>&- @@ -33,8 +47,22 @@ git_bzr_fetch() { { read -r _err1 || :; read -r _err2 || :; } <<-EOT $( exec 4>&3 3>&1 1>&4 4>&- - { _e1=0; bzr fast-export --export-marks=$(pwd)/dfe-marks --import-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \ - { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks --import-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; } + { + _e1=0 + bzr fast-export \ + --export-marks="$(pwd)/dfe-marks" \ + --import-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$? + echo $_e1 >&3 + } | \ + { + _e2=0 + git fast-import \ + --export-marks="$(pwd)/gfi-marks" \ + --export-pack-edges="$(pwd)/gfi-packs" \ + --import-marks="$(pwd)/gfi-marks" \ + --force 3>&- || _e2=$? + echo $_e2 >&3 + } ) EOT exec 3>&- diff --git a/shlib.sh b/shlib.sh index 373e267..e19b8bf 100644 --- a/shlib.sh +++ b/shlib.sh @@ -222,6 +222,33 @@ get_repo_list() { done } +# returns true if the passed in git dir (defaults to ".") is a mirror using git fast-import +is_gfi_mirror() { + _gitdir="${1-.}" + # always return false for non-mirrors + [ ! -e "$_gitdir/.nofetch" ] || return 1 + _url="$(GIT_DIR="$_gitdir" config_get baseurl 2>/dev/null || :)" + case "$_url" in + svn://* | svn+http://* | svn+https://*) + # Don't think git-svn currently uses git fast-import + return 1 + ;; + darcs://*) + # darcs mirrors use git fast-import + return 0 + ;; + bzr://*) + # bzr mirrors use git fast-import + return 0 + ;; + hg+http://* | hg+https://*) + # hg mirrors use git fast-import + return 0 + ;; + esac + # assume it does not use git fast-import + return 1 +} # hg-fast-export | git fast-import with error handling in current directory GIT_DIR git_hg_fetch() { @@ -250,7 +277,10 @@ git_hg_fetch() { { _e2=0 rm -f hg2git-marks.new - git fast-import --force --export-marks="$(pwd)/hg2git-marks.new" 3>&- || _e2=$? + git fast-import \ + --export-marks="$(pwd)/hg2git-marks.new" \ + --export-pack-edges="$(pwd)/gfi-packs" \ + --force 3>&- || _e2=$? echo $_e2 >&3 } ) diff --git a/taskd/clone.sh b/taskd/clone.sh index 9e2ec94..2facaf8 100755 --- a/taskd/clone.sh +++ b/taskd/clone.sh @@ -12,8 +12,20 @@ git_darcs_fetch() { { read -r _err1 || :; read -r _err2 || :; } <<-EOT $( exec 4>&3 3>&1 1>&4 4>&- - { _e1=0; "$cfg_basedir"/bin/darcs-fast-export --export-marks=$(pwd)/dfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \ - { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; } + { + _e1=0 + "$cfg_basedir"/bin/darcs-fast-export \ + --export-marks="$(pwd)/dfe-marks" "$1" 3>&- || _e1=$? + echo $_e1 >&3 + } | \ + { + _e2=0 + git fast-import \ + --export-marks="$(pwd)/gfi-marks" \ + --export-pack-edges="$(pwd)/gfi-packs" \ + --force 3>&- || _e2=$? + echo $_e2 >&3 + } ) EOT exec 3>&- @@ -29,8 +41,20 @@ git_bzr_fetch() { { read -r _err1 || :; read -r _err2 || :; } <<-EOT $( exec 4>&3 3>&1 1>&4 4>&- - { _e1=0; bzr fast-export --export-marks=$(pwd)/bfe-marks "$1" 3>&- || _e1=$?; echo $_e1 >&3; } | \ - { _e2=0; git fast-import --force --export-marks=$(pwd)/gfi-marks 3>&- || _e2=$?; echo $_e2 >&3; } + { + _e1=0 + bzr fast-export \ + --export-marks="$(pwd)/bfe-marks" "$1" 3>&- || _e1=$? + echo $_e1 >&3 + } | \ + { + _e2=0 + git fast-import \ + --export-marks="$(pwd)/gfi-marks" \ + --export-pack-edges="$(pwd)/gfi-packs" \ + --force 3>&- || _e2=$? + echo $_e2 >&3 + } ) EOT exec 3>&- @@ -145,6 +169,8 @@ case "$url" in httpurl="http://${url#darcs://}" # Remove any left-over .darcs dirs from a previous failed attempt rm -rf *.darcs + # Remove any left-over export files from a previous failed attempt + rm -f dfe-marks gfi-marks gfi-packs git_darcs_fetch "$httpurl" ;; bzr://*) @@ -152,6 +178,8 @@ case "$url" in # we just remove bzr:// here, a typical bzr url is just # "lp:foo" bzrurl="${url#bzr://}" + # Remove any left-over export files from a previous failed attempt + rm -f bfe-marks gfi-marks gfi-packs git_bzr_fetch "$bzrurl" ;; hg+http://* | hg+https://*) @@ -161,7 +189,7 @@ case "$url" in # Remove any left-over repo.hg dir from a previous failed attempt rm -rf repo.hg # Remove any left-over export files from a previous failed attempt - rm -f hg2git-heads hg2git-mapping hg2git-marks* hg2git-state + rm -f gfi-packs hg2git-heads hg2git-mapping hg2git-marks* hg2git-state # Perform the initial hg clone hg clone -U "$hgurl" "$(pwd)/repo.hg" # Do the fast-export | fast-import