From 381facdd0519bc4f00fa6dd210eff1bac6a71e2a Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Thu, 25 Aug 2016 02:11:04 -0700 Subject: [PATCH] gc.sh: overhaul how forks are handled during gc If the project we are running garbage collection (gc) on has any forks, we must be careful not to remove any objects that, while no longer referenced by the project being gc'd (the parent), are still referenced by one or more forks (the children); otherwise the children will become corrupt and we can't abide corrupt children. One way to accomplish this is to simply hard-link all currently existing loose objects and packs in the parent into all the children that refer to the parent (via a line in their objects/info/alternates file) before beginning the gc operation and then rely on a subsequent gc in the child to clean up any excess objects/packs. We used to use this strategy but it's very inefficient because: 1. The disk space used by the old pack(s)/object(s) will not be reclaimed until all children (and their children, if any) run gc by which time it's quite possible the topmost parent will have run gc again and hard-linked yet another old pack down to its children (not to mention loose objects). 2. As we are now using the "-A" option with "git repack", any new objects in the parent that are not referenced by children will continually get exploded out of the hard-linked pack in the children whenever the children run gc. 3. To avoid suboptimal and/or unnecessarily many packs being hard-linked into child forks, we must run the "mini" gc maintenance before we perform the hard-linking into the children which provides yet another source of inefficiency. 
Since we are now using the "-A" option to "git repack" (that was not always the case) to guarantee we can access old ref values for long enough to send out a meaningful mail.sh notification, we now have another, more efficient, option available to prevent corruption of child forks that continue to refer to objects that are no longer reachable from any ref in the parent. The only things that need be copied (or hard-linked) into the child fork(s) are those objects that have become unreachable from any ref in the parent. They are the only things that could ever be removed by "git prune" and therefore the only things we need to prevent the loss of in order to avoid corruption of the child fork(s). Therefore change the way we handle forks during gc to now use the following strategy instead to avoid excessive disk use and lots of unnecessary loose objects in child forks: 1. Run "git repack -A -d -l" in the parent BEFORE doing anything about child forks. 2. Collect all remaining existing loose objects in the parent into a single pack BEFORE running "git prune" in the parent and if it's not empty then hard-link that single pack into the immediate children. 3. Now run "git prune" in the parent. With this new strategy we avoid the need to run any "mini" gc maintenance before copying (or hard-linking) anything down to the child forks. Furthermore, only when the parent performs a non-fast-forward update will anything ever be transferred to the children leaving them unperturbed in the vast majority of cases. Finally, even if the parent references objects the children do not, those objects will no longer continually end up in the children as unreachable loose objects after the children run gc. Signed-off-by: Kyle J. 
McKay --- jobd/gc.sh | 192 ++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 140 insertions(+), 52 deletions(-) diff --git a/jobd/gc.sh b/jobd/gc.sh index c8cd8b5..42bc3cd 100755 --- a/jobd/gc.sh +++ b/jobd/gc.sh @@ -322,6 +322,7 @@ remove_crud() { proj="${1%.git}" shift cd "$cfg_reporoot/$proj.git" +[ -d objects/pack ] || { rm -f gfi-packs; mkdir -p objects/pack; } mirror_url="$(get_mirror_url)" svn_mirror= ! is_svn_mirror_url "$mirror_url" || svn_mirror=1 @@ -517,60 +518,88 @@ if [ -z "$newdeltas" ] && [ -n "$noreusedeltaopt" ] && \ # There aren't enough objects to worry about so just redelta to get the best pack newdeltas=-f fi -if [ -z "$newdeltas" ] || has_forks "$proj"; then - # Since we're not going to recompute deltas overall, we need to do the "mini" - # maintenance and by doing it before we copy objects down to forks we reduce - # the amount that gets sprayed into the forks' objects directories. - # If we have forks we always need to do the "mini" maintenance, even if we are - # recomputing all deltas, in order to avoid having suboptimal packs in the forks. 
- make_svn_pack +if [ -z "$newdeltas" ]; then + # Since we're not going to recompute deltas overall, we need to do the + # "mini" maintenance so that we can get more optimal deltas + [ -z "$noreusedeltaopt" ] || make_svn_pack repack_gfi_packs - combine_small_packs 1 + force_single_pack_redelta= + [ -n "$gfi_mirror" ] || [ -n "$svn_mirror" ] || force_single_pack_redelta=1 + [ -z "$noreusedeltaopt" ] || combine_small_packs $force_single_pack_redelta fi -# safe pruning: we put all our objects in all forks, then we can -# safely get rid of extra ones; repacks in forks will get rid of -# the redundant ones again then; we carefully grab only loose -# objects and pack .idx and .pack files -forkdir="$proj" -if [ -d "../${forkdir##*/}" ]; then - # It is enough to copy objects just one level down and get_repo_list - # takes a regular expression (which is automatically prefixed with '^') - # so we can easily match forks exactly one level down from this project - get_repo_list "$forkdir/[^/]*:" | - while read fork; do - # Ignore forks that do not exist or are symbolic links - [ ! -L "$cfg_reporoot/$fork.git" -a -d "$cfg_reporoot/$fork.git" ] || \ - continue - # Or do not have a non-zero length alternates file - [ -s "$cfg_reporoot/$fork.git/objects/info/alternates" ] || \ - continue - # Match objects in parent project - for d in objects/??; do - [ "$d" != "objects/??" ] || continue - mkdir -p "$cfg_reporoot/$fork.git/$d" - ln -f "$d"/* "$cfg_reporoot/$fork.git/$d" || : - done - # Match packs in parent project - mkdir -p "$cfg_reporoot/$fork.git/objects/pack" - if [ "$(echo objects/pack/pack-*.idx)" != \ - "objects/pack/pack-*.idx" ]; then - ln -f objects/pack/pack-*.pack "$cfg_reporoot/$fork.git/objects/pack" || : - ln -f objects/pack/pack-*.idx "$cfg_reporoot/$fork.git/objects/pack" || : - if ! 
[ -e "$cfg_reporoot/$fork.git/.needsgc" ]; then - # Trigger a mini gc in the fork if it now has too many packs - packs="$(list_packs --quiet --count --exclude-no-idx "$cfg_reporoot/$fork.git/objects/pack" || :)" - if [ -n "$packs" ] && [ "$packs" -ge 20 ]; then - >"$cfg_reporoot/$fork.git/.needsgc" - fi - fi - git --git-dir="$cfg_reporoot/$fork.git" update-server-info - fi - # Update the fork's lastparentgc date (must be current, not $gcstart) - git --git-dir="$cfg_reporoot/$fork.git" config \ - gitweb.lastparentgc "$(date "$datefmt")" - done -fi +# +## Safe Pruning In Forks +## +## We are about to perform garbage collection. We do NOT use the "git gc" +## command directly as it does not provide enough control over the fine details +## that we require. However, we DO maintain a "gc.pid" file during our garbage +## collection so that a simultaneous "git gc" by an administrator will be +## blocked (and similarly we refuse to start garbage collection if we cannot +## create the "gc.pid" file). When we say "gc" in the below description we are +## referring to our "gc.sh" script, NOT the "git gc" command. +## +## If the project we are running garbage collection (gc) on has any forks we +## must be careful not to remove any objects that while no longer referenced by +## this project (the parent) are still referenced by one or more forks (the +## children) otherwise the children will become corrupt and we can't abide +## corrupt children. +## +## One way to accomplish this is to simply hard-link all currently existing +## loose objects and packs in the parent into all the children that refer to the +## parent (via a line in their objects/info/alternates file) before beginning +## the gc operation and then relying on a subsequent gc in the child to clean up +## any excess objects/packs. We used to use this strategy but it's very +## inefficient because: +## +## 1. 
The disk space used by the old pack(s)/object(s) will not be reclaimed +## until all children (and their children, if any) run gc by which time +## it's quite possible the topmost parent will have run gc again and +## hard-linked yet another old pack down to its children (not to mention +## loose objects). +## +## 2. As we are now using the "-A" option with "git repack", any new objects +## in the parent that are not referenced by children will continually get +## exploded out of the hard-linked pack in the children whenever the +## children run gc. +## +## 3. To avoid suboptimal and/or unnecessarily many packs being hard-linked +## into child forks, we must run the "mini" gc maintenance before we +## perform the hard-linking into the children which provides yet another +## source of inefficiency. +## +## Since we are using the "-A" option to "git repack" (that was not always the +## case) to guarantee we can access old ref values for long enough to send out +## a meaningful mail.sh notification, we now have another, more efficient, +## option available to prevent corruption of child forks that continue to refer +## to objects that are no longer reachable from any ref in the parent. +## +## The only things that need be copied (or hard-linked) into the child fork(s) +## are those objects that have become unreachable from any ref in the parent. +## They are the only things that could ever be removed by "git prune" and +## therefore the only things we need to prevent the loss of in order to avoid +## corruption of the child fork(s). +## +## Therefore we now use the following strategy instead to avoid excessive disk +## use and lots of unnecessary loose objects in child forks: +## +## 1. Run "git repack -A -d -l" in the parent BEFORE doing anything about +## child forks. +## +## 2. 
Collect all remaining existing loose objects in the parent into a +## single pack BEFORE running "git prune" and if it's not empty then +## hard-link that single pack into the immediate children. +## +## 3. Now run "git prune" in the parent. +## +## With this new strategy we avoid the need to run any "mini" gc maintenance +## before copying (or hard-linking) anything down to the child forks. +## Furthermore, only when the parent performs a non-fast-forward update will +## anything ever be transferred to the children leaving them unperturbed in the +## vast majority of cases. Finally, even if the parent references objects the +## children do not, those objects will no longer continually end up in the +## children as unreachable loose objects after the children run gc. +# git pack-refs --all touch .gc_in_progress @@ -623,12 +652,72 @@ pkrf= [ ! -e packed-refs ] || pkrf=packed-refs eval "reposizek=$(( $(echo 0 $(du -k $pkrf $allpacks 2>/dev/null | LC_ALL=C awk '{print $1}') | \ LC_ALL=C sed -e 's/ / + /g') ))" +git update-server-info # The -A option to `git repack` may have caused some loose objects to pop # out of their packs. We must make these objects group writable so that they # can be freshened by other pushers. Technically we need only do this for # push projects but to enable mirror projects to be more easily converted to # push projects, we go ahead and do it for all projects. 
{ find objects/$octet -type f -name "$octet19" -print0 | xargs -0 chmod ug+w || :; } 2>/dev/null + +if has_forks "$proj"; then + # Pack up all the loose objects and copy (actually hard link) them into all the forks + lpacks="$(find objects/$octet -maxdepth 1 -type f -name "$octet19" -print 2>/dev/null | + LC_ALL=C awk -F / '{print $2 $3}' | + run_combine_packs --objects --names $packopts --incremental --all-progress-implied $quiet --non-empty)" || { + >.gc_failed + exit 1 + } + # We have to update the lastparentgc time in the child forks even if they do not get any + # new "loose objects" pack(s) because they need to run gc just in case the parent now has + # some objects that used to only be in the child so they can be removed from the child. + # For example, a "patch" might be developed first in a fork and then later accepted into + # the parent in which case the objects making up the patch in the child fork are now + # redundant (since they're now in the parent as well) and need to be removed from the + # child fork which can only happen if the child fork runs gc. + forkdir="$proj" + # It is enough to copy objects just one level down and get_repo_list + # takes a regular expression (which is automatically prefixed with '^') + # so we can easily match forks exactly one level down from this project + get_repo_list "$forkdir/[^/]*:" | + while read fork; do + # Ignore forks that do not exist or are symbolic links + [ ! 
-L "$cfg_reporoot/$fork.git" -a -d "$cfg_reporoot/$fork.git" ] || \ + continue + # Or do not have a non-zero length alternates file + [ -s "$cfg_reporoot/$fork.git/objects/info/alternates" ] || \ + continue + if [ -n "$lpacks" ]; then + # Install the "loose objects" pack(s) into the fork + [ -d "$cfg_reporoot/$fork.git/objects/pack" ] || ( + cd "$cfg_reporoot/$fork.git" && \ + mkdir -p objects/pack + ) + for lpack in $lpacks; do + ln -f objects/pack/"pack-$lpack.pack" objects/pack/"pack-$lpack.idx" \ + "$cfg_reporoot/$fork.git/objects/pack/" || : + done + if ! [ -e "$cfg_reporoot/$fork.git/.needsgc" ]; then + # Trigger a mini gc in the fork if it now has too many packs + packs="$(list_packs --quiet --count --exclude-no-idx "$cfg_reporoot/$fork.git/objects/pack" || :)" + if [ -n "$packs" ] && [ "$packs" -ge 20 ]; then + >"$cfg_reporoot/$fork.git/.needsgc" + fi + fi + git --git-dir="$cfg_reporoot/$fork.git" update-server-info + fi + # Update the fork's lastparentgc date (must be current, not $gcstart) + git --git-dir="$cfg_reporoot/$fork.git" config \ + gitweb.lastparentgc "$(date "$datefmt")" + done + if [ -n "$lpacks" ]; then + # Remove the "loose objects" pack(s) from the parent + for lpack in $lpacks; do + rm -f objects/pack/"pack-$lpack.idx" objects/pack/"pack-$lpack.pack" + done + fi +fi + # The git prune command does not take a -q or --quiet but started outputting # 'Checking connectivity' progress messages in v1.7.9. However, we can # suppress those by piping through cat as it only activates the progress @@ -650,7 +739,6 @@ prunecmd='git prune --expire 1_day_ago' prunecmd="{ $prunecmd 2>&1 || touch .gc_failed; } | cat" eval "$prunecmd" [ ! -e .gc_failed ] || exit 1 -git update-server-info # darcs:// mirrors have a xxx.log file that will grow endlessly # if this is a mirror and the file exists, shorten it to 10000 lines -- 2.11.4.GIT