From a7ea68a8a80e43d5094ef0b321f1e052b86e4aaf Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Fri, 2 Oct 2015 18:52:12 -0700 Subject: [PATCH] gc: retain recently unreferenced objects for 1 day Previously we were running git repack with "-a -d" options which results in unreachable objects in the pack being removed immediately. Immediately thereafter we were running git prune with no options which follows up by immediately removing any unreachable loose objects. In a perfect world with bug-free code and no race conditions this would be ideal and never cause problems. However, anytime we allow simultaneous gc to run while any other repository activity is occurring (one or more pushes, ref updates being sent out etc.) there's a window where gc could remove the objects we're interested in (an old, unreachable ref for comparison purposes or perhaps because we're resurrecting it with a push) before we've had a chance to access them. Attempt to avoid this situation by arranging for objects to live on for 24 hours after they've most recently become unreachable. We do this by running git repack with "-A -d" to unpack unreachable objects and we arrange for them to get the current time as their modification time rather than the pack's mod time. We also arrange for loose objects that have become unreferenced to acquire the modification time of their most recent unreferencing. Then we run git prune with "--expire 1_day_ago" and that combined with Git v2.2.0's d3038d22 (prune: keep objects reachable from recent objects) allows objects and whatever they reference to stay alive for at least 24 hours after they've most recently become unreferenced. Signed-off-by: Kyle J. 
McKay --- hooks/pre-receive | 25 ++++++++++++++++++--- jailsetup.sh | 1 + jobd/gc.sh | 36 +++++++++++++++++++++++++++--- jobd/update.sh | 67 +++++++++++++++++++++++++++++++++++++++---------------- shlib.sh | 3 ++- 5 files changed, 106 insertions(+), 26 deletions(-) diff --git a/hooks/pre-receive b/hooks/pre-receive index 2950a2d..5b24e3f 100755 --- a/hooks/pre-receive +++ b/hooks/pre-receive @@ -8,7 +8,26 @@ set -e git config gitweb.lastreceive "$(date '+%a, %d %b %Y %T %z')" -# Read the incoming refs, but just ignore them -while read line; do - : +# Read the incoming refs and freshen old loose objects +# If we waited until post-receive a gc could have already nuked them +# We freshen the new ref in case it's being resurrected to protect it from gc +# We probably do not need to do it for new refs as Git tries to do that, +# but since we're already doing it for old refs (which Git does not do), +# it's almost no extra work for new refs, just in case. +while read -r old new ref; do + oldp= + newp= + if [ "$old" != "0000000000000000000000000000000000000000" ]; then + # freshen mod time on recently unref'd loose objects + fn="${old#??}" + shard="${old%$fn}" + oldp="objects/$shard/$fn" + fi + if [ "$new" != "0000000000000000000000000000000000000000" ]; then + # prevent imminent pruning of a ref being resurrected + fn="${new#??}" + shard="${new%$fn}" + newp="objects/$shard/$fn" + fi + touch -c -m $oldp $newp || : done diff --git a/jailsetup.sh b/jailsetup.sh index 23b81f3..bf728e6 100755 --- a/jailsetup.sh +++ b/jailsetup.sh @@ -276,6 +276,7 @@ pull_in_bin /bin/sh bin pull_in_bin /bin/date bin pull_in_bin /bin/mv bin pull_in_bin /bin/rm bin +pull_in_bin /usr/bin/touch bin pull_in_bin /usr/sbin/sshd sbin # ...and the bits of git we need, diff --git a/jobd/gc.sh b/jobd/gc.sh index db57630..15272f9 100755 --- a/jobd/gc.sh +++ b/jobd/gc.sh @@ -368,12 +368,30 @@ repack_gfi_packs touch .gc_in_progress rm -f .gc_failed bundles/* rm -f objects/pack/pack-*.bndl +# We use the -A 
option with git repack so that unreachable objects can live +# on for a time as loose objects. This is particularly helpful if we just +# happen to be in the process of sending out a ref update for a ref that was +# force updated and the old ref value would have otherwise been removed by +# repack because it was now unreachable. Admittedly the window for gc to run +# and do that before we manage to send out the ref update is not large, but +# it would not be difficult to create such a situation. Unfortunately, when +# Git unpacks these unreachable objects it will give them the modification +# time of the *.pack file they came out of. This could be very, very old. +# If that happens, the subsequent git prune --expire some_time_ago will still +# remove the object(s) and our pending ref update will still lose out. +# To prevent this from happening and to get the behavior we want, we now +# touch the modification time of all pack-<sha1>.pack files so that any +# loosened objects get a current time. Git does not provide any other +# mechanism to do this. We do not want to just touch all loose objects +# left after the repack because that would cause objects that were loosened +# previously to live on which we definitely do not want. +touch -c -m objects/pack/pack-$octet20.pack # The git repack command may issue a 'disabling bitmap' warning for some # repositories. This is perfectly normal and should be suppressed unless # show_progress is set. Unfortunately that means we have to grep -v the # output. And furthermore, since it's a translated message, we have to # force the language to english to be sure we do it. 
-repackcmd="git repack $packopts -a -d -l $quiet $@" +repackcmd="git repack $packopts -A -d -l $quiet $@" [ -n "$show_progress" ] || \ repackcmd="{ LC_ALL=C $repackcmd 2>&1 || touch .gc_failed; } | grep -v 'disabling bitmap' || :" eval "$repackcmd" @@ -387,8 +405,20 @@ eval "reposizek=$(( $(echo 0 $(du -k $pkrf $allpacks 2>/dev/null | awk '{print $ # The git prune command does not take a -q or --quiet but started outputting # 'Checking connectivity' progress messages in v1.7.9. However, we can # suppress those by piping through cat as it only activates the progress -# messages when stderr is a tty. -prunecmd='git prune' +# messages when stderr is a tty. We only expire loose objects older than one +# day just in case there's some pending action (such as sending out a ref +# update) in progress that might want to examine them. This may leave us with +# loose objects. That's okay because at the next gc interval, we will always +# run gc if we see any loose objects regardless of whether or not we've seen +# any updates or we've received new linked objects from our parent. Note that +# in order to keep loose objects that just recently became unreferenced but +# have a very old modification date around we rely on some help from both the +# update.sh and hooks/pre-receive scripts. Furthermore, since Git v2.2.0 +# (d3038d22 prune: keep objects reachable from recent objects) an unreachable +# object that would otherwise be pruned (because it's too old) will be kept +# alive by an unreachable object that refers to it that's not old enough to +# be pruned yet. +prunecmd='git prune --expire 1_day_ago' [ -n "$show_progress" ] || \ prunecmd="{ $prunecmd 2>&1 || touch .gc_failed; } | cat" eval "$prunecmd" diff --git a/jobd/update.sh b/jobd/update.sh index 8ce502e..5aa0789 100755 --- a/jobd/update.sh +++ b/jobd/update.sh @@ -12,6 +12,19 @@ fi # date -R is linux-only, POSIX equivalent is '+%a, %d %b %Y %T %z' datefmt='+%a, %d %b %Y %T %z' +# freshen_loose_objects full-sha ... 
+# if "$n" is a loose object, set its modification time to now +# otherwise silently do nothing with no error +freshen_loose_objects() { + _list= + for _sha; do + _fn="${_sha#??}" + _shard="${_sha%$_fn}" + _list="$_list objects/$_shard/$_fn" + done + [ -z "$_list" ] || touch -c -m $_list || : +} + # darcs fast-export | git fast-import with error handling git_darcs_fetch() ( set_utf8_locale @@ -268,25 +281,41 @@ bang_eval "LC_ALL=C sort -k1b,1 <.refs-temp >.refs-after" sockpath="$cfg_chroot/etc/taskd.socket" if ! cmp -s .refs-before .refs-after; then bang config_set lastreceive "$(date '+%a, %d %b %Y %T %z')" - if [ -S "$sockpath" ]; then - ( - trap ':' PIPE - echo "ref-changes %$proj% $proj" || : - LC_ALL=C join -j 1 .refs-before .refs-after | - while read ref old new; do - [ "$old" != "$new" ] || continue - echo "$old $new $ref" || : - done - LC_ALL=C join -j 1 -v 1 .refs-before .refs-after | - while read ref old; do - echo "$old 0000000000000000000000000000000000000000 $ref" || : - done - LC_ALL=C join -j 1 -v 2 .refs-before .refs-after | - while read ref new; do - echo "0000000000000000000000000000000000000000 $new $ref" || : - done - ) 2>/dev/null | { nc_openbsd -w 1 -U "$sockpath" || :; } - fi + # We freshen the mod time to now on any old or new ref that is a loose object + # For old refs we do it so we will be able to keep them around for 1 day + # For new refs we do it in case we are about to run gc and the new ref + # actually points to an oldish loose object that had been unreachable + # We probably do not need to do it for new refs as Git tries to do that, + # but since we're already doing it for old refs (which Git does not do), + # it's almost no extra work for new refs, just in case. 
+ ( + trap ':' PIPE + echo "ref-changes %$proj% $proj" || : + LC_ALL=C join -j 1 .refs-before .refs-after | + while read ref old new; do + [ "$old" != "$new" ] || continue + freshen_loose_objects "$old" "$new" + echo "$old $new $ref" || : + done + LC_ALL=C join -j 1 -v 1 .refs-before .refs-after | + while read ref old; do + freshen_loose_objects "$old" + echo "$old 0000000000000000000000000000000000000000 $ref" || : + done + LC_ALL=C join -j 1 -v 2 .refs-before .refs-after | + while read ref new; do + freshen_loose_objects "$new" + echo "0000000000000000000000000000000000000000 $new $ref" || : + done + ) 2>/dev/null | { + if [ -S "$sockpath" ]; then + nc_openbsd -w 1 -U "$sockpath" || : + else + while read -r line; do + : + done + fi + } bang config_set lastchange "$(date '+%a, %d %b %Y %T %z')" bang_eval "git for-each-ref --sort=-committerdate --format='%(committerdate:iso8601)' \ --count=1 refs/heads > info/lastactivity" diff --git a/shlib.sh b/shlib.sh index 19e75a3..6e2dd1f 100644 --- a/shlib.sh +++ b/shlib.sh @@ -3,11 +3,12 @@ # This is generic shell library for all the scripts used by Girocco; # most importantly, it introduces all the $cfg_* shell variables. -# SHA-1 pattern +# SHA-1 patterns octet='[0-9a-f][0-9a-f]' octet4="$octet$octet$octet$octet" octet19="$octet4$octet4$octet4$octet4$octet$octet$octet" octet20="$octet4$octet4$octet4$octet4$octet4" +nullsha="0000000000000000000000000000000000000000" # tab tab="$(printf '\t')" -- 2.11.4.GIT