From dfa5b0571276a617f25049da1fee0371651eeb57 Mon Sep 17 00:00:00 2001 From: "Kyle J. McKay" Date: Thu, 22 Feb 2018 21:26:42 -0800 Subject: [PATCH] gc: find all the dirty bits When performing garbage collection on schedule because the min_gc_interval has expired, a check is made to see if garbage collection can actually be skipped (garbage collection is a somewhat expensive undertaking after all). If no refs in the repository have been changed since the last garbage collection and we've performed garbage collection more recently than the last time any of our "parents" did (if we're a fork with alternates), then we can skip. But, if we've passed all the "skip" tests and it ought to be okay to skip garbage collection, we do one more test in that case -- we check to see if the repository is "dirty" and if so do NOT skip garbage collection after all. In the past the "dirty" check just looked for any loose objects or more than one pack. But that's not always good enough. Enhance the check when no loose objects are found but exactly one pack (excluding special ignored pack names) is found. If the single pack has an oddball name (i.e. doesn't match pack-[hexdigit]{40}.pack) or it's an okay name but the repository doesn't actually have any refs then consider it to be dirty. This check now catches interruptions of "gc.sh: the new order" (0df1370df327fbc3, 2017-12-16) that may have left behind an oddly named pack and for some reason the lastgc date fails to trigger garbage collection like it should (it shouldn't have been updated if garbage collection was interrupted prematurely). It also now catches a normal "gc.sh: the new order" completion that may have left a single, oddly named pack behind as part of normal operation (a single pack of "loose" objects for example) that does need to be processed further at the next min_gc_interval even if no refs have been changed since it was created. Signed-off-by: Kyle J. McKay --- jobd/gc.sh | 34 ++++++++++++++++++++++++++++++---- shlib.sh | 26 +++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/jobd/gc.sh b/jobd/gc.sh index 11aa74c..bd368b9 100755 --- a/jobd/gc.sh +++ b/jobd/gc.sh @@ -158,14 +158,40 @@ compact_reflogs() { } # return true if there's more than one objects/pack-.pack file or -# ANY sha-1 files in objects +# ANY sha-1 files in objects or +# there's one pack and it's not a normal pack name or +# there's one pack but not any refs is_dirty() { - _packs=$(find -L objects/pack -name "pre-auto-gc-[12].pack" -prune -o -name "*.pack" -type f -print | head -n 2 | LC_ALL=C wc -l) - if [ $_packs != 1 ] && [ $_packs != 0 ]; then + _packs="$(find -L objects/pack -name "pre-auto-gc-[12].pack" -prune -o -name "*.pack" -type f -print 2>/dev/null | head -n 2)" + vcnt _packscnt $_packs + if [ $_packscnt -gt 1 ]; then return 0 fi + if [ $_packscnt -eq 1 ]; then + # the single pack name is in $_packs + _packs="${_packs%.pack}" + _packs="${_packs#objects/pack/}" + case "$_packs" in + pack-*) + _packs="${_packs#pack-}" + if [ "${#_packs}" -lt 40 ] || [ "${_packs#*[!0-9a-fA-F]}" != "$_packs" ]; then + # name not exclusively 40 or more hexadecimal digits makes it dirty + return 0 + fi + ;; + *) + # abnormal name makes it dirty + return 0 + ;; + esac + fi _objs=$(find -L objects/$octet -name "$octet19*" -type f -print 2>/dev/null | head -n 1 | LC_ALL=C wc -l) - [ $_objs -ne 0 ] + [ $_objs -eq 0 ] || return 0 + [ $_packscnt -eq 1 ] || return 1 + # we do this check last because it's potentially the most expensive; + # at this point we know we do not have any loose objects, but we do + # have one pack that's named "normally"; empty refs => dirty + is_empty_refs_dir } # make sure combine-packs uses the correct Git executable diff --git a/shlib.sh b/shlib.sh index 86e1b1e..0447f9d 100644 --- a/shlib.sh +++ b/shlib.sh @@ -563,10 +563,11 @@ is_root() { [ "$(id -u 2>/dev/null)" = "0" ] } -# Check to see if the single argument is a Git directory +# Check to see if the single argument (default ".") is a Git directory is_git_dir() { # Just like Git's test except we ignore GIT_OBJECT_DIRECTORY # And we are slightly more picky (must be refs/.+ not refs/.*) + [ $# -ne 0 ] || set -- "." [ -d "$1/objects" ] && [ -x "$1/objects" ] || return 1 [ -d "$1/refs" ] && [ -x "$1/refs" ] || return 1 if [ -L "$1/HEAD" ]; then @@ -588,6 +589,29 @@ is_git_dir() { return 1 } +# Check to see if the single argument (default ".") is a directory with no refs +is_empty_refs_dir() { + [ $# -ne 0 ] || set -- "." + if [ -s "$1/packed-refs" ]; then + # could be a packed-refs file with just a '# pack-refs ..." line + # null hash lines and peel lines do not count either + _refcnt="$(( $(LC_ALL=C sed <"$1/packed-refs" \ + -e "/^00* /d" \ + -e "/^$octet20$hexdig* refs\/[^ $tab]*\$/!d" | wc -l) ))" + [ "${_refcnt:-0}" -eq 0 ] || return 1 + fi + if [ -d "$1/refs" ]; then + # quick and dirty check, doesn't try to validate contents + # or ignore embedded symbolic refs + _refcnt="$(( $(find -L "$1/refs" -type f -print 2>/dev/null | head -n 1 | LC_ALL=C wc -l) ))" + [ "${_refcnt:-0}" -eq 0 ] || return 1 + fi + # last chance a detached HEAD (we ignore any linked working trees though) + [ -s "$1/HEAD" ] && read -r _hr <"$1/HEAD" && [ -n "$_hr" ] || return 0 + [ "${_hr#*[!0-9a-f]}" != "$_hr" ] || [ "${_hr#*[!0]}" = "$_hr" ] || [ "${#_hr}" -lt 40 ] || return 1 + return 0 +} + # List all Git repositories, with given prefix if specified, one-per-line # All project names starting with _ are always excluded from the result get_repo_list() { -- 2.11.4.GIT