ppg-apply: use flock_multi to rate-limit puppet runs
[puppet-git.git] / flock_multi
blob0004f3d5340711c4ab7be6d63f32330e6b5c2a48
#!/bin/bash
#
# flock_multi - run a command while holding one of N numbered flock(1)
# locks that share a common name.

# Default to an uncommon flockerr because we need to
# differentiate from cmd error to avoid bogus retries.
# flock's default of 1 would lead to frequent misfires
flockerr=254

# Seconds to sleep between rounds when every lock is taken (-s overrides).
sleeptime=60

# Directory holding the lock files (<lockdir>/<lockname>.<n>).
lockdir=${FLOCK_MULTI_DIR:-"/mnt/cluster/lock"}
# Directory holding optional per-lock concurrency overrides
# (<lockconfdir>/<lockname> contains the number of locks to use).
lockconfdir=${FLOCK_MULTI_CONF_DIR:-"/mnt/cluster/conf/lock"}
#######################################
# Print a short usage summary.
# Arguments: none
# Outputs:   usage text on stdout (callers redirect it if needed)
# Returns:   0
#######################################
usage() {
  printf '%s\n' \
    "Usage:" \
    "" \
    " flock_multi [-h] [-v] [-E 200] [-s 20] [-w 60m ] heavy 4 heavyscript" \
    "Notes:" \
    " -w accepts m and h suffixes "
}
#######################################
# Abort the script once the wait deadline has passed.
# Globals:   timeout  (read) - deadline as epoch seconds; empty/unset
#                              means "wait forever"
#            flockerr (read) - exit code used when giving up
# Arguments: none
# Outputs:   "flock_multi timeout" on stderr when the deadline passed
# Returns:   0 when there is no deadline or it has not been reached;
#            otherwise exits the shell with $flockerr
#######################################
maybe_timeout() {
  local now
  if [ -n "${timeout:-}" ]; then
    now=$(date +%s)
    # Plain [ ] on purpose: it compares integers without evaluating the
    # operands as arithmetic expressions.
    if [ "${now}" -gt "${timeout}" ]; then
      echo "flock_multi timeout" >&2
      exit "${flockerr}"
    fi
  fi
}
#######################################
# Test whether a string is a non-negative decimal integer.
# Arguments: $1 - string to test
# Returns:   0 if $1 consists solely of one or more digits, 1 otherwise
#            (empty, signed, padded or multi-word values are rejected)
#######################################
is_number() {
  # Match in-shell: no fork, and the value is never subject to word
  # splitting or glob expansion (an argument of '*' must not be replaced
  # by file names that happen to be numeric).
  [[ "${1-}" =~ ^[[:digit:]]+$ ]]
}
# ---- Command line parsing -------------------------------------------
# Layout: [options] <lockname> <default-concurrency> <command> [args...]

# Called with nothing at all: just show the help text.
if [ "$#" -eq 0 ]; then
  usage
  exit
fi

# Consume leading options; stop at the first non-option word.
while [ "$#" -gt 0 ]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    -v|--verbose)
      verbose=y
      shift
      ;;
    -E|--conflict-exit-code|-s|--sleeptime|-w|--wait|--timeout)
      # These options take a value; refuse to run if it is missing
      # instead of silently shifting nothing.
      if [ "$#" -lt 2 ]; then
        echo >&2 "ERROR: Option $1 requires an argument"
        usage >&2
        exit 1
      fi
      case "$1" in
        -E|--conflict-exit-code) flockerr=$2 ;;
        -s|--sleeptime)          sleeptime=$2 ;;
        *)                       timeout=$2 ;;
      esac
      shift 2
      ;;
    --)
      # Explicit end of options.
      shift
      break
      ;;
    -*)
      echo >&2 "ERROR: Unknown option $1"
      usage >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done

# We need at least: lock name, default concurrency, and a command.
# This is a usage error, so report failure rather than exiting 0.
if [ "$#" -lt 3 ]; then
  usage >&2
  exit 1
fi

lockname=$1
locknr=$2
shift 2
# "$@" now holds the command to run under the lock.
# ---- Parameter validation -------------------------------------------

if ! is_number "${sleeptime}"; then
  echo "Bad sleeptime - should be a number, in seconds" >&2
  exit 1
fi
# Force base 10 so a value such as "090" is not parsed as octal later.
sleeptime=$(( 10#${sleeptime} ))

if ! is_number "${flockerr}"; then
  echo "Bad conflict-exit-code - should be a number" >&2
  exit 1
fi

if [ -n "${timeout}" ]; then
  # Split off an optional 'm' (minutes) or 'h' (hours) suffix. The
  # numeric part is validated BEFORE any arithmetic so that arbitrary
  # text is never evaluated as an arithmetic expression.
  timeout_mult=1
  case "${timeout}" in
    *m) timeout_mult=60;   timeout=${timeout%m} ;;
    *h) timeout_mult=3600; timeout=${timeout%h} ;;
  esac
  if ! is_number "${timeout}"; then
    echo "Bad timeout - should be a number, with optional 'm' and 'h' suffixes" >&2
    exit 1
  fi
  # translate timeout to a timeout target time (epoch seconds)
  timeout=$(( $(date +%s) + 10#${timeout} * timeout_mult ))
fi

if ! is_number "${locknr}"; then
  echo "Bad default concurrency parameter" >&2
  exit 1
fi
# Base path of the lock files; the slot number is appended as ".<n>".
lockfile="${lockdir}/${lockname}"

# Get the concurrency from the conf file if there is one; otherwise keep
# the value given on the command line.
if [ -f "${lockconfdir}/${lockname}" ]; then
  locknr_tmp=$(<"${lockconfdir}/${lockname}")
  # Strip surrounding whitespace/newlines. read returns non-zero at EOF
  # because no NUL delimiter is found; that is expected here.
  read -r -d '' locknr_tmp <<<"${locknr_tmp}" || true
  # tolerate empty file - but barf on bad value
  if [ -n "${locknr_tmp}" ]; then
    if ! is_number "${locknr_tmp}"; then
      echo "Bad concurrency parameter in ${lockconfdir}/${lockname}" >&2
      exit 1
    fi
    locknr=${locknr_tmp}
  fi
fi

if [ "${verbose}" = 'y' ]; then
  echo "Using ${locknr} locks"
fi

## Is there a benefit to this?
## maybe the subshell pid is good enough
## after all, we'll use this to kill it...
parent_pid=$$
# ---- Main loop ------------------------------------------------------
# Try every lock slot in random order. The first slot we can lock runs
# the command and its exit status becomes ours. If every slot is busy,
# sleep (with some splay) and start over, until the optional deadline.
while true; do
  for trylock in $(seq 1 "${locknr}" | sort --random-sort); do
    slot="${lockfile}.${trylock}"
    (
      if ! flock -E "${flockerr}" --nb 200; then
        exit "${flockerr}"
      fi
      if [ "${verbose}" = 'y' ]; then
        echo "Got ${slot}"
      fi
      # We own the slot now: clear whatever the previous holder left,
      # then record who we are and what we are running.
      : >"${slot}"
      echo "$(hostname) PID: $parent_pid epoch: $(date +%s) $(date -u)" >&200
      echo "$@" >&200
      "$@"
      ret=$?

      # cleanup - we use the filepath instead of the FD to truncate
      echo >"${slot}"

      exit $ret
    ) 200>>"${slot}"
    # The fd is opened in append mode on purpose: opening with '>' would
    # truncate the file on every attempt, erasing the current holder's
    # info even when we fail to get the lock.

    res=$?
    # Anything other than "lock busy" means the command ran (or the
    # subshell failed for another reason): pass that status on.
    if [ "${res}" -ne "${flockerr}" ]; then
      exit "${res}"
    fi
    if [ "${verbose}" = 'y' ]; then
      echo "Bounced at ${slot}"
    fi
  done
  # all locks taken
  maybe_timeout

  # add a +/- 5% splay
  if [ "${sleeptime}" -gt 20 ]; then
    actual_sleeptime=$(( sleeptime + RANDOM % (sleeptime / 10) - sleeptime / 20 ))
  else
    actual_sleeptime=${sleeptime}
  fi
  if [ "${verbose}" = 'y' ]; then
    echo "Waiting ${actual_sleeptime}"
  fi
  sleep "${actual_sleeptime}"
  maybe_timeout
done