ctdb-scripts: Respect $RPCMOUNTDOPTS when restarting rpc.mountd
[Samba.git] / ctdb / config / functions
blobb2738edba891d5c7fe8d925bff387d6c84158cbc
# Hey Emacs, this is a -*- shell-script -*- !!!

# utility functions for ctdb event scripts
# Default locations used by the functions in this file.  A non-empty
# value already present in the environment is left untouched.
if [ -z "$CTDB_VARDIR" ] ; then
    if [ -d "/var/lib/ctdb" ] ; then
        CTDB_VARDIR="/var/lib/ctdb"
    else
        CTDB_VARDIR="/var/ctdb"
    fi
    export CTDB_VARDIR
fi

if [ -z "$CTDB_ETCDIR" ] ; then
    CTDB_ETCDIR="/etc"
    export CTDB_ETCDIR
fi
#######################################
# pull in a system config file, if any
#
# usage: _loadconfig [NAME]
#
# Without NAME the name is taken from $service_config or, failing
# that, $service_name, and loading restarts through loadconfig().
# For any NAME other than "ctdb" the "ctdb" configuration is loaded
# first.  The first existing file out of
#     $CTDB_ETCDIR/sysconfig/NAME
#     $CTDB_ETCDIR/default/NAME
#     $CTDB_BASE/sysconfig/NAME
# is sourced.  For NAME "ctdb", $CTDB_BASE/ctdbd.conf is sourced as
# well when it is readable.
_loadconfig() {

    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
            return
        fi
    fi

    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -z "$1" ] ; then
        return
    fi

    # Paths are quoted so that a directory containing whitespace
    # does not break the tests or the sourcing.
    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi

    if [ "$1" = "ctdb" ] ; then
        _config="${CTDB_BASE}/ctdbd.conf"
        if [ -r "$_config" ] ; then
            . "$_config"
        fi
    fi
}
# Entry point for loading configuration: hands every argument,
# unchanged, to _loadconfig().
loadconfig ()
{
    _loadconfig "$@"
}
##############################################################

# Print debug text, each line prefixed with "DEBUG: ", when
# CTDB_SCRIPT_DEBUGLEVEL (default 2) is at least 4.  The text is taken
# from the arguments or, when there are none, from stdin.
# CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
# configuration file.
debug ()
{
    if ! [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
        # Debugging is off.  Text arriving on stdin still has to be
        # consumed, so it is thrown away.
        [ -n "$1" ] || cat >/dev/null
        return
    fi

    # No arguments means the text comes from stdin, which allows a
    # here document to carry lots of debug output.
    case "$1" in
        "") sed -e 's@^@DEBUG: @' ;;
        *)  echo "DEBUG: $*" ;;
    esac
}
# Print message $1 on stdout and exit with status $2 (default 1).
die ()
{
    echo "$1"
    exit ${2:-1}
}
# Log given message or stdin to either syslog or a CTDB log file
# $1 is the tag passed to logger if syslog is in use.
# The remaining arguments form the message; when there are none the
# message is read from stdin.
#
# $CTDB_LOGGING selects the destination: "file:PATH" appends to PATH,
# an empty value appends to /var/log/log.ctdb, anything else goes
# through logger.
script_log ()
{
    _tag="$1" ; shift

    case "$CTDB_LOGGING" in
        file:*|"")
            if [ -n "$CTDB_LOGGING" ] ; then
                _file="${CTDB_LOGGING#file:}"
            else
                _file="/var/log/log.ctdb"
            fi
            {
                if [ -n "$*" ] ; then
                    echo "$*"
                else
                    cat
                fi
            } >>"$_file"
            ;;
        *)
            # Handle all syslog:* variants here too.  There's no tool to do
            # the lossy things, so just use logger.
            if [ -n "$*" ] ; then
                # Pass the message as one quoted word so that it is
                # neither re-split nor glob-expanded, matching what
                # the file branch writes.
                logger -t "ctdbd: ${_tag}" "$*"
            else
                # No message argument: logger is run without one
                logger -t "ctdbd: ${_tag}"
            fi
            ;;
    esac
}
# When things are run in the background in an eventscript then logging
# output might get lost.  This is the "solution".  :-)
#
# Runs the given command in the background with stdin taken from
# /dev/null and with stdout and stderr piped into script_log(), tagged
# "${script_name}&".  Always returns 0 without waiting.
background_with_logging ()
{
    {
        "$@" 2>&1 </dev/null | script_log "${script_name}&"
    } &

    return 0
}
##############################################################
# check number of args for different events
#
# Called with the event name followed by the event's arguments.
# "takeip" and "releaseip" need 4 words in total, "updateip" needs 5;
# a wrong count prints an error and exits with status 1.  Other events
# are not checked.
ctdb_check_args ()
{
    case "$1" in
        takeip|releaseip)
            _want=4
            _what="interface, IP and maskbits"
            ;;
        updateip)
            _want=5
            _what="old interface, new interface, IP and maskbits"
            ;;
        *)
            return 0
            ;;
    esac

    if [ $# != $_want ]; then
        echo "ERROR: must supply $_what"
        exit 1
    fi
}
##############################################################
# determine on what type of system (init style) we are running
#
# Sets CTDB_INIT_STYLE to "suse", "debian" or "redhat" depending on
# which helper program is installed.  A value that is already set is
# kept.
detect_init_style()
{
    # only do detection if not already set:
    [ -z "$CTDB_INIT_STYLE" ] || return

    # "redhat" is the fallback when neither helper is found
    CTDB_INIT_STYLE="redhat"
    if [ -x /sbin/startproc ]; then
        CTDB_INIT_STYLE="suse"
    elif [ -x /sbin/start-stop-daemon ]; then
        CTDB_INIT_STYLE="debian"
    fi
}
######################################################
# simulate /sbin/service on platforms that don't have it
# _service() makes it easier to hook the service() function for
# testing.
#
# $1 is the service name, $2 the operation.  The first executable of
# /sbin/service, /usr/sbin/service, $CTDB_ETCDIR/init.d/NAME and
# $CTDB_ETCDIR/rc.d/init.d/NAME is run, prefixed with whatever the
# caller put in $_nice.  Nothing is run when none of them exists.
_service ()
{
  _service_name="$1"
  _op="$2"

  # do nothing, when no service was specified
  if [ -z "$_service_name" ] ; then
      return 0
  fi

  # Build the command line in the positional parameters; the
  # operation is appended when the command is finally run.
  if [ -x /sbin/service ]; then
      set -- /sbin/service "$_service_name"
  elif [ -x /usr/sbin/service ]; then
      set -- /usr/sbin/service "$_service_name"
  elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then
      set -- $CTDB_ETCDIR/init.d/$_service_name
  elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
      set -- $CTDB_ETCDIR/rc.d/init.d/$_service_name
  else
      return 0
  fi

  $_nice "$@" "$_op"
}
# Run a service operation through _service() without a "nice" prefix.
# Arguments are passed on unchanged.
service()
{
    _nice=
    _service "$@"
}
######################################################
# simulate /sbin/service (niced) on platforms that don't have it
#
# Same as service() except that _service() prefixes the command
# with "nice".
nice_service()
{
    _nice=nice
    _service "$@"
}
######################################################
# wrapper around /proc/ settings to allow them to be hooked
# for testing
# 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
    _proc_file="/proc/$1"
    echo "$2" >"$_proc_file"
}
######################################################
# wrapper around getting file contents from /proc/ to allow
# this to be hooked for testing
# 1st arg is relative path under /proc/
get_proc ()
{
    _proc_file="/proc/$1"
    cat "$_proc_file"
}
######################################################
# Print up to $_max kernel stack traces for processes named $_program
#
# $1 is the program name, $2 the maximum number of traces (default 1).
# Processes whose stack cannot be read or is empty are not counted.
program_stack_traces ()
{
    _prog="$1"
    _max="${2:-1}"

    _count=1
    for _pid in $(pidof "$_prog") ; do
        [ $_count -le $_max ] || break

        # Read the stack before printing anything so that a process
        # that has just exited leaves no half-written entry.
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        [ -n "$_stack" ] || continue

        echo "Stack trace for ${_prog}[${_pid}]:"
        echo "$_stack"
        _count=$((_count + 1))
    done
}
######################################################
# Check that an RPC service is healthy -
# this includes allowing a certain number of failures
# before marking the NFS service unhealthy.
#
# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
#
# each triple is a set of 3 arguments: an operator, a
# fail count limit and an action string.
#
# For example:
#
#       nfs_check_rpc_service "lockd" \
#           -ge 15 "verbose restart unhealthy" \
#           -eq 10 "restart:b"
#
# says that if lockd is down for 15 iterations then do
# a verbose restart of lockd and mark the node unhealthy.
# Before this, after 10 iterations of failure, the
# service is restarted silently in the background.
# Order is important: the number of failures need to be
# specified in reverse order because processing stops
# after the first condition that is true.
######################################################
nfs_check_rpc_service ()
{
    _prog_name="$1" ; shift

    # Nothing more to do while the service answers
    _nfs_check_rpc_common "$_prog_name" && return

    # Walk the triples until one of them applies
    until [ -z "$3" ] ; do
        _nfs_check_rpc_action "$1" "$2" "$3" && break
        shift 3
    done
}
# The new way of doing things...
#
# Checks every RPC service that has a file NN.NAME.check in
# ${CTDB_BASE}/nfs-rpc-checks.d/.  NAME is the program name handed to
# _nfs_check_rpc_common().  For a service that is down, each
# non-comment line of the file is read as "OPERATOR LIMIT ACTIONS..."
# and passed to _nfs_check_rpc_action() until one line applies.
nfs_check_rpc_services ()
{
    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
        # When nothing matches, the pattern itself is handed to the
        # loop.  Skip it instead of treating "*" as a program name.
        [ -r "$_f" ] || continue

        _t="${_f%.check}"
        _prog_name="${_t##*/[0-9][0-9].}"

        if _nfs_check_rpc_common "$_prog_name" ; then
            # This RPC service is up, check next service...
            continue
        fi

        # Check each line in the file in turn until one of the limit
        # checks is hit...
        while read _cmp _lim _rest ; do
            # Skip comments
            case "$_cmp" in
                \#*) continue ;;
            esac

            if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
                # Limit was hit on this line, no further checking...
                break
            fi
        done <"$_f"
    done
}
# Check one RPC service and maintain its failure counter.
#
# $1 is one of nfsd, mountd, rquotad, lockd or statd; any other name
# prints an internal error and exits with status 1.  The name is
# mapped to an RPC program and version for ctdb_check_rpc().  The
# counter "nfs_NAME" is reset and 0 returned when the check passes,
# otherwise the counter is incremented and 1 returned.
_nfs_check_rpc_common ()
{
    _prog_name="$1"

    case "$_prog_name" in
        nfsd)    _rpc_prog=nfs      ; _version=3 ;;
        mountd)  _rpc_prog=mountd   ; _version=1 ;;
        rquotad) _rpc_prog=rquotad  ; _version=1 ;;
        lockd)   _rpc_prog=nlockmgr ; _version=4 ;;
        statd)
            # Some platforms don't have separate programs for all
            # services; without an rpc.statd there is nothing to check.
            which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
            _rpc_prog=status
            _version=1
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac

    _service_name="nfs_${_prog_name}"

    if ! ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
        ctdb_counter_incr "$_service_name"
        return 1
    fi

    ctdb_counter_init "$_service_name"
    return 0
}
# Apply one "operator limit actions" rule to the failure counter named
# by $_service_name.
#
# $1 is the comparison operator, $2 the limit, $3 a space separated
# list of actions.  Returns 1 without doing anything when the counter
# check does not hit.  Otherwise the actions run in order and 0 is
# returned.  Actions: "verbose" prints $ctdb_check_rpc_out, "restart"
# and "restart:b" restart $_prog_name (the latter in the background),
# "unhealthy" exits with status 1.  Unknown actions print an internal
# error and exit with status 1.
_nfs_check_rpc_action ()
{
    _cmp="$1"
    _limit="$2"
    _actions="$3"

    # In "quiet" mode ctdb_check_counter() returns non-zero only when
    # the limit has been hit.
    ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" && return 1

    for _action in $_actions ; do
        case "$_action" in
            verbose)   echo "$ctdb_check_rpc_out" ;;
            restart)   _nfs_restart_rpc_service "$_prog_name" ;;
            restart:b) _nfs_restart_rpc_service "$_prog_name" true ;;
            unhealthy) exit 1 ;;
            *)
                echo "Internal error: unknown action \"$_action\"."
                exit 1
                ;;
        esac
    done

    return 0
}
# Restart the RPC service named by $1 (nfsd, mountd, rquotad, lockd or
# statd).  When $2 is "true" the restart command is started through
# background_with_logging().
#
# nfsd and lockd are restarted with startstop_nfs / startstop_nfslock.
# The other daemons are killed, their stack traces dumped with
# nfs_dump_some_threads() and then started again as rpc.NAME with
# options built from the configuration variables.  An unknown name
# prints an internal error and exits with status 1.
_nfs_restart_rpc_service ()
{
    _prog_name="$1"
    _background="${2:-false}"

    _maybe_background=""
    if $_background ; then
        _maybe_background="background_with_logging"
    fi

    _p="rpc.${_prog_name}"

    # Services restarted through the platform's init scripts
    case "$_prog_name" in
        nfsd)
            echo "Trying to restart NFS service"
            $_maybe_background startstop_nfs restart
            return
            ;;
        lockd)
            echo "Trying to restart lock manager service"
            $_maybe_background startstop_nfslock restart
            return
            ;;
        mountd|rquotad|statd)
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac

    # Daemons that are killed and started directly
    echo "Trying to restart $_prog_name [${_p}]"
    killall -q -9 "$_p"
    nfs_dump_some_threads "$_p"

    case "$_prog_name" in
        mountd)
            $_maybe_background $_p $RPCMOUNTDOPTS \
                               ${MOUNTD_PORT:+-p} $MOUNTD_PORT
            ;;
        rquotad)
            $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
            ;;
        statd)
            $_maybe_background $_p \
                ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
                ${STATD_PORT:+-p} $STATD_PORT \
                ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
            ;;
    esac
}
######################################################
# check that a rpc server is registered with portmap
# and responding to requests
# usage: ctdb_check_rpc SERVICE_NAME VERSION
#
# The host queried is $CTDB_RPCINFO_LOCALHOST (default 127.0.0.1).
# The output of rpcinfo is kept in $ctdb_check_rpc_out.  On failure it
# is prefixed with an error line, printed, and 1 is returned.
######################################################
ctdb_check_rpc ()
{
    progname="$1"
    version="$2"

    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

    if ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) ; then
        return 0
    fi

    ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
######################################################
# Ensure $service_name is set
#
# Calls die() with an internal error message when it is empty.
assert_service_name ()
{
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
######################################################
# check a set of directories is available
# return 1 on a missing directory
# directories are read from stdin
#
# Lines containing "%" are skipped.  The line being checked is left
# in $d, so after a failure $d names the missing directory.
######################################################
ctdb_check_directories_probe()
{
    # -r keeps backslashes in a path literal instead of letting
    # read treat them as escape characters.
    while IFS="" read -r d ; do
        case "$d" in
            *%*)
                continue
                ;;
            *)
                [ -d "${d}/." ] || return 1
        esac
    done
}
######################################################
# check a set of directories is available
# directories are read from stdin
#
# Prints an error naming the directory left in $d by
# ctdb_check_directories_probe() and exits with status 1 when the
# probe fails.
######################################################
ctdb_check_directories()
{
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
######################################################
# check a set of tcp ports
# usage: ctdb_check_tcp_ports <ports...>
######################################################

# This flag file is created when a service is initially started.  It
# is deleted the first time TCP port checks for that service succeed.
# Until then ctdb_check_tcp_ports() prints a more subtle "error"
# message if a port check fails.
#
# Sets $_ctdb_service_started_file to the path of that flag file;
# $service_name must be set.
_ctdb_check_tcp_common ()
{
    assert_service_name
    _ctdb_service_started_file="${ctdb_fail_dir}/${service_name}.started"
}
# Create the "service started" flag file for $service_name, making
# its directory first if needed.
ctdb_check_tcp_init ()
{
    _ctdb_check_tcp_common
    _started_dir="${_ctdb_service_started_file%/*}"
    mkdir -p "$_started_dir"
    touch "$_ctdb_service_started_file"
}
# Check whether something is listening on all of the given TCP ports
# using the "ctdb checktcpport" command.
#
# Exit status of "ctdb checktcpport":
#   0     - a message is printed (ERROR, or INFO while the service's
#           "started" flag file still exists) and 1 is returned
#   98    - the port is treated as in use, the next port is checked
#   other - reported as an unexpected error and returned as is
# When every port is in use the flag file is removed and 0 returned.
# Calling this without any port is an internal error (exit 1).
ctdb_check_tcp_ports()
{
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                _ctdb_check_tcp_common
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug "\"ctdb checktcpport $_p\" was able to bind to port"
                else
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                echo "ERROR: unexpected error running \"ctdb checktcpport\""
                # The here-document carries only the command output;
                # no stray quote character is added to it.
                debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out
EOF
                return $_ret
                ;;
        esac
    done

    # All ports listening
    _ctdb_check_tcp_common
    rm -f "$_ctdb_service_started_file"
    return 0
}
######################################################
# check a unix socket
# usage: ctdb_check_unix_socket SERVICE_NAME <socket_path>
#
# As written, the socket path is read from the first argument and
# the service name from $service_name.  An empty path counts as
# success.  Returns 1 with an error message when netstat shows no
# listening unix socket whose line ends in the path.
######################################################
ctdb_check_unix_socket() {
    socket_path="$1"
    if [ -z "$socket_path" ] ; then
        return 0
    fi

    if netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
        return 0
    fi

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
######################################################
# check a command returns zero status
# usage: ctdb_check_command <command>
#
# On failure an error line is printed, the command's combined output
# is sent to debug() and the script exits with status 1.
######################################################
ctdb_check_command ()
{
    if ! _out=$("$@" 2>&1) ; then
        echo "ERROR: $* returned error"
        echo "$_out" | debug
        exit 1
    fi
}
################################################
# kill off any TCP connections with the given IP
#
# $1 is the IP address.  With "oneway" as $2 only the "source
# destination" direction of each connection is handed to
# "ctdb killtcp"; otherwise both directions are, except for
# connections to ports 139 and 445 which are always one-way.
# Afterwards the remaining connections are counted, with up to
# three one second waits, and the outcome is reported.
################################################
kill_tcp_connections ()
{
    _ip="$1"

    if [ "$2" = "oneway" ] ; then
        _oneway=true
    else
        _oneway=false
    fi

    get_tcp_connections_for_ip "$_ip" | {
        _killcount=0
        _connections=""
        _nl="
"
        while read _dst _src; do
            _destport="${_dst##*:}"
            case $_destport in
                # we only do one-way killtcp for CIFS
                139|445) __oneway=true ;;
                *)       __oneway=$_oneway ;;
            esac

            echo "Killing TCP connection $_src $_dst"
            _connections="${_connections}${_nl}${_src} ${_dst}"
            $__oneway || _connections="${_connections}${_nl}${_dst} ${_src}"

            _killcount=$((_killcount + 1))
        done

        if [ $_killcount -eq 0 ] ; then
            return
        fi

        if ! echo "$_connections" | ctdb killtcp ; then
            echo "Failed to send killtcp control"
            return
        fi

        # Poll until the connections are gone or patience runs out
        _count=0
        while : ; do
            _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)

            if [ $_remaining -eq 0 ] ; then
                echo "Killed $_killcount TCP connections to released IP $_ip"
                return
            fi

            _count=$((_count + 1))
            if [ $_count -gt 3 ] ; then
                echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
                return
            fi

            echo "Waiting for $_remaining connections to be killed for IP $_ip"
            sleep 1
        done
    }
}
##################################################################
# kill off the local end for any TCP connections with the given IP
#
# Shorthand for kill_tcp_connections() in "oneway" mode.
##################################################################
kill_tcp_connections_local_only ()
{
    kill_tcp_connections "$1" oneway
}
##################################################################
# tickle any TCP connections with the given IP
#
# Every connection reported by get_tcp_connections_for_ip() is
# tickled in both directions with "ctdb tickle".  A single message is
# printed at the end if any tickle failed.
##################################################################
tickle_tcp_connections ()
{
    _ip="$1"

    get_tcp_connections_for_ip "$_ip" |
    {
        _failed=false

        while read dest src; do
            echo "Tickle TCP connection $src $dest"
            if ! ctdb tickle $src $dest >/dev/null 2>&1 ; then
                _failed=true
            fi

            echo "Tickle TCP connection $dest $src"
            if ! ctdb tickle $dest $src >/dev/null 2>&1 ; then
                _failed=true
            fi
        done

        if $_failed ; then
            echo "Failed to send tickle control"
        fi
    }
}
# Print "local remote" (columns 4 and 5 of "netstat -tn") for every
# ESTABLISHED TCP connection whose local address is $1, either plain
# or prefixed with "::ffff:".
get_tcp_connections_for_ip ()
{
    _ip="$1"

    netstat -tn | awk -v ip=$_ip '
        index($1, "tcp") == 1 &&
        (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) &&
        $6 == "ESTABLISHED" {
            print $4 " " $5
        }'
}
##################################################################
# use statd-callout to update NFS lock info
#
# Runs "$CTDB_BASE/statd-callout update" when that file is
# executable; otherwise does nothing.
##################################################################
nfs_update_lock_info ()
{
    _callout="$CTDB_BASE/statd-callout"
    if [ ! -x "$_callout" ] ; then
        return 0
    fi

    "$_callout" update
}
########################################################
# start/stop the Ganesha nfs service
#
# $1 is start, stop or restart; anything else does nothing.  The
# service is called "nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE".
# A restart dumps rpc.statd stack traces between stop and start.
########################################################
startstop_ganesha()
{
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop)
            service "$_service_name" "$1"
            ;;
        restart)
            service "$_service_name" stop
            nfs_dump_some_threads "rpc.statd"
            service "$_service_name" start
            ;;
    esac
}
########################################################
# start/stop the nfs service on different platforms
#
# $1 is start, stop or restart.  The platform is "sles" when
# $CTDB_ETCDIR/init.d/nfsserver is executable and "rhel" when
# $CTDB_ETCDIR/init.d/nfslock is executable or the systemd unit
# nfs-lock.service is readable; "rhel" wins if both match.  On any
# other platform an error is printed and the script exits with 1.
########################################################
startstop_nfs() {
        PLATFORM="unknown"
        if [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
                PLATFORM="sles"
        fi
        if [ -x $CTDB_ETCDIR/init.d/nfslock ] || \
            [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
                PLATFORM="rhel"
        fi

        case "$PLATFORM:$1" in
        sles:start)
                service nfsserver start
                ;;
        sles:stop)
                service nfsserver stop > /dev/null 2>&1
                ;;
        sles:restart)
                set_proc "fs/nfsd/threads" 0
                service nfsserver stop > /dev/null 2>&1
                pkill -9 nfsd
                nfs_dump_some_threads
                service nfsserver start
                ;;
        rhel:start)
                service nfslock start
                service nfs start
                ;;
        rhel:stop)
                service nfs stop
                service nfslock stop
                ;;
        rhel:restart)
                set_proc "fs/nfsd/threads" 0
                service nfs stop > /dev/null 2>&1
                service nfslock stop > /dev/null 2>&1
                pkill -9 nfsd
                nfs_dump_some_threads
                service nfslock start
                service nfs start
                ;;
        sles:*|rhel:*)
                # Known platform, unknown operation: nothing to do
                ;;
        *)
                echo "Unknown platform. NFS is not supported with ctdb"
                exit 1
                ;;
        esac
}
# Dump up to the configured number of nfsd thread backtraces.
#
# $1 overrides the program name (default "nfsd").  The number of
# traces comes from $CTDB_NFS_DUMP_STUCK_THREADS (default 5); nothing
# is dumped unless it is greater than 0.
nfs_dump_some_threads ()
{
    _prog="${1:-nfsd}"
    _num="${CTDB_NFS_DUMP_STUCK_THREADS:-5}"

    if [ $_num -gt 0 ] ; then
        program_stack_traces "$_prog" $_num
    fi
}
########################################################
# start/stop the nfs lockmanager service on different platforms
#
# $1 is start, stop or restart.  Platform detection is the same as in
# startstop_nfs().  On any other platform an error is printed and the
# script exits with status 1.
########################################################
startstop_nfslock() {
        PLATFORM="unknown"
        if [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
                PLATFORM="sles"
        fi
        if [ -x $CTDB_ETCDIR/init.d/nfslock ] || \
            [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
                PLATFORM="rhel"
        fi

        case "$PLATFORM:$1" in
        # for sles there is no service for lockmanager
        # so we instead just shutdown/restart nfs
        sles:start)
                service nfsserver start
                ;;
        sles:stop)
                service nfsserver stop > /dev/null 2>&1
                ;;
        sles:restart)
                service nfsserver stop > /dev/null 2>&1
                service nfsserver start
                ;;
        rhel:start)
                service nfslock start
                ;;
        rhel:stop)
                service nfslock stop > /dev/null 2>&1
                ;;
        rhel:restart)
                service nfslock stop > /dev/null 2>&1
                service nfslock start
                ;;
        sles:*|rhel:*)
                # Known platform, unknown operation: nothing to do
                ;;
        *)
                echo "Unknown platform. NFS locking is not supported with ctdb"
                exit 1
                ;;
        esac
}
########################################################

# Add an address to an interface.
# usage: add_ip_to_iface IFACE IP MASKBITS
#
# The interface is brought up first; failing that is fatal (die).
# IPv4 addresses are added with "brd +".  For IPv6 addresses (those
# containing ":") the function waits for the address to stop being
# tentative and then verifies that it is really configured.
# Returns 1, after printing a message, when the address could not be
# added.
add_ip_to_iface ()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Ensure interface is up
    ip link set "$_iface" up || \
        die "Failed to bringup interface $_iface"

    # Only need to define broadcast for IPv4.  The test has to look at
    # $_ip, the address being added, so that IPv6 addresses are
    # recognised and get the tentative check below.
    case "$_ip" in
        *:*) _bcast=""      ;;
        *)   _bcast="brd +" ;;
    esac

    ip addr add "$_ip/$_maskbits" $_bcast dev "$_iface" || {
        echo "Failed to add $_ip/$_maskbits on dev $_iface"
        return 1
    }

    # Wait 5 seconds for IPv6 addresses to stop being tentative...
    if [ -z "$_bcast" ] ; then
        for _x in $(seq 1 10) ; do
            ip addr show to "${_ip}/128" | grep -q "tentative" || break
            sleep 0.5
        done

        # If the address was a duplicate then it won't be on the
        # interface so flag an error.
        _t=$(ip addr show to "${_ip}/128")
        case "$_t" in
            "")
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                return 1
                ;;
            *tentative*|*dadfailed*)
                echo "Failed to add $_ip/$_maskbits on dev $_iface"
                ip addr del "$_ip/$_maskbits" dev "$_iface"
                return 1
                ;;
        esac
    fi
}
# Remove an address from an interface.
# usage: delete_ip_from_iface IFACE IP MASKBITS
#
# Returns 1, after printing a message, when "ip addr del" fails.
delete_ip_from_iface()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # This could be set globally for all interfaces but it is probably
    # better to avoid surprises, so limit it the interfaces where CTDB
    # has public IP addresses.  There isn't anywhere else convenient
    # to do this so just set it each time.  This is much cheaper than
    # remembering and re-adding secondaries.
    set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

    if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
        echo "Failed to del $_ip on dev $_iface"
        return 1
    fi
}
# If the given IP is hosted then print 3 items: maskbits, iface and
# address family ("inet" or "inet6").  The family is chosen by
# whether the address contains ":".  Nothing is printed when
# "ip addr show" reports no matching address.
ip_maskbits_iface ()
{
    _addr="$1"

    case "$_addr" in
        *:*)
            _family="inet6"
            _bits=128
            ;;
        *)
            _family="inet"
            _bits=32
            ;;
    esac

    # Line 1 of the output carries the interface name followed by ":";
    # the "inet..." lines carry ADDRESS/MASKBITS in field 2.
    ip addr show to "${_addr}/${_bits}" 2>/dev/null | \
        awk -v family="${_family}" '
            NR == 1 {
                iface = $2
                sub(":$", "", iface)
            }
            $1 ~ /inet/ {
                mask = $2
                sub(".*/", "", mask)
                print mask, iface, family
            }'
}
# Remove address $1 (an optional "/maskbits" suffix is ignored) from
# whichever interface ip_maskbits_iface() reports it on.  Does nothing
# when the address is not hosted.  Output of the removal itself is
# discarded.
drop_ip ()
{
    _addr="${1%/*}"  # Remove optional maskbits

    set -- $(ip_maskbits_iface $_addr)
    if [ -z "$1" ] ; then
        return 0
    fi

    _maskbits="$1"
    _iface="$2"
    echo "Removing public address $_addr/$_maskbits from device $_iface"
    delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
}
# Call drop_ip() for the first word of every line of the file named
# by $CTDB_PUBLIC_ADDRESSES.  Without that variable nothing is done.
drop_all_public_ips ()
{
    _addr_file="${CTDB_PUBLIC_ADDRESSES:-/dev/null}"

    while read _ip _x ; do
        drop_ip "$_ip"
    done <"$_addr_file"
}
# Write 1 to the IPv4 and then the IPv6 route flush entries under
# /proc/sys/net, using set_proc().
flush_route_cache ()
{
    for _af in ipv4 ipv6 ; do
        set_proc "sys/net/${_af}/route/flush" 1
    done
}
########################################################
# Simple counters
#
# Shared setup for the counter functions.  The counter name is $1 or,
# when that is empty, $service_name, or else $script_name.  Sets
# $_service_name and $_counter_file and creates the directory that
# holds the counter file.
_ctdb_counter_common () {
    if [ -n "$1" ] ; then
        _service_name="$1"
    elif [ -n "$service_name" ] ; then
        _service_name="$service_name"
    else
        _service_name="$script_name"
    fi

    _counter_file="${ctdb_fail_dir}/${_service_name}"
    mkdir -p "${_counter_file%/*}" # dirname
}
# Reset the failure counter named by $1 (see _ctdb_counter_common())
# by truncating its file.
ctdb_counter_init () {
    _ctdb_counter_common "$1"

    : >"$_counter_file"
}
# Increment the failure counter named by $1 (see
# _ctdb_counter_common()).  The count is the size of the counter file,
# so exactly one byte is appended.
ctdb_counter_incr () {
    _ctdb_counter_common "$1"

    # unary counting!
    # printf is used because the effect of "echo -n" differs
    # between shells, and anything but a single byte would miscount.
    printf 1 >> "$_counter_file"
}
# Compare a failure counter against a limit.
# usage: ctdb_check_counter [MSG [OP [LIMIT [NAME]]]]
#
#   MSG   - "error" (default) prints a message and exits with status
#           1 when the limit is hit; anything else returns 1 silently
#   OP    - an integer operator supported by test (default -ge), or
#           "%" which hits when the count is a multiple of LIMIT
#   LIMIT - defaults to $service_fail_limit
#   NAME  - counter name, see _ctdb_counter_common()
#
# Returns 0 when the limit is not hit.
ctdb_check_counter () {
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"
    # The counter name is read straight from $4.  A "shift 3" would
    # fail when fewer than 3 arguments are given, although all of them
    # have defaults.
    _ctdb_counter_common "$4"

    # unary counting!
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    _hit=false
    if [ "$_op" != "%" ] ; then
        if [ $_size $_op $_limit ] ; then
            _hit=true
        fi
    else
        if [ $(($_size $_op $_limit)) -eq 0 ] ; then
            _hit=true
        fi
    fi
    if $_hit ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
########################################################

# Directories below $CTDB_VARDIR/state holding the per-script status
# files and the failure counters.
ctdb_status_dir="${CTDB_VARDIR}/state/service_status"
ctdb_fail_dir="${CTDB_VARDIR}/state/failcount"
# Set $service_state_dir to the state directory of service $1
# (default $service_name) below $CTDB_VARDIR/state/service_state and
# create it.  Exits with status 1 when it cannot be created.
ctdb_setup_service_state_dir ()
{
    service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
########################################################
# Managed status history, for auto-start/stop

ctdb_managed_dir="${CTDB_VARDIR}/state/managed_history"

# Set $_ctdb_managed_file to the "managed" marker file of
# $service_name.
_ctdb_managed_common ()
{
    _ctdb_managed_file="${ctdb_managed_dir}/${service_name}"
}
# Record that $service_name is managed by creating its marker file
# in $ctdb_managed_dir.
ctdb_service_managed ()
{
    _ctdb_managed_common

    mkdir -p "$ctdb_managed_dir"
    touch "$_ctdb_managed_file"
}
# Record that $service_name is no longer managed by removing its
# marker file.
ctdb_service_unmanaged ()
{
    _ctdb_managed_common

    rm -f "$_ctdb_managed_file"
}
# Succeeds when the "managed" marker file of $service_name exists.
is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common

    test -f "$_ctdb_managed_file"
}
########################################################
# Check and set status

# Print a one line report for status $1 of $script_name, with the
# contents of file $2 as the description of the problem.
log_status_cat ()
{
    _problem=$(cat $2)
    echo "node is \"$1\", \"${script_name}\" reports problem: ${_problem}"
}
# Report the status recorded for $script_name in $ctdb_status_dir.
# Returns 1 when an "unhealthy" file is readable, 2 when a "banned"
# file is readable (unhealthy takes precedence), otherwise 0.  The
# recorded problem is printed through log_status_cat().
ctdb_checkstatus ()
{
    _status_dir="$ctdb_status_dir/$script_name"

    if [ -r "$_status_dir/unhealthy" ] ; then
        log_status_cat "unhealthy" "$_status_dir/unhealthy"
        return 1
    fi

    if [ -r "$_status_dir/banned" ] ; then
        log_status_cat "banned" "$_status_dir/banned"
        return 2
    fi

    return 0
}
# Record or clear the status of $script_name.
# For $1 "unhealthy" or "banned" the contents of file $2 are stored
# as the reason.  Any other $1 removes both status files.
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"
    case "$1" in
        unhealthy|banned)
            mkdir -p "$d"
            cat "$2" >"$d/$1"
            ;;
        *)
            rm -f "$d/banned" "$d/unhealthy"
            ;;
    esac
}
##################################################################
# Reconfigure a service on demand

# Shared setup for the reconfigure helpers: set $_d to the status
# directory for $service_name (creating it if needed) and
# $_ctdb_service_reconfigure_flag to the "reconfigure" flag file in it.
_ctdb_service_reconfigure_common ()
{
    _d="$ctdb_status_dir/${service_name}"
    mkdir -p "$_d"
    _ctdb_service_reconfigure_flag="$_d/reconfigure"
}
# Succeed (return 0) if the reconfigure flag file for $service_name
# exists.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common
    [ -e "$_ctdb_service_reconfigure_flag" ]
}
# Create (or truncate) the reconfigure flag file for $service_name.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common
    : >"$_ctdb_service_reconfigure_flag"
}
# Remove the reconfigure flag file for $service_name, if present.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common
    rm -f "$_ctdb_service_reconfigure_flag"
}
# Reconfigure $service_name: clear the reconfigure flag, run
# service_reconfigure() and then call ctdb_counter_init.
#
# If service_reconfigure fails its status is returned immediately and
# ctdb_counter_init is not called.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    # The flag is cleared before the attempt, not after it.
    ctdb_service_unset_reconfigure
    service_reconfigure || return $?
    ctdb_counter_init
}
# Default service_reconfigure() function does nothing and succeeds.
service_reconfigure ()
{
    :
}
# Try to take the reconfigure lock for $service_name.
#
# The lock is the file "reconfigure_lock" in the service's status
# directory.  Its first line is the name of the event that took the
# lock and each following line is a PID.
#
# Returns 0 after writing $event_name and this shell's PID ($$) to the
# lock file.  Returns 1, leaving the file untouched, if the file lists
# a PID other than $$ for which "kill -0" succeeds.
ctdb_reconfigure_take_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    (
        # stdin is the lock file; flock it for the life of the subshell
        # so the check and the update below are not interleaved.
        flock 0
        # This is overkill but will work if we need to extend this to
        # allow certain events to run multiple times in parallel
        # (e.g. takeip) and write multiple PIDs to the file.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                if [ -n "$_pid" ] && [ "$_pid" != "$$" ] && \
                    kill -0 "$_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
# Release the reconfigure lock for $service_name by removing the
# "reconfigure_lock" file from its status directory.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"

    rm -f "$_lock"
}
# Re-emit the result of the previous "monitor" run of $script_name, as
# reported by "ctdb scriptstatus -X", and exit with its return code.
#
# This function never returns; it always exits the calling script.
# TIMEDOUT is replayed as exit code 1 and DISABLED as exit code 0,
# each with an explanatory prefix on the output.  If no matching line
# is found the exit code is 0.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading separator ('|') is missing in some versions...
    _out=$(ctdb scriptstatus -X | grep -E "^\|?monitor\|${script_name}\|")
    # Output looks like this:
    # |monitor|60.nfs|1|ERROR|1314764004.030861|1314764004.035514|foo bar|
    # This is the cheapest way of getting fields in the middle.
    # The subshell splits on '|' and echo rejoins the fields with
    # spaces; the outer word splitting then drops the empty leading
    # field, so "monitor" is always $1.
    set -- $(IFS="|" ; echo $_out)
    _code="$3"
    _status="$4"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading '|') and fixed output.
    _out="${_out%|}"
    _err_out="${_out#*monitor|${script_name}|*|*|*|*|}"
    case "$_status" in
        OK) : ;;  # Do nothing special.
        TIMEDOUT)
            # Recast this as an error, since we can't exit with the
            # correct negative number.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Recast this as an OK, since we can't exit with the
            # correct negative number.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *) : ;;  # Must be ERROR, do nothing special.
    esac
    if [ -n "$_err_out" ] ; then
        echo "$_err_out"
    fi
    # An empty code (no previous status found) is treated as success.
    exit "${_code:-0}"
}
# Coordinate a service reconfigure with the "monitor", "ipreallocated"
# and "reconfigure" events.  Any other event returns 0 immediately.
#
# If the reconfigure lock can be taken:
#   reconfigure   - run ctdb_service_reconfigure in a subshell, release
#                   the lock and exit with the reconfigure's status.
#   ipreallocated - run a pending reconfigure (if flagged), release the
#                   lock and return to the caller.
#   monitor       - release the lock and return to the caller.
#
# If the lock is held by another live process:
#   reconfigure   - exit 2 so the caller can retry.
#   ipreallocated - return 0 and let the event proceed.
#   monitor       - replay the previous monitor status and exit.
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so proceed
        # without restriction.
        case "$event_name" in
            reconfigure)
                # The subshell keeps an "exit" inside the reconfigure
                # from terminating this shell, so the lock can always
                # be released before we exit.
                (ctdb_service_reconfigure)
                _ctdb_reconfigure_ret=$?
                ctdb_reconfigure_release_lock
                exit $_ctdb_reconfigure_ret
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                # This is true in the current code.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigure.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
##################################################################
# Does CTDB manage this service? - and associated auto-start/stop

# Backward-compatibility helper: append service $2 to
# $CTDB_MANAGED_SERVICES when $1 is "yes" and $2 equals $service_name.
#
# $1 - value of a legacy CTDB_MANAGES_* variable
# $2 - service name associated with that variable
ctdb_compat_managed_service ()
{
    if [ "$1" = "yes" ] && [ "$2" = "$service_name" ] ; then
        CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
    fi
}
# Succeed (return 0) if $service_name is listed in
# $CTDB_MANAGED_SERVICES, either directly or after folding in the
# legacy CTDB_MANAGES_* variables.
#
# Side effect: when the first check fails, $CTDB_MANAGED_SERVICES may
# be extended by ctdb_compat_managed_service.
is_ctdb_managed_service ()
{
    assert_service_name

    # $t is used just for readability and to allow more accurate
    # matching via leading/trailing spaces
    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "<space>$service_name<space>" appears in $t
    if [ "${t#* ${service_name} }" != "${t}" ] ; then
        return 0
    fi

    # If above didn't match then update $CTDB_MANAGED_SERVICES for
    # backward compatibility and try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "<space>$service_name<space>" appears in $t
    [ "${t#* ${service_name} }" != "${t}" ]
}
# Start or stop $service_name in response to the current event.
#
# service-start / service-stop pseudo-events: refuse (via die) when
# the service is managed or $CTDB_SERVICE_AUTOSTARTSTOP is "yes";
# otherwise start/stop the service and exit with the resulting status.
#
# monitor event with $CTDB_SERVICE_AUTOSTARTSTOP = "yes": start a
# service that has become managed, or stop one that is no longer
# managed, in the background, then exit with that command's status.
#
# In every other case the function returns to the caller without
# doing anything.
ctdb_start_stop_service ()
{
    assert_service_name

    # Allow service-start/service-stop pseudo-events to start/stop
    # services when we're not auto-starting/stopping and we're not
    # monitoring.
    case "$event_name" in
        service-start|service-stop)
            if is_ctdb_managed_service ; then
                die "$event_name event not permitted when service is managed"
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die "$event_name event not permitted with \$CTDB_SERVICE_AUTOSTARTSTOP = yes"
            fi
            case "$event_name" in
                service-start) ctdb_service_start ;;
                service-stop)  ctdb_service_stop ;;
            esac
            exit $?
            ;;
    esac

    # Do nothing unless configured to...
    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0

    [ "$event_name" = "monitor" ] || return 0

    if is_ctdb_managed_service ; then
        if ! is_ctdb_previously_managed_service ; then
            echo "Starting service \"$service_name\" - now managed"
            background_with_logging ctdb_service_start
            exit $?
        fi
    else
        if is_ctdb_previously_managed_service ; then
            echo "Stopping service \"$service_name\" - no longer managed"
            background_with_logging ctdb_service_stop
            exit $?
        fi
    fi
}
# Start $service_name via service_start(), then call
# ctdb_counter_init and ctdb_check_tcp_init.
#
# Returns the status of service_start if it fails; the two init calls
# are then skipped.
ctdb_service_start ()
{
    # The service is marked managed if we've ever tried to start it.
    ctdb_service_managed

    service_start || return $?

    ctdb_counter_init
    ctdb_check_tcp_init
}
# Mark $service_name as no longer managed and stop it via
# service_stop().  Returns the status of service_stop.
ctdb_service_stop ()
{
    ctdb_service_unmanaged
    service_stop
}
# Default service_start() and service_stop() functions.

# These may be overridden in an eventscript.

# Default start action: pass "$service_name" and "start" to service.
service_start ()
{
    service "$service_name" start
}
# Default stop action: pass "$service_name" and "stop" to service.
# May be overridden in an eventscript.
service_stop ()
{
    service "$service_name" stop
}
##################################################################

# Handle the "status" and "setstatus" events, which are common to all
# event scripts.  For either event the calling script exits here;
# for any other event the function returns to the caller.
#
# $1    - event name
# $2... - for "setstatus", the arguments passed on to ctdb_setstatus
ctdb_standard_event_handler ()
{
    case "$1" in
        status)
            ctdb_checkstatus
            # A bare exit propagates the status of ctdb_checkstatus.
            exit
            ;;
        setstatus)
            shift
            ctdb_setstatus "$@"
            exit
            ;;
    esac
}
# Run iptables or ip6tables under a lock.
#
# $1    - address family; "inet6" selects ip6tables, anything else
#         selects iptables
# $2... - arguments passed through to the chosen command
iptables_wrapper ()
{
    _family="$1" ; shift
    case "$_family" in
        inet6) _iptables_cmd="ip6tables" ;;
        *)     _iptables_cmd="iptables" ;;
    esac

    # iptables doesn't like being re-entered, so flock-wrap it.
    flock -w 30 "${CTDB_VARDIR}/iptables-ctdb.flock" "$_iptables_cmd" "$@"
}
# AIX (and perhaps others?) doesn't have mktemp
#
# Fallback defined only when no mktemp command is found.  It supports
# the bare form and "-d" (create a directory instead of a file), and
# prints the path it created.  Any other arguments are ignored.
# Returns 1 without printing a path if creation fails.
if ! command -v mktemp >/dev/null 2>&1 ; then
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        _d="${TMPDIR:-/tmp}"
        # The first 10 hex digits of an MD5 over random data form the
        # unique part of the name.
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_d}/tmp.${_hex10}"
        (
            umask 077
            if $_dir ; then
                mkdir "$_t"
            else
                # noclobber: fail rather than truncate an existing file
                set -C
                >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
########################################################
# tickle handling
########################################################

# Synchronise CTDB's tickle list for a TCP port with the connections
# currently established to this node's public IPs.
#
# $1 - local TCP port to examine
#
# Tickles are added for established connections that have none, and
# removed when the connection is no longer listed by netstat.
update_tickles ()
{
    _port="$1"

    tickledir="$CTDB_VARDIR/state/tickles"
    mkdir -p "$tickledir"

    # Who am I?
    _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

    # What public IPs do I hold?
    _ips=$(ctdb -X ip | awk -F'|' -v pnn="$_pnn" '$3 == pnn {print $2}')

    # IPs as a regexp choice.  $_ips is deliberately unquoted so the
    # addresses are joined by single spaces before sed rewrites them.
    # Dots get a doubled backslash because awk -v processes escapes.
    _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

    # Record connections to our public IPs in a temporary file
    _my_connections="${tickledir}/${_port}.connections"
    rm -f "$_my_connections"
    netstat -tn |
    awk -v destpat="^${_ipschoice}:${_port}\$" \
        '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
    sort >"$_my_connections"

    # Record our current tickles in a temporary file
    _my_tickles="${tickledir}/${_port}.tickles"
    rm -f "$_my_tickles"
    for _i in $_ips ; do
        ctdb -X gettickles "$_i" "$_port" |
        awk -F'|' 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
    done |
    sort >"$_my_tickles"

    # Add tickles for connections that we haven't already got tickles for
    comm -23 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb addtickle "$_src" "$_dst"
    done

    # Remove tickles for connections that are no longer there
    comm -13 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb deltickle "$_src" "$_dst"
    done

    rm -f "$_my_connections" "$_my_tickles"
}
########################################################
# load a site local config file
########################################################

# An explicitly configured file, sourced only if it is executable.
[ -n "$CTDB_RC_LOCAL" -a -x "$CTDB_RC_LOCAL" ] && {
    . "$CTDB_RC_LOCAL"
}

# The default site-local file, if present and executable.
[ -x "$CTDB_BASE/rc.local" ] && {
    . "$CTDB_BASE/rc.local"
}

# Every executable file in the drop-in directory, in glob order.
[ -d "$CTDB_BASE/rc.local.d" ] && {
    for i in "$CTDB_BASE"/rc.local.d/* ; do
        [ -x "$i" ] && . "$i"
    done
}
# Defaults for the script that sources this file.
script_name="${0##*/}"       # basename
service_fail_limit=1
event_name="$1"              # first positional parameter