ctdb-logging: New option CTDB_LOGGING, remove CTDB_LOGFILE, CTDB_SYSLOG
[Samba.git] / ctdb / config / functions
blob5c9497dc520a97ed69ba0660e611ddc778f57100
1 # Hey Emacs, this is a -*- shell-script -*- !!!
3 # utility functions for ctdb event scripts
# Pick a default state directory unless the caller already exported one.
# /var/lib/ctdb is preferred when it exists, otherwise /var/ctdb is used.
if [ -z "$CTDB_VARDIR" ] ; then
    CTDB_VARDIR="/var/ctdb"
    if [ -d "/var/lib/ctdb" ] ; then
        CTDB_VARDIR="/var/lib/ctdb"
    fi
    export CTDB_VARDIR
fi

# Root of the system configuration tree; defaults to /etc.
if [ -z "$CTDB_ETCDIR" ] ; then
    CTDB_ETCDIR="/etc"
    export CTDB_ETCDIR
fi
16 #######################################
17 # pull in a system config file, if any
_loadconfig() {
    # Source the configuration for the service named by $1.
    #
    # With no argument the name is taken from $service_config, falling
    # back to $service_name.  For any name other than "ctdb" the "ctdb"
    # configuration is loaded first.  The first existing file among
    # $CTDB_ETCDIR/sysconfig/NAME, $CTDB_ETCDIR/default/NAME and
    # $CTDB_BASE/sysconfig/NAME is sourced.  For "ctdb" itself,
    # $CTDB_BASE/ctdbd.conf is sourced afterwards if it is readable.

    if [ -z "$1" ] ; then
        _config_name="${service_config:-${service_name}}"
        if [ -n "$_config_name" ] ; then
            loadconfig "$_config_name"
            return
        fi
    fi

    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -z "$1" ] ; then
        return
    fi

    # Paths are quoted so directories containing whitespace still work.
    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi

    if [ "$1" = "ctdb" ] ; then
        _config="${CTDB_BASE}/ctdbd.conf"
        if [ -r "$_config" ] ; then
            . "$_config"
        fi
    fi
}
loadconfig ()
{
    # Public entry point: hands every argument to _loadconfig unchanged.
    _loadconfig "$@"
}
56 ##############################################################
58 # CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
59 # configuration file.
debug ()
{
    # Emit debug output, prefixed with "DEBUG: ", but only when
    # CTDB_SCRIPT_DEBUGLEVEL (default 2) is at least 4.
    if [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
        if [ -z "$1" ] ; then
            # No message given: relay stdin instead, which lets callers
            # feed a here document.  Skipped when stdin is a terminal.
            if ! tty -s ; then
                sed -e 's@^@DEBUG: @'
            fi
        else
            echo "DEBUG: $*"
        fi
    fi
}
die ()
{
    # Print message $1 and exit with status $2 (1 when not given).
    _rc="${2:-1}"
    _msg="$1"

    echo "$_msg"
    exit $_rc
}
83 # Log given message or stdin to either syslog or a CTDB log file
84 # $1 is the tag passed to logger if syslog is in use.
script_log ()
{
    # Log a message to the destination selected by $CTDB_LOGGING.
    #
    # $1 is the tag handed to logger; the remaining arguments form the
    # message.  When no message arguments are given the message is read
    # from stdin.
    #
    # CTDB_LOGGING empty     -> append to /var/log/log.ctdb
    # CTDB_LOGGING=file:PATH -> append to PATH
    # anything else          -> send via logger
    _tag="$1" ; shift

    case "$CTDB_LOGGING" in
        file:*|"")
            if [ -n "$CTDB_LOGGING" ] ; then
                _file="${CTDB_LOGGING#file:}"
            else
                _file="/var/log/log.ctdb"
            fi
            {
                if [ -n "$*" ] ; then
                    echo "$*"
                else
                    cat
                fi
            } >>"$_file"
            ;;
        *)
            # The message is passed as one quoted word so it is neither
            # word-split nor glob-expanded.  With no message, logger is
            # run without one so that it reads stdin.
            if [ -n "$*" ] ; then
                logger -t "ctdbd: ${_tag}" "$*"
            else
                logger -t "ctdbd: ${_tag}"
            fi
            ;;
    esac
}
110 # When things are run in the background in an eventscript then logging
111 # output might get lost.  This is the "solution".  :-)
background_with_logging ()
{
    # Run the given command in the background with stdin detached and
    # both output streams piped to script_log, tagged "<script_name>&".
    # Always returns 0 without waiting for the command.
    {
        "$@" </dev/null 2>&1 | script_log "${script_name}&"
    } &

    return 0
}
122 ##############################################################
123 # check number of args for different events
ctdb_check_args ()
{
    # Validate the argument count for the event named in $1.  The count
    # includes the event name itself.  Exits 1 on a mismatch; events
    # other than takeip, releaseip and updateip are not checked.
    if [ "$1" = "takeip" ] || [ "$1" = "releaseip" ] ; then
        if [ $# -ne 4 ] ; then
            echo "ERROR: must supply interface, IP and maskbits"
            exit 1
        fi
    elif [ "$1" = "updateip" ] ; then
        if [ $# -ne 5 ] ; then
            echo "ERROR: must supply old interface, new interface, IP and maskbits"
            exit 1
        fi
    fi
}
142 ##############################################################
143 # determine on what type of system (init style) we are running
detect_init_style()
{
    # Set CTDB_INIT_STYLE to "suse", "debian" or "redhat" depending on
    # which init helper binary is present.  A value that is already set
    # is left untouched.
    #
    # Returns 0 in every case.  The previous "[ -z ... ] || return"
    # form returned 1 when the style was preset, which looked like a
    # failure to callers.
    if [ -n "$CTDB_INIT_STYLE" ] ; then
        return 0
    fi

    if [ -x /sbin/startproc ]; then
        CTDB_INIT_STYLE="suse"
    elif [ -x /sbin/start-stop-daemon ]; then
        CTDB_INIT_STYLE="debian"
    else
        CTDB_INIT_STYLE="redhat"
    fi
}
158 ######################################################
159 # simulate /sbin/service on platforms that don't have it
160 # _service() makes it easier to hook the service() function for
161 # testing.
_service ()
{
  # Run operation $2 on service $1 using /sbin/service when it exists,
  # otherwise the init script under $CTDB_ETCDIR/init.d or
  # $CTDB_ETCDIR/rc.d/init.d.  $_nice is set by the callers (service,
  # nice_service) and is deliberately unquoted: it is either empty or
  # the single word "nice".
  _service_name="$1"
  _op="$2"

  # do nothing, when no service was specified
  [ -z "$_service_name" ] && return

  # Script paths are quoted so whitespace in them cannot split words.
  if [ -x /sbin/service ]; then
      $_nice /sbin/service "$_service_name" "$_op"
  elif [ -x "$CTDB_ETCDIR/init.d/$_service_name" ]; then
      $_nice "$CTDB_ETCDIR/init.d/$_service_name" "$_op"
  elif [ -x "$CTDB_ETCDIR/rc.d/init.d/$_service_name" ]; then
      $_nice "$CTDB_ETCDIR/rc.d/init.d/$_service_name" "$_op"
  fi
}
service()
{
    # Run a service operation at normal priority: clear the nice
    # prefix, then delegate to _service.
    _nice=
    _service "$@"
}
185 ######################################################
186 # simulate /sbin/service (niced) on platforms that don't have it
nice_service()
{
    # Same as service(), except the operation is run under "nice".
    _nice=nice
    _service "$@"
}
193 ######################################################
194 # wrapper around /proc/ settings to allow them to be hooked
195 # for testing
196 # 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
    # Write value $2 to /proc/$1 ($1 is a path relative to /proc/).
    # Kept as a function so that tests can replace it.
    echo "$2" \
        >"/proc/$1"
}
202 ######################################################
203 # wrapper around getting file contents from /proc/ to allow
204 # this to be hooked for testing
205 # 1st arg is relative path under /proc/
get_proc ()
{
    # Print the contents of /proc/$1 ($1 is a path relative to /proc/).
    # Kept as a function so that tests can replace it.
    cat \
        "/proc/$1"
}
211 ######################################################
212 # Check that an RPC service is healthy -
213 # this includes allowing a certain number of failures
214 # before marking the NFS service unhealthy.
216 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
218 # each triple is a set of 3 arguments: an operator, a 
219 # fail count limit and an action string.
221 # For example:
223 #       nfs_check_rpc_service "lockd" \
224 #           -ge 15 "verbose restart unhealthy" \
225 #           -eq 10 "restart:b"
227 # says that if lockd is down for 15 iterations then do
228 # a verbose restart of lockd and mark the node unhealthy.
229 # Before this, after 10 iterations of failure, the
230 # service is restarted silently in the background.
231 # Order is important: the failure limits need to be
232 # specified in descending order because processing stops
233 # after the first condition that is true.
234 ######################################################
nfs_check_rpc_service ()
{
    # $1 is the RPC program name; the remaining arguments are
    # (operator, limit, actions) triples.
    _prog_name="$1"
    shift

    # Nothing more to do when the common check reports success.
    _nfs_check_rpc_common "$_prog_name" && return

    # Walk the triples in order, stopping at the first one whose
    # action check reports success.
    until [ -z "$3" ] ; do
        _nfs_check_rpc_action "$1" "$2" "$3" && break
        shift 3
    done
}
251 # The new way of doing things...
nfs_check_rpc_services ()
{
    # Check every RPC service that has a file named NN.<prog>.check in
    # $CTDB_BASE/nfs-rpc-checks.d/.  Each non-comment line of a file is
    # "<operator> <limit> <actions...>" and is passed to
    # _nfs_check_rpc_action in file order until one reports success.

    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
        # With no matching file the shell leaves the pattern itself in
        # $_f.  Skip it instead of deriving a bogus program name from
        # it, which would end in an "Internal error" exit.
        [ -r "$_f" ] || continue

        _t="${_f%.check}"
        _prog_name="${_t##*/[0-9][0-9].}"

        if _nfs_check_rpc_common "$_prog_name" ; then
            # This RPC service is up, check next service...
            continue
        fi

        # Check each line in the file in turn until one of the limit
        # checks is hit...
        while read _cmp _lim _rest ; do
            # Skip comments
            case "$_cmp" in
                \#*) continue ;;
            esac

            if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
                # Limit was hit on this line, no further checking...
                break
            fi
        done <"$_f"
    done
}
_nfs_check_rpc_common ()
{
    # Check RPC program $1 and maintain its failure counter.
    #
    # Sets _rpc_prog, _version and _service_name ("nfs_<prog>") for use
    # by the caller.  Returns 0, after resetting the counter, when the
    # service answers; returns 1, after incrementing the counter, when
    # it does not.  An unknown program name is an internal error and
    # exits 1.
    _prog_name="$1"

    # Some platforms don't have separate programs for all services.
    # "command -v" is used rather than "which": it is specified by
    # POSIX, whereas a missing "which" would make this test fail and
    # silently report statd as healthy.
    case "$_prog_name" in
        statd)
            command -v "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
    esac

    case "$_prog_name" in
        nfsd)
            _rpc_prog=nfs
            _version=3
            ;;
        mountd)
            _rpc_prog=mountd
            _version=1
            ;;
        rquotad)
            _rpc_prog=rquotad
            _version=1
            ;;
        lockd)
            _rpc_prog=nlockmgr
            _version=4
            ;;
        statd)
            _rpc_prog=status
            _version=1
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
    esac

    _service_name="nfs_${_prog_name}"

    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
        ctdb_counter_init "$_service_name"
        return 0
    fi

    ctdb_counter_incr "$_service_name"

    return 1
}
_nfs_check_rpc_action ()
{
    # $1 = comparison operator, $2 = limit, $3 = space-separated list
    # of actions.  Returns 1 without acting when ctdb_check_counter
    # reports that the limit has not been hit; otherwise performs each
    # action in turn and returns 0.  The "unhealthy" action and any
    # unknown action exit 1.
    _cmp="$1"
    _limit="$2"
    _actions="$3"

    ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" && return 1

    # $_actions is intentionally unquoted: it is a word list.
    for _action in $_actions ; do
        if [ "$_action" = "verbose" ] ; then
            echo "$ctdb_check_rpc_out"
        elif [ "$_action" = "restart" ] ; then
            _nfs_restart_rpc_service "$_prog_name"
        elif [ "$_action" = "restart:b" ] ; then
            # Restart in the background.
            _nfs_restart_rpc_service "$_prog_name" true
        elif [ "$_action" = "unhealthy" ] ; then
            exit 1
        else
            echo "Internal error: unknown action \"$_action\"."
            exit 1
        fi
    done

    return 0
}
_nfs_restart_rpc_service ()
{
    # Restart RPC program $1.  When $2 is "true" the restart command is
    # run through background_with_logging instead of in the foreground.
    _prog_name="$1"
    _background="${2:-false}"

    _maybe_background=""
    if $_background ; then
        _maybe_background="background_with_logging"
    fi

    _p="rpc.${_prog_name}"

    case "$_prog_name" in
        nfsd)
            echo "Trying to restart NFS service"
            $_maybe_background startstop_nfs restart
            ;;
        lockd)
            echo "Trying to restart lock manager service"
            $_maybe_background startstop_nfslock restart
            ;;
        mountd|rquotad|statd)
            # These daemons are killed outright and started again
            # directly.  Port/hostname options are only passed when the
            # corresponding variable is set.
            echo "Trying to restart $_prog_name [${_p}]"
            killall -q -9 "$_p"
            case "$_prog_name" in
                mountd)
                    $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT
                    ;;
                rquotad)
                    $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
                    ;;
                statd)
                    $_maybe_background $_p \
                        ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
                        ${STATD_PORT:+-p} $STATD_PORT \
                        ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
                    ;;
            esac
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
    esac
}
407 ######################################################
408 # check that a rpc server is registered with portmap
409 # and responding to requests
410 # usage: ctdb_check_rpc SERVICE_NAME VERSION
411 ######################################################
ctdb_check_rpc ()
{
    # Query RPC program $1, version $2, with "rpcinfo -u" on
    # $CTDB_RPCINFO_LOCALHOST (default 127.0.0.1).
    #
    # The rpcinfo output is kept in $ctdb_check_rpc_out.  On failure it
    # is prefixed with an "ERROR:" line, printed, and 1 is returned.
    progname="$1"
    version="$2"

    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

    # Arguments are quoted so that each one reaches rpcinfo as a single
    # word.
    if ! ctdb_check_rpc_out=$(rpcinfo -u "$_localhost" "$progname" "$version" 2>&1) ; then
        ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
        echo "$ctdb_check_rpc_out"
        return 1
    fi
}
427 ######################################################
428 # Ensure $service_name is set
assert_service_name ()
{
    # Abort via die() when $service_name is empty or unset.
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
434 ######################################################
435 # check a set of directories is available
436 # return 1 on a missing directory
437 # directories are read from stdin
438 ######################################################
ctdb_check_directories_probe()
{
    # Read directory names from stdin, one per line, and return 1 at
    # the first one that is not a directory.  The offending name is
    # left in $d, which ctdb_check_directories uses in its message.
    while IFS="" read d ; do
        # Names containing '%' are not checked.
        # NOTE(review): presumably these are unexpanded substitution
        # variables - confirm against the callers.
        if [ "${d#*%}" != "$d" ] ; then
            continue
        fi

        if [ ! -d "${d}/." ] ; then
            return 1
        fi
    done
}
452 ######################################################
453 # check a set of directories is available
454 # directories are read from stdin
455 ######################################################
ctdb_check_directories()
{
    # Check the directories listed on stdin; on the first failure print
    # an error naming that directory ($d, as left by the probe) and
    # exit 1.
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
464 ######################################################
465 # check a set of tcp ports
466 # usage: ctdb_check_tcp_ports <ports...>
467 ######################################################
469 # This flag file is created when a service is initially started.  It
470 # is deleted the first time TCP port checks for that service succeed.
471 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
472 # message if a port check fails.
_ctdb_check_tcp_common ()
{
    # Require $service_name, then compute the path of this service's
    # "started" flag file under $ctdb_fail_dir.
    assert_service_name
    _ctdb_service_started_file="${ctdb_fail_dir}/${service_name}.started"
}
ctdb_check_tcp_init ()
{
    # Create the "started" flag file for the current service, creating
    # its parent directory first if needed.
    _ctdb_check_tcp_common

    # ${var%/*} strips the last path component, i.e. acts as dirname.
    mkdir -p "${_ctdb_service_started_file%/*}"
    touch "$_ctdb_service_started_file"
}
486 # Check whether something is listening on all of the given TCP ports
487 # using the "ctdb checktcpport" command.
ctdb_check_tcp_ports()
{
    # Check each port given as an argument with "ctdb checktcpport".
    #
    # Exit status of "ctdb checktcpport" as handled here:
    #   98    - could not bind, so something is listening: port is OK
    #   0     - bind succeeded, so nothing is listening: return 1
    #   other - unexpected failure: return that status
    #
    # Returns 0, and removes the service's "started" flag file, when
    # every port has a listener.  Calling with no ports is an internal
    # error and exits 1.
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                _ctdb_check_tcp_common
                # While the flag file still exists the service has not
                # yet passed a port check, so only report at INFO level.
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug "\"ctdb checktcpport $_p\" was able to bind to port"
                else
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                echo "ERROR: unexpected error running \"ctdb checktcpport\""
                # The here-document previously ended in a stray double
                # quote, which was emitted as part of the debug output.
                debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out
EOF
                return $_ret
        esac
    done

    # All ports listening
    _ctdb_check_tcp_common
    rm -f "$_ctdb_service_started_file"
    return 0
}
531 ######################################################
532 # check a unix socket
533 # usage: ctdb_check_unix_socket SERVICE_NAME <socket_path>
534 ######################################################
ctdb_check_unix_socket()
{
    # Return 0 when netstat shows a listening unix socket whose line
    # ends with path $1; otherwise print an error and return 1.  An
    # empty path is not checked and returns 0.
    socket_path="$1"
    if [ -z "$socket_path" ] ; then
        return 0
    fi

    netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" && return 0

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
545 ######################################################
546 # check a command returns zero status
547 # usage: ctdb_check_command <command>
548 ######################################################
ctdb_check_command ()
{
    # Run the given command, capturing stdout and stderr.  If it fails,
    # report the failure, pass the captured output to debug() and
    # exit 1.
    if ! _out=$("$@" 2>&1) ; then
        echo "ERROR: $* returned error"
        echo "$_out" | debug
        exit 1
    fi
}
558 ################################################
559 # kill off any TCP connections with the given IP
560 ################################################
kill_tcp_connections ()
{
    # Kill the TCP connections that get_tcp_connections_for_ip reports
    # for IP $1, using "ctdb killtcp".  When $2 is "oneway", or when
    # the port of the first listed endpoint is 139 or 445, only one
    # direction is sent; otherwise both directions are sent.
    # Afterwards the connection list is polled once per second, up to
    # 3 retries, until it is empty.
    _ip="$1"

    _oneway=false
    [ "$2" != "oneway" ] || _oneway=true

    # Everything after the pipe runs in a subshell, so the variables
    # below do not leak and "return" only leaves that subshell.
    get_tcp_connections_for_ip "$_ip" | {
        _total=0
        _list=""
        _newline="
"
        while read _dst _src; do
            _port="${_dst##*:}"
            _single=$_oneway
            # we only do one-way killtcp for CIFS
            if [ "$_port" = "139" ] || [ "$_port" = "445" ] ; then
                _single=true
            fi

            echo "Killing TCP connection $_src $_dst"
            _list="${_list}${_newline}${_src} ${_dst}"
            $_single || _list="${_list}${_newline}${_dst} ${_src}"

            _total=$(($_total + 1))
        done

        [ $_total -ne 0 ] || return

        if ! echo "$_list" | ctdb killtcp ; then
            echo "Failed to send killtcp control"
            return
        fi

        _tries=0
        while : ; do
            _left=$(get_tcp_connections_for_ip $_ip | wc -l)

            if [ $_left -eq 0 ] ; then
                echo "Killed $_total TCP connections to released IP $_ip"
                return
            fi

            _tries=$(($_tries + 1))
            if [ $_tries -gt 3 ] ; then
                echo "Timed out killing tcp connections for IP $_ip ($_left remaining)"
                return
            fi

            echo "Waiting for $_left connections to be killed for IP $_ip"
            sleep 1
        done
    }
}
622 ##################################################################
623 # kill off the local end for any TCP connections with the given IP
624 ##################################################################
kill_tcp_connections_local_only ()
{
    # Shorthand for kill_tcp_connections in "oneway" mode for IP $1.
    kill_tcp_connections \
        "$1" "oneway"
}
630 ##################################################################
631 # tickle any TCP connections with the given IP
632 ##################################################################
tickle_tcp_connections ()
{
    # Send "ctdb tickle" in both directions for every connection that
    # get_tcp_connections_for_ip reports for IP $1.  A single summary
    # line is printed if any tickle failed.
    _ip="$1"

    # The braces run in a subshell (right-hand side of a pipe), so the
    # loop variables stay private to it.
    get_tcp_connections_for_ip "$_ip" |
    {
        _failed=false

        while read _first _second; do
            echo "Tickle TCP connection $_second $_first"
            if ! ctdb tickle $_second $_first >/dev/null 2>&1 ; then
                _failed=true
            fi
            echo "Tickle TCP connection $_first $_second"
            if ! ctdb tickle $_first $_second >/dev/null 2>&1 ; then
                _failed=true
            fi
        done

        if $_failed ; then
            echo "Failed to send tickle control"
        fi
    }
}
get_tcp_connections_for_ip ()
{
    # Print columns 4 and 5 of "netstat -tn" for every ESTABLISHED tcp
    # line whose 4th column starts with "<ip>:" or "::ffff:<ip>:",
    # where <ip> is $1.  One "col4 col5" pair is printed per line.
    _ip="$1"

    # The -v assignment is quoted so the address always reaches awk as
    # a single argument.
    netstat -tn | awk -v "ip=$_ip" \
        'index($1, "tcp") == 1 &&
         (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) &&
         $6 == "ESTABLISHED" \
         {print $4" "$5}'
}
665 ########################################################
666 # start/stop the Ganesha nfs service
667 ########################################################
startstop_ganesha()
{
    # Start, stop or restart the service named
    # "nfs-ganesha-<CTDB_CLUSTER_FILESYSTEM_TYPE>".  Any other
    # operation in $1 is ignored.
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop|restart)
            service "$_service_name" "$1"
            ;;
    esac
}
684 ########################################################
685 # start/stop the nfs service on different platforms
686 ########################################################
startstop_nfs() {
        # Start, stop or restart the NFS server ($1 is the operation).
        #
        # The platform is "rhel" when $CTDB_ETCDIR/init.d/nfslock is
        # executable or the nfs-lock systemd unit is readable,
        # otherwise "sles" when $CTDB_ETCDIR/init.d/nfsserver is
        # executable.  Any other platform is unsupported and exits 1.
        #
        # Paths are quoted and the obsolescent "-o" test operator is
        # replaced by "||", so an ETCDIR containing whitespace no
        # longer makes detection fail with a test syntax error.
        PLATFORM="unknown"
        if [ -x "$CTDB_ETCDIR/init.d/nfslock" ] || \
            [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
                PLATFORM="rhel"
        elif [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] ; then
                PLATFORM="sles"
        fi

        case "$PLATFORM" in
        sles)
                case "$1" in
                start)
                        service nfsserver start
                        ;;
                stop)
                        service nfsserver stop > /dev/null 2>&1
                        ;;
                restart)
                        # Set the nfsd thread count to 0, stop the
                        # service, kill any nfsd left over and dump
                        # some thread stacks before starting again.
                        set_proc "fs/nfsd/threads" 0
                        service nfsserver stop > /dev/null 2>&1
                        pkill -9 nfsd
                        nfs_dump_some_threads
                        service nfsserver start
                        ;;
                esac
                ;;
        rhel)
                case "$1" in
                start)
                        service nfslock start
                        service nfs start
                        ;;
                stop)
                        service nfs stop
                        service nfslock stop
                        ;;
                restart)
                        set_proc "fs/nfsd/threads" 0
                        service nfs stop > /dev/null 2>&1
                        service nfslock stop > /dev/null 2>&1
                        pkill -9 nfsd
                        nfs_dump_some_threads
                        service nfslock start
                        service nfs start
                        ;;
                esac
                ;;
        *)
                echo "Unknown platform. NFS is not supported with ctdb"
                exit 1
                ;;
        esac
}
743 # Dump up to the configured number of nfsd thread backtraces.
nfs_dump_some_threads ()
{
    # Print stack traces, read via get_proc from /proc/<pid>/stack, for
    # at most $CTDB_NFS_DUMP_STUCK_THREADS nfsd threads (default 5).
    # A limit of 0 or less disables dumping.
    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || CTDB_NFS_DUMP_STUCK_THREADS=5

    # Optimisation to avoid running an unnecessary pidof
    [ "$CTDB_NFS_DUMP_STUCK_THREADS" -gt 0 ] || return 0

    _count=0
    for _pid in $(pidof nfsd) ; do
        # Strictly less-than: _count is the number of stacks already
        # printed, so "-le" would print one more than the limit.
        [ "$_count" -lt "$CTDB_NFS_DUMP_STUCK_THREADS" ] || break

        # Do this first to avoid racing with thread exit
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        if [ -n "$_stack" ] ; then
            echo "Stack trace for stuck nfsd thread [${_pid}]:"
            echo "$_stack"
            _count=$(($_count + 1))
        fi
    done
}
765 ########################################################
766 # start/stop the nfs lockmanager service on different platforms
767 ########################################################
startstop_nfslock() {
        # Start, stop or restart the NFS lock manager ($1 is the
        # operation).
        #
        # The platform is "rhel" when $CTDB_ETCDIR/init.d/nfslock is
        # executable or the nfs-lock systemd unit is readable,
        # otherwise "sles" when $CTDB_ETCDIR/init.d/nfsserver is
        # executable.  Any other platform is unsupported and exits 1.
        #
        # Paths are quoted and the obsolescent "-o" test operator is
        # replaced by "||", so an ETCDIR containing whitespace no
        # longer makes detection fail with a test syntax error.
        PLATFORM="unknown"
        if [ -x "$CTDB_ETCDIR/init.d/nfslock" ] || \
            [ -r /usr/lib/systemd/system/nfs-lock.service ] ; then
                PLATFORM="rhel"
        elif [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] ; then
                PLATFORM="sles"
        fi

        case "$PLATFORM" in
        sles)
                # for sles there is no service for lockmanager
                # so we instead just shutdown/restart nfs
                case "$1" in
                start)
                        service nfsserver start
                        ;;
                stop)
                        service nfsserver stop > /dev/null 2>&1
                        ;;
                restart)
                        service nfsserver stop > /dev/null 2>&1
                        service nfsserver start
                        ;;
                esac
                ;;
        rhel)
                case "$1" in
                start)
                        service nfslock start
                        ;;
                stop)
                        service nfslock stop > /dev/null 2>&1
                        ;;
                restart)
                        service nfslock stop > /dev/null 2>&1
                        service nfslock start
                        ;;
                esac
                ;;
        *)
                echo "Unknown platform. NFS locking is not supported with ctdb"
                exit 1
                ;;
        esac
}
816 # Periodically update the statd database
nfs_statd_update ()
{
    # Run "statd-callout updatelocal" and "statd-callout updateremote"
    # in the background, at most once every $1 seconds.  The time of
    # the last run is the mtime of a trigger file in
    # $service_state_dir; the file is created on first use.
    _update_period="$1"

    _statd_update_trigger="$service_state_dir/update-trigger"
    if [ ! -f "$_statd_update_trigger" ] ; then
        touch "$_statd_update_trigger"
    fi

    _current_time=$(date +"%s")
    _last_update=$(stat -c "%Y" "$_statd_update_trigger")
    _elapsed=$(( $_current_time - $_last_update))

    if [ $_elapsed -ge $_update_period ] ; then
        touch "$_statd_update_trigger"
        $CTDB_BASE/statd-callout updatelocal &
        $CTDB_BASE/statd-callout updateremote &
    fi
}
833 ########################################################
add_ip_to_iface ()
{
    # Add address $2 with prefix length $3 to interface $1.  The
    # interface is brought up first; failure to do so is fatal (die).
    # Returns 1, after printing a message, when the address cannot be
    # added.
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Ensure interface is up
    if ! ip link set "$_iface" up ; then
        die "Failed to bringup interface $_iface"
    fi

    if ! ip addr add "$_ip/$_maskbits" brd + dev "$_iface" ; then
        echo "Failed to add $_ip/$_maskbits on dev $_iface"
        return 1
    fi
}
delete_ip_from_iface()
{
    # Remove address $2 with prefix length $3 from interface $1.
    # Returns 1, after printing a message, when the removal fails.
    _iface=$1
    _ip=$2
    _maskbits=$3

    # promote_secondaries could be set globally for all interfaces, but
    # to avoid surprises it is limited to the interfaces where CTDB has
    # public IP addresses.  There isn't anywhere else convenient to do
    # this, so it is set on every call.  This is much cheaper than
    # remembering and re-adding secondaries.
    set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

    if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
        echo "Failed to del $_ip on dev $_iface"
        return 1
    fi
}
870 # If the given IP is hosted then print 2 items: maskbits and iface 
ip_maskbits_iface ()
{
    # For address $1, print "<maskbits> <iface>" taken from each
    # "inet" line of "ip addr show to <addr>/32": the prefix length
    # after the last "/" of the 2nd field, and the last field.
    # Prints nothing when no such line is found.
    _addr="$1"

    # POSIX sub() on a copy of the field replaces gensub(), which is a
    # gawk extension and not available in other awk implementations.
    ip addr show to "${_addr}/32" 2>/dev/null | \
        awk '$1 == "inet" { bits = $2; sub(/.*\//, "", bits); print bits, $NF }'
}
drop_ip ()
{
    # Remove address $1 (any "/maskbits" suffix is ignored) from the
    # interface that ip_maskbits_iface reports for it.  Does nothing
    # when nothing is reported.
    _addr="${1%/*}"  # Remove optional maskbits

    # Unquoted on purpose: the output is split into $1 (maskbits) and
    # $2 (interface).
    set -- $(ip_maskbits_iface $_addr)
    [ -n "$1" ] || return 0

    _maskbits="$1"
    _iface="$2"
    echo "Removing public address $_addr/$_maskbits from device $_iface"
    delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
}
drop_all_public_ips ()
{
    # Call drop_ip for the first field of every line in the file named
    # by $CTDB_PUBLIC_ADDRESSES.  When that variable is unset or empty,
    # /dev/null is read and nothing happens.
    while read _ip _x
    do
        drop_ip "$_ip"
    done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
}
899 ########################################################
900 # Simple counters
_ctdb_counter_common ()
{
    # Work out which counter is meant and make sure its directory
    # exists.  The counter name is $1, else $service_name, else
    # $script_name.  Sets _service_name and _counter_file.
    _service_name="${1:-${service_name:-${script_name}}}"
    _counter_file="${ctdb_fail_dir}/${_service_name}"

    # ${var%/*} strips the last path component, i.e. acts as dirname.
    mkdir -p "${_counter_file%/*}"
}
ctdb_counter_init ()
{
    # Reset the counter named by $1 (see _ctdb_counter_common for the
    # defaults) by truncating its file to zero length.
    _ctdb_counter_common "$1"

    : >"$_counter_file"
}
ctdb_counter_incr ()
{
    # Increment the counter named by $1 (see _ctdb_counter_common for
    # the defaults).
    _ctdb_counter_common "$1"

    # unary counting!  The counter value is the size of the file, so
    # exactly one byte is appended.  printf is used because the
    # behaviour of "echo -n" is implementation-defined under sh.
    printf '1' >> "$_counter_file"
}
ctdb_check_counter ()
{
    # Compare a failure counter against a limit.
    #
    # $1 - "error" (default): print a message and exit 1 when the limit
    #      is hit; any other value: stay silent and return 1 instead
    # $2 - integer operator understood by test (default -ge), or "%"
    #      to hit whenever the count is a multiple of the limit
    # $3 - the limit (default $service_fail_limit)
    # $4 - counter name (see _ctdb_counter_common for the defaults)
    #
    # Returns 0 when the limit is not hit.
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"

    # Drop the three arguments consumed above.  "shift 3" with fewer
    # than three arguments is an error: some shells abort the script,
    # others leave $1 in place so that the message type would be used
    # as the counter name.
    if [ $# -gt 3 ] ; then
        shift 3
    else
        set --
    fi
    _ctdb_counter_common "$1"

    # unary counting!  The counter value is the size of the file; a
    # file that cannot be read counts as 0.
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    _hit=false
    if [ "$_op" != "%" ] ; then
        if [ "$_size" "$_op" "$_limit" ] ; then
            _hit=true
        fi
    else
        if [ $(($_size $_op $_limit)) -eq 0 ] ; then
            _hit=true
        fi
    fi
    if $_hit ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
946 ########################################################
# Per-script status files (see ctdb_checkstatus/ctdb_setstatus).
ctdb_status_dir="${CTDB_VARDIR}/state/service_status"
# Failure counters (see _ctdb_counter_common) and "started" flag files.
ctdb_fail_dir="${CTDB_VARDIR}/state/failcount"
ctdb_setup_service_state_dir ()
{
    # Set service_state_dir to the state directory of service $1
    # (default: $service_name) and create it.  Exits 1 when it cannot
    # be created.
    service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
960 ########################################################
961 # Managed status history, for auto-start/stop
# Holds one flag file per service marked as managed.
ctdb_managed_dir="${CTDB_VARDIR}/state/managed_history"
_ctdb_managed_common ()
{
    # Compute the path of the "managed" flag file for $service_name.
    _ctdb_managed_file="${ctdb_managed_dir}/${service_name}"
}
ctdb_service_managed ()
{
    # Record that the current service is managed by creating its flag
    # file, creating the directory first if needed.
    mkdir -p "$ctdb_managed_dir"
    _ctdb_managed_common
    touch "$_ctdb_managed_file"
}
ctdb_service_unmanaged ()
{
    # Record that the current service is no longer managed by removing
    # its flag file.  A missing file is not an error.
    _ctdb_managed_common

    rm -f "$_ctdb_managed_file"
}
is_ctdb_previously_managed_service ()
{
    # Succeeds when the "managed" flag file of the current service
    # exists as a regular file.
    _ctdb_managed_common
    test -f "$_ctdb_managed_file"
}
989 ########################################################
990 # Check and set status
log_status_cat ()
{
    # Print a one-line report: $1 is the node state and $2 is a file
    # whose contents describe the problem.  The file name is quoted so
    # that a path containing whitespace is read as a single file.
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}
ctdb_checkstatus ()
{
    # Report the status recorded for $script_name under
    # $ctdb_status_dir:
    #   1 - an "unhealthy" file is readable (checked first)
    #   2 - a "banned" file is readable
    #   0 - neither
    # The file contents are logged in the first two cases.
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    fi

    if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    fi

    return 0
}
ctdb_setstatus ()
{
    # Record or clear the status of $script_name.  For "unhealthy" or
    # "banned" in $1, the contents of file $2 are copied into a status
    # file of that name.  Any other value removes both status files.
    d="$ctdb_status_dir/$script_name"

    if [ "$1" = "unhealthy" ] || [ "$1" = "banned" ] ; then
        mkdir -p "$d"
        cat "$2" >"$d/$1"
    else
        for i in "banned" "unhealthy" ; do
            rm -f "$d/$i"
        done
    fi
}
1026 ##################################################################
1027 # Reconfigure a service on demand
_ctdb_service_reconfigure_common ()
{
    # Ensure the status directory for $service_name exists.  Sets _d to
    # that directory and _ctdb_service_reconfigure_flag to the
    # "reconfigure" flag file inside it.
    _d="${ctdb_status_dir}/${service_name}"
    mkdir -p "$_d"
    _ctdb_service_reconfigure_flag="${_d}/reconfigure"
}
ctdb_service_needs_reconfigure ()
{
    # Succeeds when the "reconfigure" flag of the current service
    # exists.
    _ctdb_service_reconfigure_common
    test -e "$_ctdb_service_reconfigure_flag"
}
ctdb_service_set_reconfigure ()
{
    # Raise the "reconfigure" flag of the current service by creating
    # (or truncating) its flag file.
    _ctdb_service_reconfigure_common
    : >"$_ctdb_service_reconfigure_flag"
}
ctdb_service_unset_reconfigure ()
{
    # Clear the "reconfigure" flag of the current service.  A missing
    # flag file is not an error.
    _ctdb_service_reconfigure_common

    rm -f "$_ctdb_service_reconfigure_flag"
}
ctdb_service_reconfigure ()
{
    # Reconfigure the current service: announce it, clear the
    # "reconfigure" flag and run service_reconfigure.  The failure
    # counter is reset only when that hook succeeds; otherwise the
    # hook's exit status is returned.
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure
    if service_reconfigure ; then
        ctdb_counter_init
    else
        return $?
    fi
}
1062 # Default service_reconfigure() function does nothing.
service_reconfigure ()
{
    # Default hook: does nothing and succeeds.
    return 0
}
ctdb_reconfigure_take_lock ()
{
    # Try to take the reconfigure lock of the current service.
    #
    # The lock file holds an event name on its first line followed by
    # one PID per line.  The lock counts as held when any listed PID,
    # other than this shell's own, is still alive; in that case 1 is
    # returned.  Otherwise $event_name and this shell's PID are
    # written to the file and 0 is returned.
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    # The subshell reads the lock file on stdin; "flock 0" locks that
    # descriptor so the check and the update are not interleaved with
    # another caller.  Variables set inside stay in the subshell.
    (
        flock 0
        # This is overkill but will work if we need to extend this to
        # allow certain events to run multiple times in parallel
        # (e.g. takeip) and write multiple PIDs to the file.
        read _holder_event
        if [ -n "$_holder_event" ] ; then
            while read _holder_pid ; do
                [ -n "$_holder_pid" -a "$_holder_pid" != $$ ] || continue
                if kill -0 "$_holder_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
# Release the per-service reconfigure lock by removing the lock file.
# It is not an error if the lock file does not exist.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"

    rm -f "$_lock"
}
# Replay the previous monitor result of the current script.
#
# Looks up this script's line in "ctdb scriptstatus -Y", prints its
# saved error output (if any) and exits with its saved return code.
# This function never returns: it always exits the calling shell.
#
# Globals read: script_name
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading colon (':') is missing in some versions...
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
    # Output looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # Peel the return code and status off with parameter expansion so
    # that the script's output is never word-split or glob-expanded.
    _rest=${_out#*monitor:"${script_name}":}
    _code="${_rest%%:*}"
    _rest="${_rest#*:}"
    _status="${_rest%%:*}"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading ':') and fixed output.
    _out="${_out%:}"
    _err_out=${_out#*monitor:"${script_name}":*:*:*:*:}
    case "$_status" in
        OK) : ;;  # Do nothing special.
        TIMEDOUT)
            # Recast this as an error, since we can't exit with the
            # correct negative number.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Recast this as an OK, since we can't exit with the
            # correct negative number.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *) : ;;  # Must be ERROR, do nothing special.
    esac
    if [ -n "$_err_out" ] ; then
        echo "$_err_out"
    fi
    # No recorded status means no code: treat that as success.
    exit "${_code:-0}"
}
# Run, defer or avoid colliding with a service reconfigure, depending
# on the current event.
#
# Only the "monitor", "ipreallocated" and "reconfigure" events are
# handled; for any other event this returns 0 straight away.
#
# With the reconfigure lock taken:
#   reconfigure   - reconfigure in a subshell, release the lock and
#                   exit with the reconfigure status
#   ipreallocated - reconfigure if one is scheduled, then return
#   monitor       - nothing to do, return
# Without the lock:
#   reconfigure   - exit 2 so that the caller retries
#   ipreallocated - return 0, leaving any scheduled reconfigure pending
#   monitor       - replay the previous monitor status (which exits)
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so proceed
        # freely.
        case "$event_name" in
            reconfigure)
                (ctdb_service_reconfigure)
                _rc=$?
                # Exiting here skips the release below, so drop the
                # lock first rather than leaving a stale lock file.
                ctdb_reconfigure_release_lock
                exit $_rc
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                # This is true in the current code.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigure.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
##################################################################
# Does CTDB manage this service? - and associated auto-start/stop

# Backward-compatibility helper: append service $2 to
# $CTDB_MANAGED_SERVICES when the legacy setting $1 is "yes" and $2 is
# the current $service_name.
#
# $1 - value of a legacy CTDB_MANAGES_* variable
# $2 - service name that the variable stands for
ctdb_compat_managed_service ()
{
    if [ "$1" = "yes" ] && [ "$2" = "$service_name" ] ; then
        CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
    fi
}
# Succeed (return 0) if the current $service_name is listed in
# $CTDB_MANAGED_SERVICES, either directly or after folding in the
# legacy CTDB_MANAGES_* settings; otherwise return 1.
#
# Side effect: may append $service_name to $CTDB_MANAGED_SERVICES.
is_ctdb_managed_service ()
{
    assert_service_name

    # $t is used just for readability and to allow more accurate
    # matching via leading/trailing spaces
    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "<space>$service_name<space>" appears in $t.  The
    # quoted pattern makes the service name match literally.
    case "$t" in
        *" ${service_name} "*) return 0 ;;
    esac

    # If above didn't match then update $CTDB_MANAGED_SERVICES for
    # backward compatibility and try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "<space>$service_name<space>" appears in $t
    case "$t" in
        *" ${service_name} "*) return 0 ;;
    esac
    return 1
}
# Abort via die() unless the manual pseudo-event named in $event_name
# (service-start/service-stop) is allowed for the current service.
_ctdb_service_pseudo_event_check ()
{
    if is_ctdb_managed_service ; then
        die "${event_name} event not permitted when service is managed"
    fi
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
        die "${event_name} event not permitted with \$CTDB_SERVICE_AUTOSTARTSTOP = yes"
    fi
}

# Start or stop the current service when appropriate.
#
# - "service-start"/"service-stop" pseudo-events start/stop the
#   service by hand and exit with the resulting status.  They are
#   refused if the service is managed or if
#   $CTDB_SERVICE_AUTOSTARTSTOP is "yes".
# - With $CTDB_SERVICE_AUTOSTARTSTOP = "yes", a "monitor" event starts
#   a service that has become managed, or stops one that is no longer
#   managed, in the background and then exits.
# - In every other case this returns 0 without doing anything.
ctdb_start_stop_service ()
{
    assert_service_name

    # Allow service-start/service-stop pseudo-events to start/stop
    # services when we're not auto-starting/stopping and we're not
    # monitoring.
    case "$event_name" in
        service-start)
            _ctdb_service_pseudo_event_check
            ctdb_service_start
            exit $?
            ;;
        service-stop)
            _ctdb_service_pseudo_event_check
            ctdb_service_stop
            exit $?
            ;;
    esac

    # Do nothing unless configured to...
    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0

    [ "$event_name" = "monitor" ] || return 0

    if is_ctdb_managed_service ; then
        if ! is_ctdb_previously_managed_service ; then
            echo "Starting service \"$service_name\" - now managed"
            background_with_logging ctdb_service_start
            exit $?
        fi
    else
        if is_ctdb_previously_managed_service ; then
            echo "Stopping service \"$service_name\" - no longer managed"
            background_with_logging ctdb_service_stop
            exit $?
        fi
    fi
}
# Start the current service.
#
# Marks the service as managed, then calls service_start().  If that
# fails its status is returned immediately; otherwise
# ctdb_counter_init and ctdb_check_tcp_init (both defined elsewhere)
# are run.
ctdb_service_start ()
{
    # The service is marked managed if we've ever tried to start it.
    ctdb_service_managed

    service_start || return $?

    ctdb_counter_init
    ctdb_check_tcp_init
}
# Stop the current service: mark it unmanaged, then call
# service_stop() and return its status.
ctdb_service_stop ()
{
    ctdb_service_unmanaged
    service_stop
}
# Default service_start() and service_stop() functions.
#
# These may be overridden in an eventscript.

# Start $service_name via service() and return its status.
service_start ()
{
    service "$service_name" start
}
# Default service_stop(): stop $service_name via service() and return
# its status.
service_stop ()
{
    service "$service_name" stop
}
##################################################################

# Handle the pseudo-events common to all event scripts.
#
# $1 - event name; any remaining arguments belong to the event
#
#   status    - run ctdb_checkstatus and exit with its status
#   setstatus - run ctdb_setstatus with the remaining arguments and
#               exit with its status
# Any other event is left to the caller: the function returns 0.
ctdb_standard_event_handler ()
{
    case "$1" in
        status)
            ctdb_checkstatus
            exit $?
            ;;
        setstatus)
            shift
            ctdb_setstatus "$@"
            exit $?
            ;;
    esac
}
# iptables doesn't like being re-entered, so flock-wrap it.
#
# Runs /sbin/iptables with the given arguments while holding
# $CTDB_VARDIR/iptables-ctdb.flock, waiting up to 30 seconds for it.
iptables()
{
        flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
# AIX (and perhaps others?) doesn't have mktemp
#
# Minimal fallback, defined only when no mktemp command is found.
# Usage: mktemp [-d]
# Creates a file (or, with -d, a directory) named tmp.<10 hex digits>
# under ${TMPDIR:-/tmp} with umask 077 and prints its path.  Returns
# non-zero, printing nothing, if it cannot be created.
if ! command -v mktemp >/dev/null 2>&1 ; then
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        # Private name so the shared $_d is not clobbered
        _mktemp_dir="${TMPDIR:-/tmp}"
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_mktemp_dir}/tmp.${_hex10}"
        (
            umask 077
            if $_dir ; then
                mkdir "$_t"
            else
                # noclobber: refuse to reuse a file that already exists
                set -C
                : >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
########################################################
# tickle handling
########################################################

# Bring the tickle list for a port into line with the established
# TCP connections to the public IPs held by this node.
#
# $1 - TCP port number
#
# Connections that have no tickle yet are added with "ctdb addtickle";
# tickles whose connection has gone are removed with "ctdb deltickle".
# Scratch files are kept under $CTDB_VARDIR/state/tickles and removed
# at the end.
update_tickles ()
{
        _port="$1"

        tickledir="$CTDB_VARDIR/state/tickles"
        mkdir -p "$tickledir"

        # Who am I?
        _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

        # What public IPs do I hold?
        _ips=$(ctdb -Y ip | awk -F: -v pnn="$_pnn" '$3 == pnn {print $2}')

        # IPs as a regexp choice.  $_ips is deliberately unquoted so
        # that the list collapses onto a single line.
        _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

        # Record connections to our public IPs in a temporary file
        _my_connections="${tickledir}/${_port}.connections"
        rm -f "$_my_connections"
        netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
          '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

        # Record our current tickles in a temporary file
        _my_tickles="${tickledir}/${_port}.tickles"
        rm -f "$_my_tickles"
        for _i in $_ips ; do
                ctdb -Y gettickles "$_i" "$_port" |
                awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
        done |
        sort >"$_my_tickles"

        # Add tickles for connections that we haven't already got tickles for
        comm -23 "$_my_connections" "$_my_tickles" |
        while read -r _src _dst ; do
                ctdb addtickle "$_src" "$_dst"
        done

        # Remove tickles for connections that are no longer there
        comm -13 "$_my_connections" "$_my_tickles" |
        while read -r _src _dst ; do
                ctdb deltickle "$_src" "$_dst"
        done

        rm -f "$_my_connections" "$_my_tickles"
}
########################################################
# load a site local config file
########################################################

# An explicitly configured site-local file takes part first...
if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
        . "$CTDB_RC_LOCAL"
fi

# ...followed by $CTDB_BASE/rc.local...
if [ -x "$CTDB_BASE/rc.local" ] ; then
        . "$CTDB_BASE/rc.local"
fi

# ...and then every executable file in $CTDB_BASE/rc.local.d.
if [ -d "$CTDB_BASE/rc.local.d" ] ; then
        for i in "$CTDB_BASE"/rc.local.d/* ; do
                [ -x "$i" ] && . "$i"
        done
fi

# Globals describing the script that sourced this file
script_name="${0##*/}"       # basename
service_fail_limit=1
event_name="$1"