s4:lib/socket: simplify iface_list_wildcard() and its callers
[Samba.git] / ctdb / config / functions
blob6efe60f0d3434201b46fd788c4ffcb3bcb4617c7
1 # Hey Emacs, this is a -*- shell-script -*- !!!
3 # utility functions for ctdb event scripts
# Default the CTDB state directory when CTDB_VARDIR is unset or empty:
# prefer /var/lib/ctdb if that directory exists, otherwise use /var/ctdb.
if [ -z "$CTDB_VARDIR" ] ; then
    if [ -d "/var/lib/ctdb" ] ; then
        CTDB_VARDIR="/var/lib/ctdb"
    else
        CTDB_VARDIR="/var/ctdb"
    fi
    export CTDB_VARDIR
fi

# Default the configuration root to /etc when CTDB_ETCDIR is unset or empty.
if [ -z "$CTDB_ETCDIR" ] ; then
    CTDB_ETCDIR="/etc"
    export CTDB_ETCDIR
fi
#######################################
# pull in a system config file, if any
#
# usage: _loadconfig [NAME]
#
# With no NAME, the name defaults to $service_config, falling back to
# $service_name; if neither is set nothing is loaded beyond "ctdb".
# The "ctdb" configuration is always loaded before any other NAME.
# For a given NAME the first existing file of
#   $CTDB_ETCDIR/sysconfig/NAME, $CTDB_ETCDIR/default/NAME,
#   $CTDB_BASE/sysconfig/NAME
# is sourced.  For NAME "ctdb", $CTDB_BASE/ctdbd.conf is sourced as well
# when readable.
_loadconfig() {

    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
            return
        fi
    fi

    # Make sure the base ctdb configuration is always pulled in first.
    if [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    if [ -z "$1" ] ; then
        return
    fi

    # Paths are quoted so that directories containing whitespace or glob
    # characters neither break the test nor source the wrong file.
    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ]; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ]; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ]; then
        . "$CTDB_BASE/sysconfig/$1"
    fi

    if [ "$1" = "ctdb" ] ; then
        _config="${CTDB_BASE}/ctdbd.conf"
        if [ -r "$_config" ] ; then
            . "$_config"
        fi
    fi
}
# Public entry point for configuration loading.  It only forwards its
# arguments to _loadconfig().
loadconfig ()
{
    _loadconfig "$@"
}
##############################################################
# Emit debug output, prefixed with "DEBUG: ", when
# CTDB_SCRIPT_DEBUGLEVEL (default 2) is at least 4.  The level can be
# overridden by setting it in a configuration file.
#
# With arguments, the arguments are printed.  Without arguments, stdin
# is prefixed line by line (handy with a here document), unless stdin
# is a terminal, in which case nothing is read.
debug ()
{
    [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] || return 0

    if [ -z "$1" ] ; then
        # No arguments: filter stdin, but never block on a terminal.
        tty -s || sed -e 's@^@DEBUG: @'
    else
        echo "DEBUG: $*"
    fi
}
# Print a message on stdout and terminate the current shell.
# $1 is the message, $2 the exit code (default 1).
die ()
{
    echo "$1"
    exit ${2:-1}
}
# Log the given message, or stdin when no message is given, to either
# syslog or a CTDB log file.
# $1 is the tag passed to logger if syslog is in use (CTDB_SYSLOG=yes);
# any remaining arguments form the message.  Otherwise the text is
# appended to $CTDB_LOGFILE (default /var/log/log.ctdb).
script_log ()
{
    _tag="$1" ; shift

    if [ "$CTDB_SYSLOG" != "yes" ] ; then
        if [ -z "$*" ] ; then
            cat
        else
            echo "$*"
        fi >>"${CTDB_LOGFILE:-/var/log/log.ctdb}"
    else
        # $* is deliberately unquoted: with no message, logger gets no
        # message operand at all.
        logger -t "ctdbd: ${_tag}" $*
    fi
}
# Run a command in the background with stdin detached and with stdout
# and stderr fed to script_log(), tagged "${script_name}&", so output
# from backgrounded eventscript commands does not get lost.
# Always returns 0 immediately.
background_with_logging ()
{
    ( "$@" 2>&1 </dev/null | script_log "${script_name}&" ) &

    return 0
}
##############################################################
# Check the number of arguments for the events that carry them.
# "$@" is the full event argument list, event name first:
#   takeip/releaseip need 4 arguments in total, updateip needs 5.
# Prints an error and exits 1 on a mismatch; other events are not
# checked.
ctdb_check_args ()
{
    case "$1" in
        updateip)
            [ $# -eq 5 ] || {
                echo "ERROR: must supply old interface, new interface, IP and maskbits"
                exit 1
            }
            ;;
        takeip|releaseip)
            [ $# -eq 4 ] || {
                echo "ERROR: must supply interface, IP and maskbits"
                exit 1
            }
            ;;
    esac
}
##############################################################
# Determine on what type of system (init style) we are running and
# store it in CTDB_INIT_STYLE: "suse" if /sbin/startproc is executable,
# else "debian" if /sbin/start-stop-daemon is executable, else "redhat".
detect_init_style()
{
    # An existing setting always wins; no detection is done then.
    [ -z "$CTDB_INIT_STYLE" ] || return

    # Start from the fallback and let the more specific probes override
    # it, the highest-priority probe last.
    CTDB_INIT_STYLE="redhat"
    if [ -x /sbin/start-stop-daemon ] ; then
        CTDB_INIT_STYLE="debian"
    fi
    if [ -x /sbin/startproc ] ; then
        CTDB_INIT_STYLE="suse"
    fi
}
######################################################
# Simulate /sbin/service on platforms that don't have it.
# _service() makes it easier to hook the service() function for
# testing.
# $1 is the service name, $2 the operation.  The command is prefixed
# with $_nice, which the calling wrapper sets (possibly to empty).
_service ()
{
  _service_name="$1"
  _op="$2"

  # Without a service name there is nothing to do.
  if [ -z "$_service_name" ] ; then
      return
  fi

  # Prefer /sbin/service, then fall back to the init scripts.
  if [ -x /sbin/service ] ; then
      $_nice /sbin/service "$_service_name" "$_op"
  elif [ -x $CTDB_ETCDIR/init.d/$_service_name ] ; then
      $_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
  elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ] ; then
      $_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
  fi
}
# Run a service operation at normal priority: clears $_nice and hands
# all arguments (service name, operation) to _service().
service()
{
    _nice=""
    _service "$@"
}
######################################################
# Like service(), but the underlying command is run under "nice".
nice_service()
{
    _nice="nice"
    _service "$@"
}
######################################################
# Write a value to a file under /proc/.  Kept as a separate function so
# that it can be hooked for testing.
# $1 is the path relative to /proc/, $2 is the value to write.
set_proc ()
{
    echo "$2" >"/proc/$1"
}
######################################################
# Print the contents of a file under /proc/.  Kept as a separate
# function so that it can be hooked for testing.
# $1 is the path relative to /proc/.
get_proc ()
{
    cat "/proc/$1"
}
######################################################
# Check that an RPC service is healthy, tolerating a configurable
# number of consecutive failures before acting.
#
# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
#
# Each triple is 3 arguments: a comparison operator, a failure count
# limit and an action string (space-separated actions: verbose,
# restart, restart:b, unhealthy).
#
# For example:
#
#       nfs_check_rpc_service "lockd" \
#           -ge 15 "verbose restart unhealthy" \
#           -eq 10 "restart:b"
#
# means: once lockd has failed 15 or more checks, print the failure,
# restart lockd and mark the node unhealthy; at exactly 10 failures
# restart it in the background.
# Order matters: processing stops at the first triple whose condition
# is true, so list the larger limits first.
######################################################
nfs_check_rpc_service ()
{
    _prog_name="$1" ; shift

    # Service answered: nothing more to do.
    _nfs_check_rpc_common "$_prog_name" && return

    # Walk the triples until one of them fires.
    while [ -n "$3" ] ; do
        _nfs_check_rpc_action "$1" "$2" "$3" && break
        shift 3
    done
}
# The new way of doing things...
#
# Check every RPC service that has a check file
# ${CTDB_BASE}/nfs-rpc-checks.d/NN.<service>.check.  Each non-comment
# line of a check file is "<operator> <limit> <actions...>" and is
# passed to _nfs_check_rpc_action() in turn until one of them fires.
nfs_check_rpc_services ()
{
    # Files must end with .check - avoids editor backups, RPM fu, ...
    for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
        # If nothing matches, the shell leaves the pattern itself in
        # $_f.  Skip it (and anything unreadable) instead of treating
        # the pattern as an RPC program name.
        [ -r "$_f" ] || continue

        _t="${_f%.check}"
        _prog_name="${_t##*/[0-9][0-9].}"

        if _nfs_check_rpc_common "$_prog_name" ; then
            # This RPC service is up, check next service...
            continue
        fi

        # Check each line in the file in turn until one of the limit
        # checks is hit...
        while read _cmp _lim _rest ; do
            # Skip comments
            case "$_cmp" in
                \#*) continue ;;
            esac

            if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
                # Limit was hit on this line, no further checking...
                break
            fi
        done <"$_f"
    done
}
# Probe one RPC service and maintain its failure counter.
# $1 is the program name (nfsd, mountd, rquotad, lockd or statd); any
# other name is an internal error and exits 1.
# Sets _rpc_prog, _version and _service_name ("nfs_<name>").
# Returns 0 if the service answered (counter reset), 1 if it did not
# (counter incremented).
_nfs_check_rpc_common ()
{
    _prog_name="$1"

    case "$_prog_name" in
        nfsd)
            _rpc_prog=nfs
            _version=3
            ;;
        mountd)
            _rpc_prog=mountd
            _version=1
            ;;
        rquotad)
            _rpc_prog=rquotad
            _version=1
            ;;
        lockd)
            _rpc_prog=nlockmgr
            _version=4
            ;;
        statd)
            # Some platforms don't have separate programs for all
            # services: without an rpc.statd binary, report success.
            which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
            _rpc_prog=status
            _version=1
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac

    _service_name="nfs_${_prog_name}"

    if ! ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
        ctdb_counter_incr "$_service_name"
        return 1
    fi

    ctdb_counter_init "$_service_name"
    return 0
}
# Apply one "<operator> <limit> <actions>" rule to the failure counter
# of $_service_name (set by _nfs_check_rpc_common()).
# $1 is the operator, $2 the limit, $3 a space-separated action list:
#   verbose   - print the saved output of the failed RPC check
#   restart   - restart the service in the foreground
#   restart:b - restart the service in the background
#   unhealthy - exit 1
# Returns 1 if the limit was not hit, 0 after running the actions.
# An unknown action is an internal error and exits 1.
_nfs_check_rpc_action ()
{
    _cmp="$1"
    _limit="$2"
    _actions="$3"

    # In "quiet" mode ctdb_check_counter succeeds while the limit has
    # not been hit.
    ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" && return 1

    for _action in $_actions ; do
        case "$_action" in
            verbose)   echo "$ctdb_check_rpc_out" ;;
            restart)   _nfs_restart_rpc_service "$_prog_name" ;;
            restart:b) _nfs_restart_rpc_service "$_prog_name" true ;;
            unhealthy) exit 1 ;;
            *)
                echo "Internal error: unknown action \"$_action\"."
                exit 1
                ;;
        esac
    done

    return 0
}
# Restart one NFS-related RPC service.
# $1 is the program name (nfsd, mountd, rquotad, lockd or statd); any
# other name is an internal error and exits 1.
# $2 is "true" to run the restart via background_with_logging(),
# default "false".
# nfsd and lockd are restarted through startstop_nfs/startstop_nfslock;
# the others are killed and their rpc.<name> daemon started again with
# the port/hostname options taken from the environment, when set.
_nfs_restart_rpc_service ()
{
    _prog_name="$1"
    _background="${2:-false}"

    _maybe_background=""
    if $_background ; then
        _maybe_background="background_with_logging"
    fi

    _p="rpc.${_prog_name}"

    case "$_prog_name" in
        nfsd)
            echo "Trying to restart NFS service"
            $_maybe_background startstop_nfs restart
            ;;
        lockd)
            echo "Trying to restart lock manager service"
            $_maybe_background startstop_nfslock restart
            ;;
        mountd)
            echo "Trying to restart $_prog_name [${_p}]"
            killall -q -9 "$_p"
            $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT
            ;;
        rquotad)
            echo "Trying to restart $_prog_name [${_p}]"
            killall -q -9 "$_p"
            $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
            ;;
        statd)
            echo "Trying to restart $_prog_name [${_p}]"
            killall -q -9 "$_p"
            $_maybe_background $_p \
                ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
                ${STATD_PORT:+-p} $STATD_PORT \
                ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac
}
######################################################
# Check that an RPC server is registered and answering, using
# "rpcinfo -u" against $CTDB_RPCINFO_LOCALHOST (default 127.0.0.1).
# usage: ctdb_check_rpc SERVICE_NAME VERSION
#
# The rpcinfo output is kept in $ctdb_check_rpc_out.  On failure it is
# prefixed with an ERROR line, printed, and 1 is returned.
######################################################
ctdb_check_rpc ()
{
    progname="$1"
    version="$2"

    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"

    ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) && return 0

    ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
    echo "$ctdb_check_rpc_out"
    return 1
}
######################################################
# Ensure $service_name is set; otherwise die() with an internal error.
assert_service_name ()
{
    if [ -z "$service_name" ] ; then
        die "INTERNAL ERROR: \$service_name not set"
    fi
}
######################################################
# Check that a set of directories is available.
# Directories are read from stdin, one per line.  Entries containing
# a "%" character are skipped.
# Returns 1 at the first missing directory; its name is left in $d.
######################################################
ctdb_check_directories_probe()
{
    while IFS="" read d ; do
        case "$d" in
            *%*)
                # Not checked.
                ;;
            *)
                [ -d "${d}/." ] || return 1
                ;;
        esac
    done
}
######################################################
# Check that a set of directories is available.
# Directories are read from stdin.  If one is missing, an error
# naming $service_name and the directory is printed and the shell
# exits 1.
######################################################
ctdb_check_directories()
{
    if ! ctdb_check_directories_probe ; then
        echo "ERROR: $service_name directory \"$d\" not available"
        exit 1
    fi
}
######################################################
# check a set of tcp ports
# usage: ctdb_check_tcp_ports <ports...>
######################################################

# Shared setup for the TCP port checks: requires $service_name and
# computes the path of the per-service "started" flag file.
#
# The flag file is created when a service is initially started and is
# deleted the first time the TCP port checks for that service succeed.
# Until then ctdb_check_tcp_ports() prints a more subtle "error"
# message if a port check fails.
_ctdb_check_tcp_common ()
{
    assert_service_name
    _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
}
# Mark the service as freshly started: create the "started" flag file,
# and its directory if needed.
ctdb_check_tcp_init ()
{
    _ctdb_check_tcp_common
    mkdir -p "${_ctdb_service_started_file%/*}" # dirname
    touch "$_ctdb_service_started_file"
}
# Check whether something is listening on all of the given TCP ports
# using the "ctdb checktcpport" command.
#
# Arguments are the ports to check; calling with none is an internal
# error and exits 1.
# Returns 0 if every port is in use (and removes the "started" flag
# file), 1 at the first port that is free, or the unexpected exit code
# of "ctdb checktcpport".
ctdb_check_tcp_ports()
{
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                # checktcpport managed to bind, so nothing is listening.
                _ctdb_check_tcp_common
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug "\"ctdb checktcpport $_p\" was able to bind to port"
                else
                    # Service only just started: be less alarming.
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                echo "ERROR: unexpected error running \"ctdb checktcpport\""
                # No stray quote after $_out: everything in a here
                # document is emitted literally.
                debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out
EOF
                return $_ret
                ;;
        esac
    done

    # All ports listening
    _ctdb_check_tcp_common
    rm -f "$_ctdb_service_started_file"
    return 0
}
######################################################
# Check that a unix socket is listening, according to netstat.
# usage: ctdb_check_unix_socket <socket_path>
#
# Returns 0 if the path is empty or the socket is listed as listening;
# otherwise prints an error naming $service_name and returns 1.
######################################################
ctdb_check_unix_socket() {
    socket_path="$1"
    if [ -z "$socket_path" ] ; then
        return
    fi

    netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" && return 0

    echo "ERROR: $service_name socket $socket_path not found"
    return 1
}
######################################################
# Check that a command returns zero status.
# usage: ctdb_check_command <command> [args...]
#
# On failure an error is printed, the command's combined output is
# passed to debug(), and the shell exits 1.
######################################################
ctdb_check_command ()
{
    if ! _out=$("$@" 2>&1) ; then
        echo "ERROR: $* returned error"
        echo "$_out" | debug
        exit 1
    fi
}
################################################
# Kill off any TCP connections with the given IP.
# $1 is the IP address.  If $2 is "oneway", only the
# "<source> <destination>" direction is killed for every connection;
# connections whose destination port is 139 or 445 are always handled
# one-way.
# After sending the connection list to "ctdb killtcp", waits (checking
# once per second, at most 4 checks) for the connections to disappear.
################################################
kill_tcp_connections ()
{
    _ip="$1"

    case "$2" in
        oneway) _oneway=true ;;
        *)      _oneway=false ;;
    esac

    get_tcp_connections_for_ip "$_ip" | {
        _killcount=0
        _connections=""
        _nl="
"
        while read _dst _src; do
            _destport="${_dst##*:}"
            # we only do one-way killtcp for CIFS
            case $_destport in
                139|445) __oneway=true ;;
                *)       __oneway=$_oneway ;;
            esac

            echo "Killing TCP connection $_src $_dst"
            _connections="${_connections}${_nl}${_src} ${_dst}"
            $__oneway || _connections="${_connections}${_nl}${_dst} ${_src}"

            _killcount=$(($_killcount + 1))
        done

        # Nothing to kill.
        if [ $_killcount -eq 0 ] ; then
            return
        fi

        echo "$_connections" | ctdb killtcp || {
            echo "Failed to send killtcp control"
            return
        }

        # Poll until the connections are gone or we give up.
        _count=0
        while true ; do
            _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)

            if [ $_remaining -eq 0 ] ; then
                echo "Killed $_killcount TCP connections to released IP $_ip"
                return
            fi

            _count=$(($_count + 1))
            if [ $_count -gt 3 ] ; then
                echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
                return
            fi

            echo "Waiting for $_remaining connections to be killed for IP $_ip"
            sleep 1
        done
    }
}
##################################################################
# Kill off the local end only for any TCP connections with the given
# IP ($1), by calling kill_tcp_connections() in "oneway" mode.
##################################################################
kill_tcp_connections_local_only ()
{
    kill_tcp_connections "$1" "oneway"
}
##################################################################
# Tickle any TCP connections with the given IP ($1): for every
# established connection, "ctdb tickle" is sent in both directions.
# A single failure message is printed if any tickle failed.
##################################################################
tickle_tcp_connections ()
{
    _ip="$1"

    get_tcp_connections_for_ip "$_ip" | {
        _failed=false

        while read dest src; do
            # One direction...
            echo "Tickle TCP connection $src $dest"
            ctdb tickle $src $dest >/dev/null 2>&1 || _failed=true

            # ... and the other.
            echo "Tickle TCP connection $dest $src"
            ctdb tickle $dest $src >/dev/null 2>&1 || _failed=true
        done

        if $_failed ; then
            echo "Failed to send tickle control"
        fi
    }
}
# List the established TCP connections whose local address is the
# given IP ($1), either plain or as an IPv4-mapped "::ffff:" address.
# Prints "<local addr:port> <remote addr:port>" per connection, taken
# from "netstat -tn".
get_tcp_connections_for_ip ()
{
    _ip="$1"

    netstat -tn |
    awk -v ip=$_ip '
        index($1, "tcp") == 1 &&
        (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) &&
        $6 == "ESTABLISHED" {
            print $4" "$5
        }'
}
########################################################
# Start/stop/restart the Ganesha NFS service.
# $1 is the operation: start, stop or restart; anything else is
# ignored.  The service is named
# "nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE".
########################################################
startstop_ganesha()
{
    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
    case "$1" in
        start|stop|restart)
            service "$_service_name" "$1"
            ;;
    esac
}
########################################################
# Start/stop/restart the NFS service on different platforms.
# $1 is the operation: start, stop or restart.
#
# The platform is "sles" if $CTDB_ETCDIR/init.d/nfsserver is
# executable and "rhel" if $CTDB_ETCDIR/init.d/nfslock is executable
# (rhel wins when both are).  On any other platform an error is
# printed and the shell exits 1.
########################################################
startstop_nfs() {
    PLATFORM="unknown"
    if [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    fi
    if [ -x $CTDB_ETCDIR/init.d/nfslock ] ; then
        PLATFORM="rhel"
    fi

    case "$PLATFORM:$1" in
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            # Drop all nfsd threads, stop, kill leftovers and dump
            # stacks of anything still stuck before starting again.
            set_proc "fs/nfsd/threads" 0
            service nfsserver stop > /dev/null 2>&1
            pkill -9 nfsd
            nfs_dump_some_threads
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            service nfs start
            ;;
        rhel:stop)
            service nfs stop
            service nfslock stop
            ;;
        rhel:restart)
            set_proc "fs/nfsd/threads" 0
            service nfs stop > /dev/null 2>&1
            service nfslock stop > /dev/null 2>&1
            pkill -9 nfsd
            nfs_dump_some_threads
            service nfslock start
            service nfs start
            ;;
        sles:*|rhel:*)
            # Known platform, unknown operation: nothing to do.
            :
            ;;
        *)
            echo "Unknown platform. NFS is not supported with ctdb"
            exit 1
            ;;
    esac
}
# Dump up to the configured number of nfsd thread backtraces.
# The limit is $CTDB_NFS_DUMP_STUCK_THREADS (default 5); a limit of 0
# or less disables dumping.  Stacks are read via get_proc() from
# <pid>/stack for each PID reported by "pidof nfsd"; PIDs with no
# readable stack do not count towards the limit.
nfs_dump_some_threads ()
{
    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || CTDB_NFS_DUMP_STUCK_THREADS=5

    # Optimisation to avoid running an unnecessary pidof
    [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0

    _count=0
    for _pid in $(pidof nfsd) ; do
        # Strictly less than: stop once exactly the configured number
        # of stacks has been dumped (-le would dump one too many).
        [ $_count -lt $CTDB_NFS_DUMP_STUCK_THREADS ] || break

        # Do this first to avoid racing with thread exit
        _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
        if [ -n "$_stack" ] ; then
            echo "Stack trace for stuck nfsd thread [${_pid}]:"
            echo "$_stack"
            _count=$(($_count + 1))
        fi
    done
}
########################################################
# Start/stop/restart the NFS lock manager service on different
# platforms.  $1 is the operation: start, stop or restart.
#
# Platform detection matches startstop_nfs().  On sles there is no
# separate lock manager service, so the whole NFS server is
# started/stopped instead.  On an unknown platform an error is printed
# and the shell exits 1.
########################################################
startstop_nfslock() {
    PLATFORM="unknown"
    if [ -x $CTDB_ETCDIR/init.d/nfsserver ] ; then
        PLATFORM="sles"
    fi
    if [ -x $CTDB_ETCDIR/init.d/nfslock ] ; then
        PLATFORM="rhel"
    fi

    case "$PLATFORM:$1" in
        sles:start)
            service nfsserver start
            ;;
        sles:stop)
            service nfsserver stop > /dev/null 2>&1
            ;;
        sles:restart)
            service nfsserver stop > /dev/null 2>&1
            service nfsserver start
            ;;
        rhel:start)
            service nfslock start
            ;;
        rhel:stop)
            service nfslock stop > /dev/null 2>&1
            ;;
        rhel:restart)
            service nfslock stop > /dev/null 2>&1
            service nfslock start
            ;;
        sles:*|rhel:*)
            # Known platform, unknown operation: nothing to do.
            :
            ;;
        *)
            echo "Unknown platform. NFS locking is not supported with ctdb"
            exit 1
            ;;
    esac
}
# Periodically update the statd database.
# $1 is the minimum number of seconds between updates.  The time of
# the last update is the modification time of
# "$service_state_dir/update-trigger"; when the period has elapsed the
# trigger is touched and both statd-callout updates are started in the
# background.
nfs_statd_update ()
{
    _update_period="$1"

    _statd_update_trigger="$service_state_dir/update-trigger"
    if [ ! -f "$_statd_update_trigger" ] ; then
        touch "$_statd_update_trigger"
    fi

    _last_update=$(stat --printf="%Y" "$_statd_update_trigger")
    _current_time=$(date +"%s")

    if [ $(( $_current_time - $_last_update)) -ge $_update_period ] ; then
        touch "$_statd_update_trigger"
        $CTDB_BASE/statd-callout updatelocal &
        $CTDB_BASE/statd-callout updateremote &
    fi
}
823 ########################################################
# Add an address to an interface, bringing the interface up first.
# $1 is the interface, $2 the IP address, $3 the mask bits.
# Calls die() if either "ip" command fails.
add_ip_to_iface ()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # Ensure interface is up
    if ! ip link set "$_iface" up ; then
        die "Failed to bringup interface $_iface"
    fi

    if ! ip addr add "$_ip/$_maskbits" brd + dev "$_iface" ; then
        die "Failed to add $_ip/$_maskbits on dev $_iface"
    fi
}
# Remove an address from an interface.
# $1 is the interface, $2 the IP address, $3 the mask bits.
# Calls die() if "ip addr del" fails.
delete_ip_from_iface()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    # promote_secondaries could be set globally for all interfaces,
    # but to avoid surprises it is limited to the interfaces where
    # CTDB has public IP addresses.  There isn't anywhere else
    # convenient to do this, so it is set on every call.  This is much
    # cheaper than remembering and re-adding secondaries.
    set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1

    if ! ip addr del "$_ip/$_maskbits" dev "$_iface" ; then
        die "Failed to del $_ip on dev $_iface"
    fi
}
# If the given IP is hosted then print 2 items: maskbits and iface.
# $1 is the address; it is looked up with "ip addr show to ADDR/32".
# Prints nothing if the address is not found.
ip_maskbits_iface ()
{
    _addr="$1"

    # sub() is used instead of gensub(), which only exists in GNU awk.
    # Stripping everything up to the "/" from the "addr/bits" field
    # leaves the mask bits; the interface name is the last field.
    ip addr show to "${_addr}/32" 2>/dev/null | \
        awk '$1 == "inet" { sub(".*/", "", $2); print $2, $NF }'
}
# Remove a public address from whichever interface currently hosts it.
# $1 is the address, optionally followed by "/maskbits" (ignored).
# Does nothing if the address is not hosted.  Output and errors from
# the removal itself are discarded.
drop_ip ()
{
    _addr="${1%/*}"  # Remove optional maskbits

    # Positional parameters become: maskbits, interface (or nothing).
    set -- $(ip_maskbits_iface $_addr)
    [ -n "$1" ] || return 0

    _maskbits="$1"
    _iface="$2"
    echo "Removing public address $_addr/$_maskbits from device $_iface"
    delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
}
# Call drop_ip() for the first field of every line of the file named
# by $CTDB_PUBLIC_ADDRESSES (nothing is read if it is unset).
drop_all_public_ips ()
{
    while read _ip _x ; do
        drop_ip "$_ip"
    done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
}
########################################################
# Simple counters

# Shared setup for the counter functions.
# $1 is the counter name; it defaults to $service_name, then to
# $script_name.  Sets _service_name and _counter_file (under
# $ctdb_fail_dir) and makes sure the containing directory exists.
_ctdb_counter_common ()
{
    _service_name="${1:-${service_name:-${script_name}}}"
    _counter_file="$ctdb_fail_dir/$_service_name"
    mkdir -p "${_counter_file%/*}" # dirname
}
# Reset a counter to zero by truncating its file.
# $1 is the optional counter name (see _ctdb_counter_common()).
ctdb_counter_init ()
{
    _ctdb_counter_common "$1"

    : >"$_counter_file"
}
# Increment a counter by one.
# $1 is the optional counter name (see _ctdb_counter_common()).
ctdb_counter_incr ()
{
    _ctdb_counter_common "$1"

    # unary counting!  The counter value is the file size, so exactly
    # one byte must be appended.  printf is used because "echo -n" is
    # not portable and may write "-n" and a newline instead.
    printf 1 >> "$_counter_file"
}
# Compare a failure counter against a limit.
# usage: ctdb_check_counter [MSG [OP [LIMIT [NAME]]]]
#   MSG   - "error" (default): on a hit, print an error and exit 1;
#           anything else: on a hit, silently return 1
#   OP    - an integer operator supported by test (default -ge), or
#           "%" meaning "counter is a multiple of LIMIT"
#   LIMIT - the limit (default $service_fail_limit)
#   NAME  - counter name (see _ctdb_counter_common())
# Returns 0 when the condition is not hit.
ctdb_check_counter ()
{
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"
    # Drop the leading arguments so that $1 is the counter name.  With
    # fewer than 3 arguments a plain "shift 3" fails (and aborts some
    # shells), leaving MSG behind to be mistaken for the counter name.
    if [ $# -ge 3 ] ; then
        shift 3
    else
        shift $#
    fi
    _ctdb_counter_common "$1"

    # unary counting!
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    _hit=false
    if [ "$_op" != "%" ] ; then
        if [ $_size $_op $_limit ] ; then
            _hit=true
        fi
    else
        if [ $(($_size $_op $_limit)) -eq 0 ] ; then
            _hit=true
        fi
    fi
    if $_hit ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}
932 ########################################################
# Directories under $CTDB_VARDIR/state holding the per-script status
# files and the failure counters.
ctdb_status_dir="$CTDB_VARDIR/state/service_status"
ctdb_fail_dir="$CTDB_VARDIR/state/failcount"
# Create the per-service state directory and record its path in
# $service_state_dir.  $1 is the service name, default $service_name.
# Prints an error and exits 1 if the directory cannot be created.
ctdb_setup_service_state_dir ()
{
    service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
    if ! mkdir -p "$service_state_dir" ; then
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    fi
}
########################################################
# Managed status history, for auto-start/stop

# Directory holding one flag file per service recorded as managed.
ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"
# Compute the path of the "managed" flag file for $service_name.
_ctdb_managed_common ()
{
    _ctdb_managed_file="$ctdb_managed_dir/$service_name"
}
# Record $service_name as managed by creating its flag file (and the
# flag directory if needed).
ctdb_service_managed ()
{
    _ctdb_managed_common
    mkdir -p "$ctdb_managed_dir"
    touch "$_ctdb_managed_file"
}
# Record $service_name as no longer managed by removing its flag file.
ctdb_service_unmanaged ()
{
    _ctdb_managed_common
    rm -f "$_ctdb_managed_file"
}
# Succeed if the "managed" flag file for $service_name exists.
is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common
    [ -f "$_ctdb_managed_file" ]
}
975 ########################################################
976 # Check and set status
# Print a one-line report that the node is in state $1, followed by
# the contents of the status file $2.
log_status_cat ()
{
    # $2 is quoted so that a status path containing whitespace or glob
    # characters is read as a single file.
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}
# Report any status recorded for $script_name under $ctdb_status_dir.
# Returns 1 (after logging) if an "unhealthy" file is readable,
# 2 (after logging) if a "banned" file is readable, 0 otherwise.
ctdb_checkstatus ()
{
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    fi

    if [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    fi

    return 0
}
# Record or clear the status of $script_name.
# $1 is the status.  For "unhealthy" or "banned", the contents of the
# file $2 are stored as the reason.  Any other status removes both
# recorded status files.
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"
    if [ "$1" = "unhealthy" ] || [ "$1" = "banned" ] ; then
        mkdir -p "$d"
        cat "$2" >"$d/$1"
    else
        for i in "banned" "unhealthy" ; do
            rm -f "$d/$i"
        done
    fi
}
##################################################################
# Reconfigure a service on demand

# Shared setup for the reconfigure functions: makes sure the status
# directory for $service_name exists (path kept in $_d) and computes
# the path of its "reconfigure" flag file.
_ctdb_service_reconfigure_common ()
{
    _d="$ctdb_status_dir/${service_name}"
    mkdir -p "$_d"
    _ctdb_service_reconfigure_flag="$_d/reconfigure"
}
# Succeed if the "reconfigure" flag for $service_name is set.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common
    [ -e "$_ctdb_service_reconfigure_flag" ]
}
# Set the "reconfigure" flag for $service_name by creating (or
# truncating) its flag file.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common
    : >"$_ctdb_service_reconfigure_flag"
}
# Clear the "reconfigure" flag for $service_name.
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common
    rm -f "$_ctdb_service_reconfigure_flag"
}
# Reconfigure $service_name: clear the pending flag, run the
# service_reconfigure() hook and, if it succeeded, reset the failure
# counter.  A hook failure is returned to the caller unchanged.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure
    service_reconfigure || return $?
    ctdb_counter_init
}
# Default service_reconfigure() hook: does nothing and succeeds.
# ctdb_service_reconfigure() calls whichever definition is current.
service_reconfigure ()
{
    :
}
# Try to take the reconfigure lock for $service_name.
# The lock file holds the locking event name on its first line,
# followed by PIDs.  Returns 1 if the file names an event and one of
# the listed PIDs (other than this shell's $$) is still alive;
# otherwise writes "$event_name" and $$ into the file and returns 0.
ctdb_reconfigure_take_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    (
        # fd 0 is the lock file itself, via the redirection below.
        flock 0

        # Reading a PID list is overkill but will work if we need to
        # extend this to allow certain events to run multiple times in
        # parallel (e.g. takeip) and write multiple PIDs to the file.
        read _locker_event
        [ -z "$_locker_event" ] || while read _pid ; do
            if [ -n "$_pid" ] && [ "$_pid" != $$ ] && \
                kill -0 "$_pid" 2>/dev/null ; then
                exit 1
            fi
        done

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
# Release the reconfigure lock for $service_name by removing its lock
# file.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"

    rm -f "$_lock"
}
# Replay the last recorded "monitor" result of $script_name, as
# reported by "ctdb scriptstatus -Y": print its saved output and exit
# with its return code.  TIMEDOUT is replayed as exit code 1 and
# DISABLED as exit code 0, each with an explanatory prefix.
# This function never returns; it always exits.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading colon (':') is missing in some versions...
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
    # Output looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # This is the cheapest way of getting fields in the middle.
    set -- $(IFS=":" ; echo $_out)
    _code="$3"
    _status="$4"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading ':') and fixed output.
    _out="${_out%:}"
    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"

    # OK and ERROR are replayed as they are.  The other two cannot be
    # replayed exactly since we can't exit with the correct negative
    # number.
    case "$_status" in
        TIMEDOUT)
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
    esac

    [ -z "$_err_out" ] || echo "$_err_out"
    exit $_code
}
# Coordinate service reconfiguration with the events that can collide
# with it.  Only the "monitor", "ipreallocated" and "reconfigure"
# events are handled; for any other event this returns 0 immediately.
#
# Depending on the event and on whether the reconfigure lock can be
# taken, this may return to the caller or terminate the script via
# "exit" (see the per-event comments below).
ctdb_service_check_reconfigure ()
{
    assert_service_name

    case "$event_name" in
        monitor|ipreallocated|reconfigure)
            ;;
        *)
            return 0
            ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # We hold the lock, so none of the events handled here is
        # running concurrently.
        case "$event_name" in
            reconfigure)
                # Run the reconfigure in a subshell and finish the
                # script with its status.  Note the lock is not
                # explicitly released on this path.
                (ctdb_service_reconfigure)
                exit $?
                ;;
            ipreallocated)
                # Carry out a reconfigure only if one was scheduled.
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Another process holds the lock, so tread carefully.
        case "$event_name" in
            reconfigure)
                # Exit status 2 asks the caller to retry later.
                exit 2
                ;;
            ipreallocated)
                # Leave any scheduled reconfigure pending and let the
                # eventscript get on with the rest of the
                # ipreallocated event.  This assumes the event does
                # not depend on the pending reconfigure, which holds
                # for the current code.
                return 0
                ;;
            monitor)
                # A reconfigure is most likely in progress, so the
                # service may be unstable.  Leave any scheduled
                # reconfigure pending and replay the previous monitor
                # status, which is the best information available
                # (this exits the script).
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
1182 ##################################################################
1183 # Does CTDB manage this service? - and associated auto-start/stop
# Backward-compatibility helper for the old CTDB_MANAGES_* variables.
#   $1 - value of a CTDB_MANAGES_* variable
#   $2 - service name that variable refers to
# If $1 is "yes" and $2 is the current $service_name, append $2 to
# $CTDB_MANAGED_SERVICES.  Always returns 0.
ctdb_compat_managed_service ()
{
    if [ "$1" = "yes" ] && [ "$2" = "$service_name" ] ; then
        CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
    fi
}
# Succeed (status 0) if the current $service_name is listed in
# $CTDB_MANAGED_SERVICES, either directly or after folding in the old
# CTDB_MANAGES_* variables; fail (status 1) otherwise.
is_ctdb_managed_service ()
{
    assert_service_name

    # Pad the list with a space at each end so that a whole-word match
    # can be expressed as "<space>name<space>".
    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # Not listed: for backward compatibility, let the old per-service
    # variables add the service to $CTDB_MANAGED_SERVICES, then look
    # again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "
    case "$t" in
        *" "${service_name}" "*) return 0 ;;
        *) return 1 ;;
    esac
}
# Start or stop the current service as required by the event.
#
# - "service-start" / "service-stop" pseudo-events start/stop the
#   service on request and exit with the result.  They are refused
#   (via die) if the service is managed or if
#   $CTDB_SERVICE_AUTOSTARTSTOP is "yes".
# - On "monitor" events, and only when $CTDB_SERVICE_AUTOSTARTSTOP is
#   "yes", a service whose managed state has changed is started or
#   stopped in the background and the script exits.
# - In every other case this returns 0 without doing anything.
ctdb_start_stop_service ()
{
    assert_service_name

    case "$event_name" in
        service-start)
            if is_ctdb_managed_service ; then
                die 'service-start event not permitted when service is managed'
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            fi
            ctdb_service_start
            exit $?
            ;;
        service-stop)
            if is_ctdb_managed_service ; then
                die 'service-stop event not permitted when service is managed'
            fi
            if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
                die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            fi
            ctdb_service_stop
            exit $?
            ;;
    esac

    # Automatic start/stop happens only when enabled, and only from
    # the monitor event.
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" != "yes" ] || \
        [ "$event_name" != "monitor" ] ; then
        return 0
    fi

    if is_ctdb_managed_service ; then
        # Managed now but not before: bring the service up.
        if ! is_ctdb_previously_managed_service ; then
            echo "Starting service \"$service_name\" - now managed"
            background_with_logging ctdb_service_start
            exit $?
        fi
    else
        # Managed before but not now: take the service down.
        if is_ctdb_previously_managed_service ; then
            echo "Stopping service \"$service_name\" - no longer managed"
            background_with_logging ctdb_service_stop
            exit $?
        fi
    fi
}
# Start the current service via the service_start() hook.  The
# service is marked as managed before the attempt, so it counts as
# managed even if starting fails.  On failure the hook's exit status
# is returned and the initialisation steps below are skipped.
ctdb_service_start ()
{
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # In the else branch $? is still the hook's exit status.
        return $?
    fi
}
# Stop the current service: mark it as unmanaged, then run the
# service_stop() hook, whose exit status is returned.
ctdb_service_stop ()
{
    ctdb_service_unmanaged
    service_stop
}
# Default service_start() hook; an eventscript may override it.
# Starts $service_name by running "service <name> start" and returns
# that command's exit status.
service_start ()
{
    service "$service_name" start
}
# Default service_stop() hook; an eventscript may override it.
# Stops $service_name by running "service <name> stop" and returns
# that command's exit status.
service_stop ()
{
    service "$service_name" stop
}
1303 ##################################################################
# Handle the events that are common to all eventscripts.
#   $1 - event name; any further arguments belong to the event
# "status" runs ctdb_checkstatus and "setstatus" runs ctdb_setstatus
# with the remaining arguments; in both cases the script then exits
# with that command's status.  For any other event this returns 0 so
# the caller can handle it.
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit $?
    elif [ "$1" = "setstatus" ] ; then
        # Drop the event name; the rest are ctdb_setstatus arguments.
        shift
        ctdb_setstatus "$@"
        exit $?
    fi
}
# iptables doesn't like being re-entered, so serialise every call
# through a lock file.  All arguments are passed unchanged to
# /sbin/iptables.  flock is told to wait at most 30 seconds for the
# lock; the function returns flock's exit status.
iptables()
{
    # The lock path is quoted so that a $CTDB_VARDIR containing
    # whitespace or glob characters is still passed as a single word.
    flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
# AIX (and perhaps others?) doesn't have mktemp, so provide a minimal
# replacement when no mktemp command is found.
#
# Usage: mktemp [-d]
#   -d  create a directory instead of a regular file
# Any further arguments (e.g. a template) are ignored.  The new
# file or directory is created under ${TMPDIR:-/tmp} with a umask of
# 077 and its path is printed on stdout.  Returns non-zero, printing
# nothing, if it could not be created.
if ! which mktemp >/dev/null 2>&1 ; then
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        _d="${TMPDIR:-/tmp}"
        # 10 hex digits derived from random data make the name hard
        # to guess.
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_d}/tmp.${_hex10}"
        # The subshell keeps the umask and noclobber changes local.
        (
            umask 077
            if $_dir ; then
                # mkdir fails if the name already exists.
                mkdir "$_t"
            else
                # noclobber makes the redirection fail rather than
                # reuse or truncate something already at this path.
                set -C
                >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
1352 ########################################################
1353 # tickle handling
1354 ########################################################
# Bring CTDB's tickle list for one TCP port into line with the
# connections currently established to this node's public IPs.
#   $1 - TCP port number
# Tickles are added for established connections that have none and
# deleted for tickles whose connection has gone.  Working files are
# kept under $CTDB_VARDIR/state/tickles and removed at the end.
update_tickles ()
{
    _port="$1"

    tickledir="$CTDB_VARDIR/state/tickles"
    mkdir -p "$tickledir"

    # This node's PNN: "ctdb pnn" output with any "PNN:" prefix
    # stripped.
    _pnn=$(ctdb pnn)
    _pnn=${_pnn#PNN:}

    # Public IPs currently assigned to this node.
    _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')

    # Turn the IP list into a regexp alternation "(a|b|...)" with the
    # dots escaped.  The doubled backslashes are presumably there to
    # survive awk's escape processing of -v values - TODO confirm.
    _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

    # Established TCP connections whose local end is one of our
    # public IPs on $_port, written as sorted "<remote> <local>"
    # lines.
    _my_connections="${tickledir}/${_port}.connections"
    rm -f "$_my_connections"
    netstat -tn |
    awk -v destpat="^${_ipschoice}:${_port}\$" \
      '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
    sort >"$_my_connections"

    # Tickles CTDB currently has for those IPs, rewritten to the same
    # two-column layout and sorted so the two files can be compared
    # with comm.
    _my_tickles="${tickledir}/${_port}.tickles"
    rm -f "$_my_tickles"
    for _i in $_ips ; do
        ctdb -Y gettickles $_i $_port |
        awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
    done |
    sort >"$_my_tickles"

    # Lines only in the connections file: connections that still
    # need a tickle.
    comm -23 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb addtickle $_src $_dst
    done

    # Lines only in the tickles file: tickles whose connection no
    # longer exists.
    comm -13 "$_my_connections" "$_my_tickles" |
    while read _src _dst ; do
        ctdb deltickle $_src $_dst
    done

    rm -f "$_my_connections" "$_my_tickles"
}
########################################################
# load a site local config file
########################################################

# Site-local files are sourced into the current shell, in this
# order, each only if it is executable:
#   1. the file named by $CTDB_RC_LOCAL, if set
#   2. $CTDB_BASE/rc.local
#   3. every file in $CTDB_BASE/rc.local.d/
# Path expansions are quoted so that a $CTDB_BASE containing
# whitespace or glob characters is handled correctly.

if [ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] ; then
    . "$CTDB_RC_LOCAL"
fi

if [ -x "$CTDB_BASE/rc.local" ] ; then
    . "$CTDB_BASE/rc.local"
fi

if [ -d "$CTDB_BASE/rc.local.d" ] ; then
    for i in "$CTDB_BASE"/rc.local.d/* ; do
        [ -x "$i" ] && . "$i"
    done
fi
# Name of the running script: $0 with any leading directories
# stripped (like basename).
script_name="${0##*/}"
# Default failure limit for a service; presumably an eventscript may
# change it after sourcing this file - TODO confirm.
service_fail_limit=1
# The event being handled is the first command-line argument.
event_name="$1"