ctdb/config/functions

   1 # Hey Emacs, this is a -*- shell-script -*- !!!
   2
   3 # utility functions for ctdb event scripts
   4
   5 [ -z "$CTDB_VARDIR" ] && {
   6     if [ -d "/var/lib/ctdb" ] ; then
   7         export CTDB_VARDIR="/var/lib/ctdb"
   8     else
   9         export CTDB_VARDIR="/var/ctdb"
  10     fi
  11 }
  12 [ -z "$CTDB_ETCDIR" ] && {
  13     export CTDB_ETCDIR="/etc"
  14 }
  15
  16 #######################################
  17 # pull in a system config file, if any
  18 _loadconfig() {
  19
  20     if [ -z "$1" ] ; then
  21         foo="${service_config:-${service_name}}"
  22         if [ -n "$foo" ] ; then
  23             loadconfig "$foo"
  24             return
  25         fi
  26     fi
  27
  28     if [ "$1" != "ctdb" ] ; then
  29         loadconfig "ctdb"
  30     fi
  31
  32     if [ -z "$1" ] ; then
  33         return
  34     fi
  35
  36     if [ -f $CTDB_ETCDIR/sysconfig/$1 ]; then
  37         . $CTDB_ETCDIR/sysconfig/$1
  38     elif [ -f $CTDB_ETCDIR/default/$1 ]; then
  39         . $CTDB_ETCDIR/default/$1
  40     elif [ -f $CTDB_BASE/sysconfig/$1 ]; then
  41         . $CTDB_BASE/sysconfig/$1
  42     fi
  43
  44     if [ "$1" = "ctdb" ] ; then
  45         _config="${CTDB_BASE}/ctdbd.conf"
  46         if [ -r "$_config" ] ; then
  47             . "$_config"
  48         fi
  49     fi
  50 }
  51
  52 loadconfig () {
  53     _loadconfig "$@"
  54 }
  55
  56 ##############################################################
  57
  58 # CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
  59 # configuration file.
  60 debug ()
  61 {
  62     if [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
  63         # If there are arguments then echo them.  Otherwise expect to
  64         # use stdin, which allows us to pass lots of debug using a
  65         # here document.
  66         if [ -n "$1" ] ; then
  67             echo "DEBUG: $*"
  68         elif ! tty -s ; then
  69             sed -e 's@^@DEBUG: @'
  70         fi
  71     fi
  72 }
  73
  74 die ()
  75 {
  76     _msg="$1"
  77     _rc="${2:-1}"
  78
  79     echo "$_msg"
  80     exit $_rc
  81 }
  82
  83 # Log given message or stdin to either syslog or a CTDB log file
  84 # $1 is the tag passed to logger if syslog is in use.
  85 script_log ()
  86 {
  87     _tag="$1" ; shift
  88
  89     case "$CTDB_LOGGING" in
  90         file:*|"")
  91             if [ -n "$CTDB_LOGGING" ] ; then
  92                 _file="${CTDB_LOGGING#file:}"
  93             else
  94                 _file="/var/log/log.ctdb"
  95             fi
  96             {
  97                 if [ -n "$*" ] ; then
  98                     echo "$*"
  99                 else
 100                     cat
 101                 fi
 102             } >>"$_file"
 103             ;;
 104         *)
 105             logger -t "ctdbd: ${_tag}" $*
 106             ;;
 107     esac
 108 }
 109
 110 # When things are run in the background in an eventscript then logging
 111 # output might get lost.  This is the "solution".  :-)
 112 background_with_logging ()
 113 {
 114     (
 115         "$@" 2>&1 </dev/null |
 116         script_log "${script_name}&"
 117     )&
 118
 119     return 0
 120 }
 121
 122 ##############################################################
 123 # check number of args for different events
 124 ctdb_check_args ()
 125 {
 126     case "$1" in
 127         takeip|releaseip)
 128             if [ $# != 4 ]; then
 129                 echo "ERROR: must supply interface, IP and maskbits"
 130                 exit 1
 131             fi
 132             ;;
 133         updateip)
 134             if [ $# != 5 ]; then
 135                 echo "ERROR: must supply old interface, new interface, IP and maskbits"
 136                 exit 1
 137             fi
 138             ;;
 139     esac
 140 }
 141
 142 ##############################################################
 143 # determine on what type of system (init style) we are running
 144 detect_init_style()
 145 {
 146     # only do detection if not already set:
 147     [ -z "$CTDB_INIT_STYLE" ] || return
 148
 149     if [ -x /sbin/startproc ]; then
 150         CTDB_INIT_STYLE="suse"
 151     elif [ -x /sbin/start-stop-daemon ]; then
 152         CTDB_INIT_STYLE="debian"
 153     else
 154         CTDB_INIT_STYLE="redhat"
 155     fi
 156 }
 157
 158 ######################################################
 159 # simulate /sbin/service on platforms that don't have it
 160 # _service() makes it easier to hook the service() function for
 161 # testing.
 162 _service ()
 163 {
 164   _service_name="$1"
 165   _op="$2"
 166
 167   # do nothing, when no service was specified
 168   [ -z "$_service_name" ] && return
 169
 170   if [ -x /sbin/service ]; then
 171       $_nice /sbin/service "$_service_name" "$_op"
 172   elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then
 173       $_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
 174   elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
 175       $_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
 176   fi
 177 }
 178
 179 service()
 180 {
 181     _nice=""
 182     _service "$@"
 183 }
 184
 185 ######################################################
 186 # simulate /sbin/service (niced) on platforms that don't have it
 187 nice_service()
 188 {
 189     _nice="nice"
 190     _service "$@"
 191 }
 192
 193 ######################################################
 194 # wrapper around /proc/ settings to allow them to be hooked
 195 # for testing
 196 # 1st arg is relative path under /proc/, 2nd arg is value to set
 197 set_proc ()
 198 {
 199     echo "$2" >"/proc/$1"
 200 }
 201
 202 ######################################################
 203 # wrapper around getting file contents from /proc/ to allow
 204 # this to be hooked for testing
 205 # 1st arg is relative path under /proc/
 206 get_proc ()
 207 {
 208     cat "/proc/$1"
 209 }
 210
 211 ######################################################
 212 # Check that an RPC service is healthy -
 213 # this includes allowing a certain number of failures
 214 # before marking the NFS service unhealthy.
 215 #
 216 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
 217 #
 218 # each triple is a set of 3 arguments: an operator, a
 219 # fail count limit and an action string.
 220 #
 221 # For example:
 222 #
 223 #       nfs_check_rpc_service "lockd" \
 224 #           -ge 15 "verbose restart unhealthy" \
 225 #           -eq 10 "restart:bs"
 226 #
 227 # says that if lockd is down for 15 iterations then do
 228 # a verbose restart of lockd and mark the node unhealthy.
 229 # Before this, after 10 iterations of failure, the
 230 # service is restarted silently in the background.
 231 # Order is important: the number of failures need to be
 232 # specified in reverse order because processing stops
 233 # after the first condition that is true.
 234 ######################################################
 235 nfs_check_rpc_service ()
 236 {
 237     _prog_name="$1" ; shift
 238
 239     if _nfs_check_rpc_common "$_prog_name" ; then
 240         return
 241     fi
 242
 243     while [ -n "$3" ] ; do
 244         if _nfs_check_rpc_action "$1" "$2" "$3" ; then
 245             break
 246         fi
 247         shift 3
 248     done
 249 }
 250
 251 # The new way of doing things...
 252 nfs_check_rpc_services ()
 253 {
 254     # Files must end with .check - avoids editor backups, RPM fu, ...
 255     for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
 256         _t="${_f%.check}"
 257         _prog_name="${_t##*/[0-9][0-9].}"
 258
 259         if _nfs_check_rpc_common "$_prog_name" ; then
 260             # This RPC service is up, check next service...
 261             continue
 262         fi
 263
 264         # Check each line in the file in turn until one of the limit
 265         # checks is hit...
 266         while read _cmp _lim _rest ; do
 267             # Skip comments
 268             case "$_cmp" in
 269                 \#*) continue ;;
 270             esac
 271
 272             if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
 273                 # Limit was hit on this line, no further checking...
 274                 break
 275             fi
 276         done <"$_f"
 277     done
 278 }
 279
 280 _nfs_check_rpc_common ()
 281 {
 282     _prog_name="$1"
 283
 284     # Some platforms don't have separate programs for all services.
 285     case "$_prog_name" in
 286         statd)
 287             which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
 288     esac
 289
 290     case "$_prog_name" in
 291         nfsd)
 292             _rpc_prog=nfs
 293             _version=3
 294             ;;
 295         mountd)
 296             _rpc_prog=mountd
 297             _version=1
 298             ;;
 299         rquotad)
 300             _rpc_prog=rquotad
 301             _version=1
 302             ;;
 303         lockd)
 304             _rpc_prog=nlockmgr
 305             _version=4
 306             ;;
 307         statd)
 308             _rpc_prog=status
 309             _version=1
 310             ;;
 311         *)
 312             echo "Internal error: unknown RPC program \"$_prog_name\"."
 313             exit 1
 314     esac
 315
 316     _service_name="nfs_${_prog_name}"
 317
 318     if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
 319         ctdb_counter_init "$_service_name"
 320         return 0
 321     fi
 322
 323     ctdb_counter_incr "$_service_name"
 324
 325     return 1
 326 }
 327
 328 _nfs_check_rpc_action ()
 329 {
 330     _cmp="$1"
 331     _limit="$2"
 332     _actions="$3"
 333
 334     if ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" ; then
 335         return 1
 336     fi
 337
 338     for _action in $_actions ; do
 339         case "$_action" in
 340             verbose)
 341                 echo "$ctdb_check_rpc_out"
 342                 ;;
 343             restart)
 344                 _nfs_restart_rpc_service "$_prog_name"
 345                 ;;
 346             restart:b)
 347                 _nfs_restart_rpc_service "$_prog_name" true
 348                 ;;
 349             unhealthy)
 350                 exit 1
 351                 ;;
 352             *)
 353                 echo "Internal error: unknown action \"$_action\"."
 354                 exit 1
 355         esac
 356     done
 357
 358     return 0
 359 }
 360
 361 _nfs_restart_rpc_service ()
 362 {
 363     _prog_name="$1"
 364     _background="${2:-false}"
 365
 366     if $_background ; then
 367         _maybe_background="background_with_logging"
 368     else
 369         _maybe_background=""
 370     fi
 371
 372     _p="rpc.${_prog_name}"
 373
 374     case "$_prog_name" in
 375         nfsd)
 376             echo "Trying to restart NFS service"
 377             $_maybe_background startstop_nfs restart
 378             ;;
 379         mountd)
 380             echo "Trying to restart $_prog_name [${_p}]"
 381             killall -q -9 "$_p"
 382             $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT
 383             ;;
 384         rquotad)
 385             echo "Trying to restart $_prog_name [${_p}]"
 386             killall -q -9 "$_p"
 387             $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
 388             ;;
 389         lockd)
 390             echo "Trying to restart lock manager service"
 391             $_maybe_background startstop_nfslock restart
 392             ;;
 393         statd)
 394             echo "Trying to restart $_prog_name [${_p}]"
 395             killall -q -9 "$_p"
 396             $_maybe_background $_p \
 397                 ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
 398                 ${STATD_PORT:+-p} $STATD_PORT \
 399                 ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
 400             ;;
 401         *)
 402             echo "Internal error: unknown RPC program \"$_prog_name\"."
 403             exit 1
 404     esac
 405 }
 406
 407 ######################################################
 408 # check that a rpc server is registered with portmap
 409 # and responding to requests
 410 # usage: ctdb_check_rpc SERVICE_NAME VERSION
 411 ######################################################
 412 ctdb_check_rpc ()
 413 {
 414     progname="$1"
 415     version="$2"
 416
 417     _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"
 418
 419     if ! ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) ; then
 420         ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 421 $ctdb_check_rpc_out"
 422         echo "$ctdb_check_rpc_out"
 423         return 1
 424     fi
 425 }
 426
 427 ######################################################
 428 # Ensure $service_name is set
 429 assert_service_name ()
 430 {
 431     [ -n "$service_name" ] || die "INTERNAL ERROR: \$service_name not set"
 432 }
 433
 434 ######################################################
 435 # check a set of directories is available
 436 # return 1 on a missing directory
 437 # directories are read from stdin
 438 ######################################################
 439 ctdb_check_directories_probe()
 440 {
 441     while IFS="" read d ; do
 442         case "$d" in
 443             *%*)
 444                 continue
 445                 ;;
 446             *)
 447                 [ -d "${d}/." ] || return 1
 448         esac
 449     done
 450 }
 451
 452 ######################################################
 453 # check a set of directories is available
 454 # directories are read from stdin
 455 ######################################################
 456 ctdb_check_directories()
 457 {
 458     ctdb_check_directories_probe || {
 459         echo "ERROR: $service_name directory \"$d\" not available"
 460         exit 1
 461     }
 462 }
 463
 464 ######################################################
 465 # check a set of tcp ports
 466 # usage: ctdb_check_tcp_ports <ports...>
 467 ######################################################
 468
 469 # This flag file is created when a service is initially started.  It
 470 # is deleted the first time TCP port checks for that service succeed.
 471 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
 472 # message if a port check fails.
 473 _ctdb_check_tcp_common ()
 474 {
 475     assert_service_name
 476     _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
 477 }
 478
 479 ctdb_check_tcp_init ()
 480 {
 481     _ctdb_check_tcp_common
 482     mkdir -p "${_ctdb_service_started_file%/*}" # dirname
 483     touch "$_ctdb_service_started_file"
 484 }
 485
 486 # Check whether something is listening on all of the given TCP ports
 487 # using the "ctdb checktcpport" command.
 488 ctdb_check_tcp_ports()
 489 {
 490     if [ -z "$1" ] ; then
 491         echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
 492         exit 1
 493     fi
 494
 495     for _p ; do  # process each function argument (port)
 496         _cmd="ctdb checktcpport $_p"
 497         _out=$($_cmd 2>&1)
 498         _ret=$?
 499         case "$_ret" in
 500             0)
 501                 _ctdb_check_tcp_common
 502                 if [ ! -f "$_ctdb_service_started_file" ] ; then
 503                     echo "ERROR: $service_name tcp port $_p is not responding"
 504                     debug "\"ctdb checktcpport $_p\" was able to bind to port"
 505                 else
 506                     echo "INFO: $service_name tcp port $_p is not responding"
 507                 fi
 508
 509                 return 1
 510                 ;;
 511             98)
 512                 # Couldn't bind, something already listening, next port...
 513                 continue
 514                 ;;
 515             *)
 516                 echo "ERROR: unexpected error running \"ctdb checktcpport\""
 517                 debug <<EOF
 518 ctdb checktcpport (exited with $_ret) with output:
 519 $_out"
 520 EOF
 521                 return $_ret
 522         esac
 523     done
 524
 525     # All ports listening
 526     _ctdb_check_tcp_common
 527     rm -f "$_ctdb_service_started_file"
 528     return 0
 529 }
 530
 531 ######################################################
 532 # check a unix socket
 533 # usage: ctdb_check_unix_socket SERVICE_NAME <socket_path>
 534 ######################################################
 535 ctdb_check_unix_socket() {
 536     socket_path="$1"
 537     [ -z "$socket_path" ] && return
 538
 539     if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
 540         echo "ERROR: $service_name socket $socket_path not found"
 541         return 1
 542     fi
 543 }
 544
 545 ######################################################
 546 # check a command returns zero status
 547 # usage: ctdb_check_command <command>
 548 ######################################################
 549 ctdb_check_command ()
 550 {
 551     _out=$("$@" 2>&1) || {
 552         echo "ERROR: $* returned error"
 553         echo "$_out" | debug
 554         exit 1
 555     }
 556 }
 557
 558 ################################################
 559 # kill off any TCP connections with the given IP
 560 ################################################
 561 kill_tcp_connections ()
 562 {
 563     _ip="$1"
 564
 565     _oneway=false
 566     if [ "$2" = "oneway" ] ; then
 567         _oneway=true
 568     fi
 569
 570     get_tcp_connections_for_ip "$_ip" | {
 571         _killcount=0
 572         _connections=""
 573         _nl="
 574 "
 575         while read _dst _src; do
 576             _destport="${_dst##*:}"
 577             __oneway=$_oneway
 578             case $_destport in
 579                 # we only do one-way killtcp for CIFS
 580                 139|445) __oneway=true ;;
 581             esac
 582
 583             echo "Killing TCP connection $_src $_dst"
 584             _connections="${_connections}${_nl}${_src} ${_dst}"
 585             if ! $__oneway ; then
 586                 _connections="${_connections}${_nl}${_dst} ${_src}"
 587             fi
 588
 589             _killcount=$(($_killcount + 1))
 590         done
 591
 592         if [ $_killcount -eq 0 ] ; then
 593             return
 594         fi
 595
 596         echo "$_connections" | ctdb killtcp || {
 597             echo "Failed to send killtcp control"
 598             return
 599         }
 600
 601         _count=0
 602         while : ; do
 603             _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)
 604
 605             if [ $_remaining -eq 0 ] ; then
 606                 echo "Killed $_killcount TCP connections to released IP $_ip"
 607                 return
 608             fi
 609
 610             _count=$(($_count + 1))
 611             if [ $_count -gt 3 ] ; then
 612                 echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
 613                 return
 614             fi
 615
 616             echo "Waiting for $_remaining connections to be killed for IP $_ip"
 617             sleep 1
 618         done
 619     }
 620 }
 621
 622 ##################################################################
 623 # kill off the local end for any TCP connections with the given IP
 624 ##################################################################
 625 kill_tcp_connections_local_only ()
 626 {
 627     kill_tcp_connections "$1" "oneway"
 628 }
 629
 630 ##################################################################
 631 # tickle any TCP connections with the given IP
 632 ##################################################################
 633 tickle_tcp_connections ()
 634 {
 635     _ip="$1"
 636
 637     get_tcp_connections_for_ip "$_ip" |
 638     {
 639         _failed=false
 640
 641         while read dest src; do
 642             echo "Tickle TCP connection $src $dest"
 643             ctdb tickle $src $dest >/dev/null 2>&1 || _failed=true
 644             echo "Tickle TCP connection $dest $src"
 645             ctdb tickle $dest $src >/dev/null 2>&1 || _failed=true
 646         done
 647
 648         if $_failed ; then
 649             echo "Failed to send tickle control"
 650         fi
 651     }
 652 }
 653
 654 get_tcp_connections_for_ip ()
 655 {
 656     _ip="$1"
 657
 658     netstat -tn | awk -v ip=$_ip \
 659         'index($1, "tcp") == 1 && \
 660          (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) \
 661          && $6 == "ESTABLISHED" \
 662          {print $4" "$5}'
 663 }
 664
 665 ########################################################
 666 # start/stop the Ganesha nfs service
 667 ########################################################
 668 startstop_ganesha()
 669 {
 670     _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
 671     case "$1" in
 672         start)
 673             service "$_service_name" start
 674             ;;
 675         stop)
 676             service "$_service_name" stop
 677             ;;
 678         restart)
 679             service "$_service_name" restart
 680             ;;
 681     esac
 682 }
 683
 684 ########################################################
 685 # start/stop the nfs service on different platforms
 686 ########################################################
 687 startstop_nfs() {
 688         PLATFORM="unknown"
 689         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 690                 PLATFORM="sles"
 691         }
 692         [ -x $CTDB_ETCDIR/init.d/nfslock -o \
 693             -r /usr/lib/systemd/system/nfs-lock.service ] && {
 694                 PLATFORM="rhel"
 695         }
 696
 697         case $PLATFORM in
 698         sles)
 699                 case $1 in
 700                 start)
 701                         service nfsserver start
 702                         ;;
 703                 stop)
 704                         service nfsserver stop > /dev/null 2>&1
 705                         ;;
 706                 restart)
 707                         set_proc "fs/nfsd/threads" 0
 708                         service nfsserver stop > /dev/null 2>&1
 709                         pkill -9 nfsd
 710                         nfs_dump_some_threads
 711                         service nfsserver start
 712                         ;;
 713                 esac
 714                 ;;
 715         rhel)
 716                 case $1 in
 717                 start)
 718                         service nfslock start
 719                         service nfs start
 720                         ;;
 721                 stop)
 722                         service nfs stop
 723                         service nfslock stop
 724                         ;;
 725                 restart)
 726                         set_proc "fs/nfsd/threads" 0
 727                         service nfs stop > /dev/null 2>&1
 728                         service nfslock stop > /dev/null 2>&1
 729                         pkill -9 nfsd
 730                         nfs_dump_some_threads
 731                         service nfslock start
 732                         service nfs start
 733                         ;;
 734                 esac
 735                 ;;
 736         *)
 737                 echo "Unknown platform. NFS is not supported with ctdb"
 738                 exit 1
 739                 ;;
 740         esac
 741 }
 742
 743 # Dump up to the configured number of nfsd thread backtraces.
 744 nfs_dump_some_threads ()
 745 {
 746     [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || CTDB_NFS_DUMP_STUCK_THREADS=5
 747
 748     # Optimisation to avoid running an unnecessary pidof
 749     [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
 750
 751     _count=0
 752     for _pid in $(pidof nfsd) ; do
 753         [ $_count -le $CTDB_NFS_DUMP_STUCK_THREADS ] || break
 754
 755         # Do this first to avoid racing with thread exit
 756         _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
 757         if [ -n "$_stack" ] ; then
 758             echo "Stack trace for stuck nfsd thread [${_pid}]:"
 759             echo "$_stack"
 760             _count=$(($_count + 1))
 761         fi
 762     done
 763 }
 764
 765 ########################################################
 766 # start/stop the nfs lockmanager service on different platforms
 767 ########################################################
 768 startstop_nfslock() {
 769         PLATFORM="unknown"
 770         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 771                 PLATFORM="sles"
 772         }
 773         [ -x $CTDB_ETCDIR/init.d/nfslock -o \
 774             -r /usr/lib/systemd/system/nfs-lock.service ] && {
 775                 PLATFORM="rhel"
 776         }
 777
 778         case $PLATFORM in
 779         sles)
 780                 # for sles there is no service for lockmanager
 781                 # so we instead just shutdown/restart nfs
 782                 case $1 in
 783                 start)
 784                         service nfsserver start
 785                         ;;
 786                 stop)
 787                         service nfsserver stop > /dev/null 2>&1
 788                         ;;
 789                 restart)
 790                         service nfsserver stop > /dev/null 2>&1
 791                         service nfsserver start
 792                         ;;
 793                 esac
 794                 ;;
 795         rhel)
 796                 case $1 in
 797                 start)
 798                         service nfslock start
 799                         ;;
 800                 stop)
 801                         service nfslock stop > /dev/null 2>&1
 802                         ;;
 803                 restart)
 804                         service nfslock stop > /dev/null 2>&1
 805                         service nfslock start
 806                         ;;
 807                 esac
 808                 ;;
 809         *)
 810                 echo "Unknown platform. NFS locking is not supported with ctdb"
 811                 exit 1
 812                 ;;
 813         esac
 814 }
 815
 816 # Periodically update the statd database
 817 nfs_statd_update ()
 818 {
 819     _update_period="$1"
 820
 821     _statd_update_trigger="$service_state_dir/update-trigger"
 822     [ -f "$_statd_update_trigger" ] || touch "$_statd_update_trigger"
 823
 824     _last_update=$(stat --printf="%Y" "$_statd_update_trigger")
 825     _current_time=$(date +"%s")
 826     if [ $(( $_current_time - $_last_update)) -ge $_update_period ] ; then
 827         touch "$_statd_update_trigger"
 828         $CTDB_BASE/statd-callout updatelocal &
 829         $CTDB_BASE/statd-callout updateremote &
 830     fi
 831 }
 832
 833 ########################################################
 834
 835 add_ip_to_iface ()
 836 {
 837     _iface=$1
 838     _ip=$2
 839     _maskbits=$3
 840
 841     # Ensure interface is up
 842     ip link set "$_iface" up || \
 843         die "Failed to bringup interface $_iface"
 844
 845     ip addr add "$_ip/$_maskbits" brd + dev "$_iface" || {
 846         echo "Failed to add $_ip/$_maskbits on dev $_iface"
 847         return 1
 848     }
 849 }
 850
 851 delete_ip_from_iface()
 852 {
 853     _iface=$1
 854     _ip=$2
 855     _maskbits=$3
 856
 857     # This could be set globally for all interfaces but it is probably
 858     # better to avoid surprises, so limit it the interfaces where CTDB
 859     # has public IP addresses.  There isn't anywhere else convenient
 860     # to do this so just set it each time.  This is much cheaper than
 861     # remembering and re-adding secondaries.
 862     set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1
 863
 864     ip addr del "$_ip/$_maskbits" dev "$_iface" || {
 865         echo "Failed to del $_ip on dev $_iface"
 866         return 1
 867     }
 868 }
 869
 870 # If the given IP is hosted then print 2 items: maskbits and iface
 871 ip_maskbits_iface ()
 872 {
 873     _addr="$1"
 874
 875     ip addr show to "${_addr}/32" 2>/dev/null | \
 876         awk '$1 == "inet" { print gensub(".*/", "", 1, $2), $NF }'
 877 }
 878
 879 drop_ip ()
 880 {
 881     _addr="${1%/*}"  # Remove optional maskbits
 882
 883     set -- $(ip_maskbits_iface $_addr)
 884     if [ -n "$1" ] ; then
 885         _maskbits="$1"
 886         _iface="$2"
 887         echo "Removing public address $_addr/$_maskbits from device $_iface"
 888         delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
 889     fi
 890 }
 891
 892 drop_all_public_ips ()
 893 {
 894     while read _ip _x ; do
 895         drop_ip "$_ip"
 896     done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
 897 }
 898
 899 ########################################################
 900 # Simple counters
 901 _ctdb_counter_common () {
 902     _service_name="${1:-${service_name:-${script_name}}}"
 903     _counter_file="$ctdb_fail_dir/$_service_name"
 904     mkdir -p "${_counter_file%/*}" # dirname
 905 }
 906 ctdb_counter_init () {
 907     _ctdb_counter_common "$1"
 908
 909     >"$_counter_file"
 910 }
 911 ctdb_counter_incr () {
 912     _ctdb_counter_common "$1"
 913
 914     # unary counting!
 915     echo -n 1 >> "$_counter_file"
 916 }
 917 ctdb_check_counter () {
 918     _msg="${1:-error}"  # "error"  - anything else is silent on fail
 919     _op="${2:--ge}"  # an integer operator supported by test
 920     _limit="${3:-${service_fail_limit}}"
 921     shift 3
 922     _ctdb_counter_common "$1"
 923
 924     # unary counting!
 925     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
 926     _hit=false
 927     if [ "$_op" != "%" ] ; then
 928         if [ $_size $_op $_limit ] ; then
 929             _hit=true
 930         fi
 931     else
 932         if [ $(($_size $_op $_limit)) -eq 0 ] ; then
 933             _hit=true
 934         fi
 935     fi
 936     if $_hit ; then
 937         if [ "$_msg" = "error" ] ; then
 938             echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
 939             exit 1
 940         else
 941             return 1
 942         fi
 943     fi
 944 }
 945
 946 ########################################################
 947
 948 ctdb_status_dir="$CTDB_VARDIR/state/service_status"
 949 ctdb_fail_dir="$CTDB_VARDIR/state/failcount"
 950
 951 ctdb_setup_service_state_dir ()
 952 {
 953     service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
 954     mkdir -p "$service_state_dir" || {
 955         echo "Error creating state dir \"$service_state_dir\""
 956         exit 1
 957     }
 958 }
 959
 960 ########################################################
 961 # Managed status history, for auto-start/stop
 962
 963 ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"
 964
 965 _ctdb_managed_common ()
 966 {
 967     _ctdb_managed_file="$ctdb_managed_dir/$service_name"
 968 }
 969
 970 ctdb_service_managed ()
 971 {
 972     _ctdb_managed_common
 973     mkdir -p "$ctdb_managed_dir"
 974     touch "$_ctdb_managed_file"
 975 }
 976
 977 ctdb_service_unmanaged ()
 978 {
 979     _ctdb_managed_common
 980     rm -f "$_ctdb_managed_file"
 981 }
 982
 983 is_ctdb_previously_managed_service ()
 984 {
 985     _ctdb_managed_common
 986     [ -f "$_ctdb_managed_file" ]
 987 }
 988
 989 ########################################################
 990 # Check and set status
 991
 992 log_status_cat ()
 993 {
 994     echo "node is \"$1\", \"${script_name}\" reports problem: $(cat $2)"
 995 }
 996
 997 ctdb_checkstatus ()
 998 {
 999     if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
1000         log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
1001         return 1
1002     elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
1003         log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
1004         return 2
1005     else
1006         return 0
1007     fi
1008 }
1009
# Record or clear the status for $script_name.
#
# usage: ctdb_setstatus STATUS [FILE]
#
# For "unhealthy" or "banned" the contents of FILE are stored as the
# description of the problem.  Any other STATUS removes both status
# files.
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"

    case "$1" in
        unhealthy|banned)
            mkdir -p "$d"
            cat "$2" >"$d/$1"
            ;;
        *)
            for i in "banned" "unhealthy" ; do rm -f "$d/$i" ; done
            ;;
    esac
}
1025
1026 ##################################################################
1027 # Reconfigure a service on demand
1028
# Common setup for the reconfigure helpers: set $_d to the status
# directory of $service_name (creating it if needed) and set
# $_ctdb_service_reconfigure_flag to the "reconfigure" flag file in it.
_ctdb_service_reconfigure_common ()
{
    _d="$ctdb_status_dir/${service_name}"
    mkdir -p "$_d"
    _ctdb_service_reconfigure_flag="$_d/reconfigure"
}
1035
# Succeed (return 0) if the reconfigure flag file exists for
# $service_name, i.e. a reconfigure has been scheduled and not yet run.
ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common
    [ -e "$_ctdb_service_reconfigure_flag" ]
}
1041
# Schedule a reconfigure of $service_name by creating (or truncating)
# its reconfigure flag file.
ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common
    >"$_ctdb_service_reconfigure_flag"
}
1047
# Cancel any scheduled reconfigure of $service_name by removing its
# reconfigure flag file.  A missing flag is not an error (rm -f).
ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common
    rm -f "$_ctdb_service_reconfigure_flag"
}
1053
# Reconfigure $service_name now: clear the scheduled-reconfigure flag,
# run service_reconfigure and, if that succeeds, call
# ctdb_counter_init.  If service_reconfigure fails its status is
# returned and ctdb_counter_init is not called.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"${service_name}\"..."
    ctdb_service_unset_reconfigure

    if service_reconfigure ; then
        ctdb_counter_init
    else
        # $? still holds the status of service_reconfigure here.
        return $?
    fi
}
1061
# Default service_reconfigure() function does nothing and succeeds.
# It is called by ctdb_service_reconfigure.
# NOTE(review): presumably eventscripts redefine this, as they may for
# service_start/service_stop - confirm.
service_reconfigure ()
{
    :
}
1067
# Try to take the reconfigure lock of $service_name for the current
# event.  The lock file holds the event name on its first line followed
# by one PID per line.  Returns 0 if the lock was taken (the file now
# records $event_name and our PID) and non-zero if the file names a
# different process that is still running.
ctdb_reconfigure_take_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"
    mkdir -p "${_lock%/*}" # dirname
    touch "$_lock"

    # The subshell reads the lock file on stdin; its exit status is the
    # return value of this function.
    (
        # Serialise inspection/update of the lock file by locking fd 0,
        # which is the lock file itself.
        flock 0
        # This is overkill but will work if we need to extend this to
        # allow certain events to run multiple times in parallel
        # (e.g. takeip) and write multiple PIDs to the file.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                # A recorded PID other than ours that still exists
                # ("kill -0" succeeds) means the lock is held.
                if [ -n "$_pid" -a "$_pid" != $$ ] && \
                    kill -0 "$_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        # Lock file was empty, stale or already ours: record ourselves
        # as the holder.
        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}
1094
# Release the reconfigure lock of $service_name by removing the lock
# file.  No check is made that the caller is the recorded holder.
ctdb_reconfigure_release_lock ()
{
    _ctdb_service_reconfigure_common
    _lock="${_d}/reconfigure_lock"

    rm -f "$_lock"
}
1102
# Re-report the previous "monitor" result of $script_name instead of
# running a fresh check.  The result is looked up in the output of
# "ctdb scriptstatus -Y", any saved output is printed and the script
# exits with the saved return code (adjusted for TIMEDOUT and DISABLED,
# see below).  This function never returns to its caller.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading colon (':') is missing in some versions...
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
    # Output looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # This is the cheapest way of getting fields in the middle.
    # After splitting on ':' and re-splitting on whitespace the
    # positional parameters are: event, script, code, status, ...
    set -- $(IFS=":" ; echo $_out)
    _code="$3"
    _status="$4"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading ':') and fixed output.
    _out="${_out%:}"
    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"
    case "$_status" in
        OK) : ;;  # Do nothing special.
        TIMEDOUT)
            # Recast this as an error, since we can't exit with the
            # correct negative number.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Recast this as an OK, since we can't exit with the
            # correct negative number.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *) : ;;  # Must be ERROR, do nothing special.
    esac
    if [ -n "$_err_out" ] ; then
        echo "$_err_out"
    fi
    # NOTE(review): if no matching scriptstatus line was found $_code is
    # empty and this exits with the status of the previous command.
    exit $_code
}
1140
# Coordinate the "monitor", "ipreallocated" and "reconfigure" events of
# a service so that they do not run concurrently with a reconfigure.
# All other events return 0 immediately.
#
# If the reconfigure lock can be taken:
#   reconfigure   - reconfigure the service, release the lock and exit
#                   with the status of the reconfigure
#   ipreallocated - run any scheduled reconfigure, release the lock and
#                   return
#   monitor       - release the lock and return
# If the lock is held by another running process:
#   reconfigure   - exit 2 so the caller can retry
#   ipreallocated - return 0 without reconfiguring
#   monitor       - replay the previous monitor status and exit
ctdb_service_check_reconfigure ()
{
    assert_service_name

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_take_lock ; then
        # No events covered by this function are running, so proceed
        # with gay abandon.
        case "$event_name" in
            reconfigure)
                # The subshell keeps an "exit" in the reconfigure code
                # from ending this script before the lock is released.
                (ctdb_service_reconfigure)
                _ret=$?
                # Release the lock here too: leaving it behind would
                # make later lock attempts depend on our PID never
                # being reused by an unrelated process.
                ctdb_reconfigure_release_lock
                exit $_ret
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure ; then
                    ctdb_service_reconfigure
                fi
                ;;
        esac

        ctdb_reconfigure_release_lock
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                # This is true in the current code.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigure.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}
1195
1196 ##################################################################
1197 # Does CTDB manage this service? - and associated auto-start/stop
1198
# Backward compatibility helper: append a service to
# $CTDB_MANAGED_SERVICES when an old-style setting says it is managed.
#   $1 - value of the old-style setting (acted on only if it is "yes")
#   $2 - service name that setting refers to
# Nothing is appended unless $2 is the current $service_name.
ctdb_compat_managed_service ()
{
    [ "$1" = "yes" ] || return 0
    [ "$2" = "$service_name" ] || return 0

    CTDB_MANAGED_SERVICES="${CTDB_MANAGED_SERVICES} $2"
}
1205
# Succeed (return 0) if $service_name is listed in
# $CTDB_MANAGED_SERVICES.  If it is not listed, the old-style
# CTDB_MANAGES_* settings are folded into $CTDB_MANAGED_SERVICES (for
# backward compatibility) and the check is repeated.
is_ctdb_managed_service ()
{
    assert_service_name

    # $t is the service list padded with a space at each end so that a
    # whole-word match can be expressed as "<space>name<space>".
    t=" $CTDB_MANAGED_SERVICES "

    case "$t" in
        *" "${service_name}" "*) return 0 ;;
    esac

    # Not listed: update $CTDB_MANAGED_SERVICES from the old-style
    # settings and try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "

    case "$t" in
        *" "${service_name}" "*) return 0 ;;
        *) return 1 ;;
    esac
}
1236
# Start or stop $service_name according to the current event.
#
# The "service-start" and "service-stop" pseudo-events start/stop the
# service by hand and then exit; they are refused (via die) when the
# service is managed or when $CTDB_SERVICE_AUTOSTARTSTOP is "yes".
#
# For the "monitor" event with $CTDB_SERVICE_AUTOSTARTSTOP set to
# "yes", the service is started in the background if it has become
# managed, or stopped in the background if it is no longer managed; in
# both cases the script then exits.  In every other case the function
# returns 0 without doing anything.
ctdb_start_stop_service ()
{
    assert_service_name

    # Manual start/stop is only allowed when we're not
    # auto-starting/stopping and we're not monitoring the service.
    case "$event_name" in
        service-start)
            is_ctdb_managed_service && \
                die 'service-start event not permitted when service is managed'
            [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] && \
                die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            ctdb_service_start
            exit $?
            ;;
        service-stop)
            is_ctdb_managed_service && \
                die 'service-stop event not permitted when service is managed'
            [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] && \
                die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
            ctdb_service_stop
            exit $?
            ;;
    esac

    # Auto start/stop must be enabled...
    if [ "$CTDB_SERVICE_AUTOSTARTSTOP" != "yes" ] ; then
        return 0
    fi

    # ...and it only ever happens from the monitor event.
    if [ "$event_name" != "monitor" ] ; then
        return 0
    fi

    if is_ctdb_managed_service ; then
        # Managed now and before: nothing to do.
        is_ctdb_previously_managed_service && return 0

        echo "Starting service \"$service_name\" - now managed"
        background_with_logging ctdb_service_start
        exit $?
    fi

    # Unmanaged now and before: nothing to do.
    is_ctdb_previously_managed_service || return 0

    echo "Stopping service \"$service_name\" - no longer managed"
    background_with_logging ctdb_service_stop
    exit $?
}
1286
# Start the service via service_start.  The service is marked as
# managed before the start is attempted, so it counts as managed even
# if the start fails.  On success ctdb_counter_init and
# ctdb_check_tcp_init are called; on failure the status of
# service_start is returned.
ctdb_service_start ()
{
    ctdb_service_managed

    if service_start ; then
        ctdb_counter_init
        ctdb_check_tcp_init
    else
        # $? still holds the status of service_start here.
        return $?
    fi
}
1297
# Stop the service via service_stop, first marking it as no longer
# managed.  Returns the status of service_stop.
ctdb_service_stop ()
{
    ctdb_service_unmanaged
    service_stop
}
1303
1304 # Default service_start() and service_stop() functions.
1305
1306 # These may be overridden in an eventscript.
# Default implementation: start $service_name using the "service"
# command/function.
service_start ()
{
    service "$service_name" start
}
1311
# Default implementation: stop $service_name using the "service"
# command/function.
service_stop ()
{
    service "$service_name" stop
}
1316
1317 ##################################################################
1318
# Handle the events that every eventscript treats the same way.
#   status    - report the recorded status via ctdb_checkstatus and
#               exit with its return code
#   setstatus - pass the remaining arguments to ctdb_setstatus and
#               exit with its return code
# Any other event is left to the caller: the function just returns.
ctdb_standard_event_handler ()
{
    if [ "$1" = "status" ] ; then
        ctdb_checkstatus
        exit
    elif [ "$1" = "setstatus" ] ; then
        shift
        ctdb_setstatus "$@"
        exit
    fi
}
1333
# iptables doesn't like being re-entered, so flock-wrap it.
# Runs /sbin/iptables with the given arguments while holding a lock on
# $CTDB_VARDIR/iptables-ctdb.flock, waiting at most 30 seconds for the
# lock.  The lock path is quoted so that a $CTDB_VARDIR containing
# whitespace or glob characters is still passed as a single argument.
iptables()
{
        flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}
1339
# AIX (and perhaps others?) doesn't have mktemp
# Minimal replacement, defined only when no mktemp command is found.
#   mktemp      - create an empty file and print its name
#   mktemp -d   - create a directory and print its name
# The file/directory is created under ${TMPDIR:-/tmp} with a umask of
# 077.  Any other arguments (e.g. a template) are ignored.  Returns
# non-zero and prints nothing if the file/directory could not be
# created, including when the chosen name already exists.
if ! which mktemp >/dev/null 2>&1 ; then
    mktemp ()
    {
        _dir=false
        if [ "$1" = "-d" ] ; then
            _dir=true
            shift
        fi
        _d="${TMPDIR:-/tmp}"
        # First 10 hex digits of the MD5 of some random data.
        _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
            md5sum | \
            sed -e 's@\(..........\).*@\1@')
        _t="${_d}/tmp.${_hex10}"
        (
            umask 077
            if $_dir ; then
                mkdir "$_t"
            else
                # noclobber: fail instead of truncating a file that
                # somebody else already created under this name.
                set -C
                : >"$_t"
            fi
        ) || return 1
        echo "$_t"
    }
fi
1365
1366 ########################################################
1367 # tickle handling
1368 ########################################################
1369
# Bring CTDB's tickle list for a TCP port in line with the connections
# currently established to the public IPs held by this node.
#   $1 - local TCP port to examine
# Connections seen by netstat that have no tickle are added with
# "ctdb addtickle"; tickles without a matching connection are removed
# with "ctdb deltickle".  Work files are kept in
# $CTDB_VARDIR/state/tickles and removed at the end.
update_tickles ()
{
        _port="$1"

        tickledir="$CTDB_VARDIR/state/tickles"
        mkdir -p "$tickledir"

        # Who am I?
        _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

        # What public IPs do I hold?
        _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')

        # IPs as a regexp choice
        _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

        # Record connections to our public IPs in a temporary file
        # Each line is "<remote addr:port> <local addr:port>", sorted so
        # that it can be compared with comm below.
        _my_connections="${tickledir}/${_port}.connections"
        rm -f "$_my_connections"
        netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
          '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

        # Record our current tickles in a temporary file
        # The first line of each gettickles output is skipped (NR > 1)
        # and the rest is printed as two "addr:port" pairs per line.
        _my_tickles="${tickledir}/${_port}.tickles"
        rm -f "$_my_tickles"
        for _i in $_ips ; do
                ctdb -Y gettickles $_i $_port |
                awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
        done |
        sort >"$_my_tickles"

        # Add tickles for connections that we haven't already got tickles for
        comm -23 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb addtickle $_src $_dst
        done

        # Remove tickles for connections that are no longer there
        comm -13 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
                ctdb deltickle $_src $_dst
        done

        rm -f "$_my_connections" "$_my_tickles"
}
1417
1418 ########################################################
1419 # load a site local config file
1420 ########################################################
1421
# Source site-local configuration, in this order:
#   1. the file named by $CTDB_RC_LOCAL, if set and executable
#   2. $CTDB_BASE/rc.local, if executable
#   3. every executable file in $CTDB_BASE/rc.local.d
# Expansions are quoted so that paths containing whitespace or glob
# characters are tested and sourced as a single word.
[ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] && {
        . "$CTDB_RC_LOCAL"
}

[ -x "$CTDB_BASE/rc.local" ] && {
        . "$CTDB_BASE/rc.local"
}

[ -d "$CTDB_BASE/rc.local.d" ] && {
        for i in "$CTDB_BASE"/rc.local.d/* ; do
                [ -x "$i" ] && . "$i"
        done
}
1435
1436 script_name="${0##*/}"       # basename
1437 service_fail_limit=1
1438 event_name="$1"