ctdb/config/functions

   1 # Hey Emacs, this is a -*- shell-script -*- !!!
   2
   3 # utility functions for ctdb event scripts
   4
   5 [ -z "$CTDB_VARDIR" ] && {
   6     if [ -d "/var/lib/ctdb" ] ; then
   7         export CTDB_VARDIR="/var/lib/ctdb"
   8     else
   9         export CTDB_VARDIR="/var/ctdb"
  10     fi
  11 }
  12 [ -z "$CTDB_ETCDIR" ] && {
  13     export CTDB_ETCDIR="/etc"
  14 }
  15
  16 #######################################
  17 # pull in a system config file, if any
  18 _loadconfig() {
  19
  20     if [ -z "$1" ] ; then
  21         foo="${service_config:-${service_name}}"
  22         if [ -n "$foo" ] ; then
  23             loadconfig "$foo"
  24             return
  25         fi
  26     fi
  27
  28     if [ "$1" != "ctdb" ] ; then
  29         loadconfig "ctdb"
  30     fi
  31
  32     if [ -z "$1" ] ; then
  33         return
  34     fi
  35
  36     if [ -f $CTDB_ETCDIR/sysconfig/$1 ]; then
  37         . $CTDB_ETCDIR/sysconfig/$1
  38     elif [ -f $CTDB_ETCDIR/default/$1 ]; then
  39         . $CTDB_ETCDIR/default/$1
  40     elif [ -f $CTDB_BASE/sysconfig/$1 ]; then
  41         . $CTDB_BASE/sysconfig/$1
  42     fi
  43
  44     if [ "$1" = "ctdb" ] ; then
  45         _config="${CTDB_BASE}/ctdbd.conf"
  46         if [ -r "$_config" ] ; then
  47             . "$_config"
  48         fi
  49     fi
  50 }
  51
  52 loadconfig () {
  53     _loadconfig "$@"
  54 }
  55
  56 ##############################################################
  57
  58 # CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
  59 # configuration file.
  60 debug ()
  61 {
  62     if [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
  63         # If there are arguments then echo them.  Otherwise expect to
  64         # use stdin, which allows us to pass lots of debug using a
  65         # here document.
  66         if [ -n "$1" ] ; then
  67             echo "DEBUG: $*"
  68         elif ! tty -s ; then
  69             sed -e 's@^@DEBUG: @'
  70         fi
  71     fi
  72 }
  73
  74 die ()
  75 {
  76     _msg="$1"
  77     _rc="${2:-1}"
  78
  79     echo "$_msg"
  80     exit $_rc
  81 }
  82
  83 # Log given message or stdin to either syslog or a CTDB log file
  84 # $1 is the tag passed to logger if syslog is in use.
  85 script_log ()
  86 {
  87     _tag="$1" ; shift
  88
  89     if [ "$CTDB_SYSLOG" = "yes" ] ; then
  90         logger -t "ctdbd: ${_tag}" $*
  91     else
  92         {
  93             if [ -n "$*" ] ; then
  94                 echo "$*"
  95             else
  96                 cat
  97             fi
  98         } >>"${CTDB_LOGFILE:-/var/log/log.ctdb}"
  99     fi
 100 }
 101
 102 # When things are run in the background in an eventscript then logging
 103 # output might get lost.  This is the "solution".  :-)
 104 background_with_logging ()
 105 {
 106     (
 107         "$@" 2>&1 </dev/null |
 108         script_log "${script_name}&"
 109     )&
 110
 111     return 0
 112 }
 113
 114 ##############################################################
 115 # check number of args for different events
 116 ctdb_check_args ()
 117 {
 118     case "$1" in
 119         takeip|releaseip)
 120             if [ $# != 4 ]; then
 121                 echo "ERROR: must supply interface, IP and maskbits"
 122                 exit 1
 123             fi
 124             ;;
 125         updateip)
 126             if [ $# != 5 ]; then
 127                 echo "ERROR: must supply old interface, new interface, IP and maskbits"
 128                 exit 1
 129             fi
 130             ;;
 131     esac
 132 }
 133
 134 ##############################################################
 135 # determine on what type of system (init style) we are running
 136 detect_init_style()
 137 {
 138     # only do detection if not already set:
 139     [ -z "$CTDB_INIT_STYLE" ] || return
 140
 141     if [ -x /sbin/startproc ]; then
 142         CTDB_INIT_STYLE="suse"
 143     elif [ -x /sbin/start-stop-daemon ]; then
 144         CTDB_INIT_STYLE="debian"
 145     else
 146         CTDB_INIT_STYLE="redhat"
 147     fi
 148 }
 149
 150 ######################################################
 151 # simulate /sbin/service on platforms that don't have it
 152 # _service() makes it easier to hook the service() function for
 153 # testing.
 154 _service ()
 155 {
 156   _service_name="$1"
 157   _op="$2"
 158
 159   # do nothing, when no service was specified
 160   [ -z "$_service_name" ] && return
 161
 162   if [ -x /sbin/service ]; then
 163       $_nice /sbin/service "$_service_name" "$_op"
 164   elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then
 165       $_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
 166   elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
 167       $_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
 168   fi
 169 }
 170
 171 service()
 172 {
 173     _nice=""
 174     _service "$@"
 175 }
 176
 177 ######################################################
 178 # simulate /sbin/service (niced) on platforms that don't have it
 179 nice_service()
 180 {
 181     _nice="nice"
 182     _service "$@"
 183 }
 184
 185 ######################################################
 186 # wrapper around /proc/ settings to allow them to be hooked
 187 # for testing
 188 # 1st arg is relative path under /proc/, 2nd arg is value to set
 189 set_proc ()
 190 {
 191     echo "$2" >"/proc/$1"
 192 }
 193
 194 ######################################################
 195 # wrapper around getting file contents from /proc/ to allow
 196 # this to be hooked for testing
 197 # 1st arg is relative path under /proc/
 198 get_proc ()
 199 {
 200     cat "/proc/$1"
 201 }
 202
 203 ######################################################
 204 # Check that an RPC service is healthy -
 205 # this includes allowing a certain number of failures
 206 # before marking the NFS service unhealthy.
 207 #
 208 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
 209 #
 210 # each triple is a set of 3 arguments: an operator, a
 211 # fail count limit and an action string.
 212 #
 213 # For example:
 214 #
 215 #       nfs_check_rpc_service "lockd" \
 216 #           -ge 15 "verbose restart unhealthy" \
 217 #           -eq 10 "restart:bs"
 218 #
 219 # says that if lockd is down for 15 iterations then do
 220 # a verbose restart of lockd and mark the node unhealthy.
 221 # Before this, after 10 iterations of failure, the
 222 # service is restarted silently in the background.
 223 # Order is important: the number of failures need to be
 224 # specified in reverse order because processing stops
 225 # after the first condition that is true.
 226 ######################################################
 227 nfs_check_rpc_service ()
 228 {
 229     _prog_name="$1" ; shift
 230
 231     if _nfs_check_rpc_common "$_prog_name" ; then
 232         return
 233     fi
 234
 235     while [ -n "$3" ] ; do
 236         if _nfs_check_rpc_action "$1" "$2" "$3" ; then
 237             break
 238         fi
 239         shift 3
 240     done
 241 }
 242
 243 # The new way of doing things...
 244 nfs_check_rpc_services ()
 245 {
 246     # Files must end with .check - avoids editor backups, RPM fu, ...
 247     for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
 248         _t="${_f%.check}"
 249         _prog_name="${_t##*/[0-9][0-9].}"
 250
 251         if _nfs_check_rpc_common "$_prog_name" ; then
 252             # This RPC service is up, check next service...
 253             continue
 254         fi
 255
 256         # Check each line in the file in turn until one of the limit
 257         # checks is hit...
 258         while read _cmp _lim _rest ; do
 259             # Skip comments
 260             case "$_cmp" in
 261                 \#*) continue ;;
 262             esac
 263
 264             if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
 265                 # Limit was hit on this line, no further checking...
 266                 break
 267             fi
 268         done <"$_f"
 269     done
 270 }
 271
 272 _nfs_check_rpc_common ()
 273 {
 274     _prog_name="$1"
 275
 276     # Some platforms don't have separate programs for all services.
 277     case "$_prog_name" in
 278         statd)
 279             which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
 280     esac
 281
 282     case "$_prog_name" in
 283         nfsd)
 284             _rpc_prog=nfs
 285             _version=3
 286             ;;
 287         mountd)
 288             _rpc_prog=mountd
 289             _version=1
 290             ;;
 291         rquotad)
 292             _rpc_prog=rquotad
 293             _version=1
 294             ;;
 295         lockd)
 296             _rpc_prog=nlockmgr
 297             _version=4
 298             ;;
 299         statd)
 300             _rpc_prog=status
 301             _version=1
 302             ;;
 303         *)
 304             echo "Internal error: unknown RPC program \"$_prog_name\"."
 305             exit 1
 306     esac
 307
 308     _service_name="nfs_${_prog_name}"
 309
 310     if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
 311         ctdb_counter_init "$_service_name"
 312         return 0
 313     fi
 314
 315     ctdb_counter_incr "$_service_name"
 316
 317     return 1
 318 }
 319
 320 _nfs_check_rpc_action ()
 321 {
 322     _cmp="$1"
 323     _limit="$2"
 324     _actions="$3"
 325
 326     if ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" ; then
 327         return 1
 328     fi
 329
 330     for _action in $_actions ; do
 331         case "$_action" in
 332             verbose)
 333                 echo "$ctdb_check_rpc_out"
 334                 ;;
 335             restart)
 336                 _nfs_restart_rpc_service "$_prog_name"
 337                 ;;
 338             restart:b)
 339                 _nfs_restart_rpc_service "$_prog_name" true
 340                 ;;
 341             unhealthy)
 342                 exit 1
 343                 ;;
 344             *)
 345                 echo "Internal error: unknown action \"$_action\"."
 346                 exit 1
 347         esac
 348     done
 349
 350     return 0
 351 }
 352
 353 _nfs_restart_rpc_service ()
 354 {
 355     _prog_name="$1"
 356     _background="${2:-false}"
 357
 358     if $_background ; then
 359         _maybe_background="background_with_logging"
 360     else
 361         _maybe_background=""
 362     fi
 363
 364     _p="rpc.${_prog_name}"
 365
 366     case "$_prog_name" in
 367         nfsd)
 368             echo "Trying to restart NFS service"
 369             $_maybe_background startstop_nfs restart
 370             ;;
 371         mountd)
 372             echo "Trying to restart $_prog_name [${_p}]"
 373             killall -q -9 "$_p"
 374             $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT
 375             ;;
 376         rquotad)
 377             echo "Trying to restart $_prog_name [${_p}]"
 378             killall -q -9 "$_p"
 379             $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
 380             ;;
 381         lockd)
 382             echo "Trying to restart lock manager service"
 383             $_maybe_background startstop_nfslock restart
 384             ;;
 385         statd)
 386             echo "Trying to restart $_prog_name [${_p}]"
 387             killall -q -9 "$_p"
 388             $_maybe_background $_p \
 389                 ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
 390                 ${STATD_PORT:+-p} $STATD_PORT \
 391                 ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
 392             ;;
 393         *)
 394             echo "Internal error: unknown RPC program \"$_prog_name\"."
 395             exit 1
 396     esac
 397 }
 398
 399 ######################################################
 400 # check that a rpc server is registered with portmap
 401 # and responding to requests
 402 # usage: ctdb_check_rpc SERVICE_NAME VERSION
 403 ######################################################
 404 ctdb_check_rpc ()
 405 {
 406     progname="$1"
 407     version="$2"
 408
 409     _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"
 410
 411     if ! ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) ; then
 412         ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 413 $ctdb_check_rpc_out"
 414         echo "$ctdb_check_rpc_out"
 415         return 1
 416     fi
 417 }
 418
 419 ######################################################
 420 # Ensure $service_name is set
 421 assert_service_name ()
 422 {
 423     [ -n "$service_name" ] || die "INTERNAL ERROR: \$service_name not set"
 424 }
 425
 426 ######################################################
 427 # check a set of directories is available
 428 # return 1 on a missing directory
 429 # directories are read from stdin
 430 ######################################################
 431 ctdb_check_directories_probe()
 432 {
 433     while IFS="" read d ; do
 434         case "$d" in
 435             *%*)
 436                 continue
 437                 ;;
 438             *)
 439                 [ -d "${d}/." ] || return 1
 440         esac
 441     done
 442 }
 443
 444 ######################################################
 445 # check a set of directories is available
 446 # directories are read from stdin
 447 ######################################################
 448 ctdb_check_directories()
 449 {
 450     ctdb_check_directories_probe || {
 451         echo "ERROR: $service_name directory \"$d\" not available"
 452         exit 1
 453     }
 454 }
 455
 456 ######################################################
 457 # check a set of tcp ports
 458 # usage: ctdb_check_tcp_ports <ports...>
 459 ######################################################
 460
 461 # This flag file is created when a service is initially started.  It
 462 # is deleted the first time TCP port checks for that service succeed.
 463 # Until then ctdb_check_tcp_ports() prints a more subtle "error"
 464 # message if a port check fails.
 465 _ctdb_check_tcp_common ()
 466 {
 467     assert_service_name
 468     _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
 469 }
 470
 471 ctdb_check_tcp_init ()
 472 {
 473     _ctdb_check_tcp_common
 474     mkdir -p "${_ctdb_service_started_file%/*}" # dirname
 475     touch "$_ctdb_service_started_file"
 476 }
 477
 478 # Check whether something is listening on all of the given TCP ports
 479 # using the "ctdb checktcpport" command.
 480 ctdb_check_tcp_ports()
 481 {
 482     if [ -z "$1" ] ; then
 483         echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
 484         exit 1
 485     fi
 486
 487     for _p ; do  # process each function argument (port)
 488         _cmd="ctdb checktcpport $_p"
 489         _out=$($_cmd 2>&1)
 490         _ret=$?
 491         case "$_ret" in
 492             0)
 493                 _ctdb_check_tcp_common
 494                 if [ ! -f "$_ctdb_service_started_file" ] ; then
 495                     echo "ERROR: $service_name tcp port $_p is not responding"
 496                     debug "\"ctdb checktcpport $_p\" was able to bind to port"
 497                 else
 498                     echo "INFO: $service_name tcp port $_p is not responding"
 499                 fi
 500
 501                 return 1
 502                 ;;
 503             98)
 504                 # Couldn't bind, something already listening, next port...
 505                 continue
 506                 ;;
 507             *)
 508                 echo "ERROR: unexpected error running \"ctdb checktcpport\""
 509                 debug <<EOF
 510 ctdb checktcpport (exited with $_ret) with output:
 511 $_out"
 512 EOF
 513                 return $_ret
 514         esac
 515     done
 516
 517     # All ports listening
 518     _ctdb_check_tcp_common
 519     rm -f "$_ctdb_service_started_file"
 520     return 0
 521 }
 522
 523 ######################################################
 524 # check a unix socket
 525 # usage: ctdb_check_unix_socket SERVICE_NAME <socket_path>
 526 ######################################################
 527 ctdb_check_unix_socket() {
 528     socket_path="$1"
 529     [ -z "$socket_path" ] && return
 530
 531     if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
 532         echo "ERROR: $service_name socket $socket_path not found"
 533         return 1
 534     fi
 535 }
 536
 537 ######################################################
 538 # check a command returns zero status
 539 # usage: ctdb_check_command <command>
 540 ######################################################
 541 ctdb_check_command ()
 542 {
 543     _out=$("$@" 2>&1) || {
 544         echo "ERROR: $* returned error"
 545         echo "$_out" | debug
 546         exit 1
 547     }
 548 }
 549
 550 ################################################
 551 # kill off any TCP connections with the given IP
 552 ################################################
 553 kill_tcp_connections ()
 554 {
 555     _ip="$1"
 556
 557     _oneway=false
 558     if [ "$2" = "oneway" ] ; then
 559         _oneway=true
 560     fi
 561
 562     get_tcp_connections_for_ip "$_ip" | {
 563         _killcount=0
 564         _connections=""
 565         _nl="
 566 "
 567         while read _dst _src; do
 568             _destport="${_dst##*:}"
 569             __oneway=$_oneway
 570             case $_destport in
 571                 # we only do one-way killtcp for CIFS
 572                 139|445) __oneway=true ;;
 573             esac
 574
 575             echo "Killing TCP connection $_src $_dst"
 576             _connections="${_connections}${_nl}${_src} ${_dst}"
 577             if ! $__oneway ; then
 578                 _connections="${_connections}${_nl}${_dst} ${_src}"
 579             fi
 580
 581             _killcount=$(($_killcount + 1))
 582         done
 583
 584         if [ $_killcount -eq 0 ] ; then
 585             return
 586         fi
 587
 588         echo "$_connections" | ctdb killtcp || {
 589             echo "Failed to send killtcp control"
 590             return
 591         }
 592
 593         _count=0
 594         while : ; do
 595             _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)
 596
 597             if [ $_remaining -eq 0 ] ; then
 598                 echo "Killed $_killcount TCP connections to released IP $_ip"
 599                 return
 600             fi
 601
 602             _count=$(($_count + 1))
 603             if [ $_count -gt 3 ] ; then
 604                 echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
 605                 return
 606             fi
 607
 608             echo "Waiting for $_remaining connections to be killed for IP $_ip"
 609             sleep 1
 610         done
 611     }
 612 }
 613
 614 ##################################################################
 615 # kill off the local end for any TCP connections with the given IP
 616 ##################################################################
 617 kill_tcp_connections_local_only ()
 618 {
 619     kill_tcp_connections "$1" "oneway"
 620 }
 621
 622 ##################################################################
 623 # tickle any TCP connections with the given IP
 624 ##################################################################
 625 tickle_tcp_connections ()
 626 {
 627     _ip="$1"
 628
 629     get_tcp_connections_for_ip "$_ip" |
 630     {
 631         _failed=false
 632
 633         while read dest src; do
 634             echo "Tickle TCP connection $src $dest"
 635             ctdb tickle $src $dest >/dev/null 2>&1 || _failed=true
 636             echo "Tickle TCP connection $dest $src"
 637             ctdb tickle $dest $src >/dev/null 2>&1 || _failed=true
 638         done
 639
 640         if $_failed ; then
 641             echo "Failed to send tickle control"
 642         fi
 643     }
 644 }
 645
 646 get_tcp_connections_for_ip ()
 647 {
 648     _ip="$1"
 649
 650     netstat -tn | awk -v ip=$_ip \
 651         'index($1, "tcp") == 1 && \
 652          (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) \
 653          && $6 == "ESTABLISHED" \
 654          {print $4" "$5}'
 655 }
 656
 657 ########################################################
 658 # start/stop the Ganesha nfs service
 659 ########################################################
 660 startstop_ganesha()
 661 {
 662     _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
 663     case "$1" in
 664         start)
 665             service "$_service_name" start
 666             ;;
 667         stop)
 668             service "$_service_name" stop
 669             ;;
 670         restart)
 671             service "$_service_name" restart
 672             ;;
 673     esac
 674 }
 675
 676 ########################################################
 677 # start/stop the nfs service on different platforms
 678 ########################################################
 679 startstop_nfs() {
 680         PLATFORM="unknown"
 681         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 682                 PLATFORM="sles"
 683         }
 684         [ -x $CTDB_ETCDIR/init.d/nfslock -o \
 685             -r /usr/lib/systemd/system/nfs-lock.service ] && {
 686                 PLATFORM="rhel"
 687         }
 688
 689         case $PLATFORM in
 690         sles)
 691                 case $1 in
 692                 start)
 693                         service nfsserver start
 694                         ;;
 695                 stop)
 696                         service nfsserver stop > /dev/null 2>&1
 697                         ;;
 698                 restart)
 699                         set_proc "fs/nfsd/threads" 0
 700                         service nfsserver stop > /dev/null 2>&1
 701                         pkill -9 nfsd
 702                         nfs_dump_some_threads
 703                         service nfsserver start
 704                         ;;
 705                 esac
 706                 ;;
 707         rhel)
 708                 case $1 in
 709                 start)
 710                         service nfslock start
 711                         service nfs start
 712                         ;;
 713                 stop)
 714                         service nfs stop
 715                         service nfslock stop
 716                         ;;
 717                 restart)
 718                         set_proc "fs/nfsd/threads" 0
 719                         service nfs stop > /dev/null 2>&1
 720                         service nfslock stop > /dev/null 2>&1
 721                         pkill -9 nfsd
 722                         nfs_dump_some_threads
 723                         service nfslock start
 724                         service nfs start
 725                         ;;
 726                 esac
 727                 ;;
 728         *)
 729                 echo "Unknown platform. NFS is not supported with ctdb"
 730                 exit 1
 731                 ;;
 732         esac
 733 }
 734
 735 # Dump up to the configured number of nfsd thread backtraces.
 736 nfs_dump_some_threads ()
 737 {
 738     [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || CTDB_NFS_DUMP_STUCK_THREADS=5
 739
 740     # Optimisation to avoid running an unnecessary pidof
 741     [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
 742
 743     _count=0
 744     for _pid in $(pidof nfsd) ; do
 745         [ $_count -le $CTDB_NFS_DUMP_STUCK_THREADS ] || break
 746
 747         # Do this first to avoid racing with thread exit
 748         _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
 749         if [ -n "$_stack" ] ; then
 750             echo "Stack trace for stuck nfsd thread [${_pid}]:"
 751             echo "$_stack"
 752             _count=$(($_count + 1))
 753         fi
 754     done
 755 }
 756
 757 ########################################################
 758 # start/stop the nfs lockmanager service on different platforms
 759 ########################################################
 760 startstop_nfslock() {
 761         PLATFORM="unknown"
 762         [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
 763                 PLATFORM="sles"
 764         }
 765         [ -x $CTDB_ETCDIR/init.d/nfslock -o \
 766             -r /usr/lib/systemd/system/nfs-lock.service ] && {
 767                 PLATFORM="rhel"
 768         }
 769
 770         case $PLATFORM in
 771         sles)
 772                 # for sles there is no service for lockmanager
 773                 # so we instead just shutdown/restart nfs
 774                 case $1 in
 775                 start)
 776                         service nfsserver start
 777                         ;;
 778                 stop)
 779                         service nfsserver stop > /dev/null 2>&1
 780                         ;;
 781                 restart)
 782                         service nfsserver stop > /dev/null 2>&1
 783                         service nfsserver start
 784                         ;;
 785                 esac
 786                 ;;
 787         rhel)
 788                 case $1 in
 789                 start)
 790                         service nfslock start
 791                         ;;
 792                 stop)
 793                         service nfslock stop > /dev/null 2>&1
 794                         ;;
 795                 restart)
 796                         service nfslock stop > /dev/null 2>&1
 797                         service nfslock start
 798                         ;;
 799                 esac
 800                 ;;
 801         *)
 802                 echo "Unknown platform. NFS locking is not supported with ctdb"
 803                 exit 1
 804                 ;;
 805         esac
 806 }
 807
 808 # Periodically update the statd database
 809 nfs_statd_update ()
 810 {
 811     _update_period="$1"
 812
 813     _statd_update_trigger="$service_state_dir/update-trigger"
 814     [ -f "$_statd_update_trigger" ] || touch "$_statd_update_trigger"
 815
 816     _last_update=$(stat --printf="%Y" "$_statd_update_trigger")
 817     _current_time=$(date +"%s")
 818     if [ $(( $_current_time - $_last_update)) -ge $_update_period ] ; then
 819         touch "$_statd_update_trigger"
 820         $CTDB_BASE/statd-callout updatelocal &
 821         $CTDB_BASE/statd-callout updateremote &
 822     fi
 823 }
 824
 825 ########################################################
 826
 827 add_ip_to_iface ()
 828 {
 829     _iface=$1
 830     _ip=$2
 831     _maskbits=$3
 832
 833     # Ensure interface is up
 834     ip link set "$_iface" up || \
 835         die "Failed to bringup interface $_iface"
 836
 837     ip addr add "$_ip/$_maskbits" brd + dev "$_iface" || {
 838         echo "Failed to add $_ip/$_maskbits on dev $_iface"
 839         return 1
 840     }
 841 }
 842
 843 delete_ip_from_iface()
 844 {
 845     _iface=$1
 846     _ip=$2
 847     _maskbits=$3
 848
 849     # This could be set globally for all interfaces but it is probably
 850     # better to avoid surprises, so limit it the interfaces where CTDB
 851     # has public IP addresses.  There isn't anywhere else convenient
 852     # to do this so just set it each time.  This is much cheaper than
 853     # remembering and re-adding secondaries.
 854     set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1
 855
 856     ip addr del "$_ip/$_maskbits" dev "$_iface" || {
 857         echo "Failed to del $_ip on dev $_iface"
 858         return 1
 859     }
 860 }
 861
 862 # If the given IP is hosted then print 2 items: maskbits and iface
 863 ip_maskbits_iface ()
 864 {
 865     _addr="$1"
 866
 867     ip addr show to "${_addr}/32" 2>/dev/null | \
 868         awk '$1 == "inet" { print gensub(".*/", "", 1, $2), $NF }'
 869 }
 870
 871 drop_ip ()
 872 {
 873     _addr="${1%/*}"  # Remove optional maskbits
 874
 875     set -- $(ip_maskbits_iface $_addr)
 876     if [ -n "$1" ] ; then
 877         _maskbits="$1"
 878         _iface="$2"
 879         echo "Removing public address $_addr/$_maskbits from device $_iface"
 880         delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
 881     fi
 882 }
 883
 884 drop_all_public_ips ()
 885 {
 886     while read _ip _x ; do
 887         drop_ip "$_ip"
 888     done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
 889 }
 890
 891 ########################################################
 892 # Simple counters
 893 _ctdb_counter_common () {
 894     _service_name="${1:-${service_name:-${script_name}}}"
 895     _counter_file="$ctdb_fail_dir/$_service_name"
 896     mkdir -p "${_counter_file%/*}" # dirname
 897 }
 898 ctdb_counter_init () {
 899     _ctdb_counter_common "$1"
 900
 901     >"$_counter_file"
 902 }
 903 ctdb_counter_incr () {
 904     _ctdb_counter_common "$1"
 905
 906     # unary counting!
 907     echo -n 1 >> "$_counter_file"
 908 }
 909 ctdb_check_counter () {
 910     _msg="${1:-error}"  # "error"  - anything else is silent on fail
 911     _op="${2:--ge}"  # an integer operator supported by test
 912     _limit="${3:-${service_fail_limit}}"
 913     shift 3
 914     _ctdb_counter_common "$1"
 915
 916     # unary counting!
 917     _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
 918     _hit=false
 919     if [ "$_op" != "%" ] ; then
 920         if [ $_size $_op $_limit ] ; then
 921             _hit=true
 922         fi
 923     else
 924         if [ $(($_size $_op $_limit)) -eq 0 ] ; then
 925             _hit=true
 926         fi
 927     fi
 928     if $_hit ; then
 929         if [ "$_msg" = "error" ] ; then
 930             echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
 931             exit 1
 932         else
 933             return 1
 934         fi
 935     fi
 936 }
 937
 938 ########################################################
 939
 940 ctdb_status_dir="$CTDB_VARDIR/state/service_status"
 941 ctdb_fail_dir="$CTDB_VARDIR/state/failcount"
 942
 943 ctdb_setup_service_state_dir ()
 944 {
 945     service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
 946     mkdir -p "$service_state_dir" || {
 947         echo "Error creating state dir \"$service_state_dir\""
 948         exit 1
 949     }
 950 }
 951
 952 ########################################################
 953 # Managed status history, for auto-start/stop
 954
 955 ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"
 956
 957 _ctdb_managed_common ()
 958 {
 959     _ctdb_managed_file="$ctdb_managed_dir/$service_name"
 960 }
 961
 962 ctdb_service_managed ()
 963 {
 964     _ctdb_managed_common
 965     mkdir -p "$ctdb_managed_dir"
 966     touch "$_ctdb_managed_file"
 967 }
 968
 969 ctdb_service_unmanaged ()
 970 {
 971     _ctdb_managed_common
 972     rm -f "$_ctdb_managed_file"
 973 }
 974
 975 is_ctdb_previously_managed_service ()
 976 {
 977     _ctdb_managed_common
 978     [ -f "$_ctdb_managed_file" ]
 979 }
 980
 981 ########################################################
 982 # Check and set status
 983
 984 log_status_cat ()
 985 {
 986     echo "node is \"$1\", \"${script_name}\" reports problem: $(cat $2)"
 987 }
 988
 989 ctdb_checkstatus ()
 990 {
 991     if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
 992         log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
 993         return 1
 994     elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
 995         log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
 996         return 2
 997     else
 998         return 0
 999     fi
1000 }
1001
1002 ctdb_setstatus ()
1003 {
1004     d="$ctdb_status_dir/$script_name"
1005     case "$1" in
1006         unhealthy|banned)
1007             mkdir -p "$d"
1008             cat "$2" >"$d/$1"
1009             ;;
1010         *)
1011             for i in "banned" "unhealthy" ; do
1012                 rm -f "$d/$i"
1013             done
1014             ;;
1015     esac
1016 }
1017
1018 ##################################################################
1019 # Reconfigure a service on demand
1020
1021 _ctdb_service_reconfigure_common ()
1022 {
1023     _d="$ctdb_status_dir/${service_name}"
1024     mkdir -p "$_d"
1025     _ctdb_service_reconfigure_flag="$_d/reconfigure"
1026 }
1027
1028 ctdb_service_needs_reconfigure ()
1029 {
1030     _ctdb_service_reconfigure_common
1031     [ -e "$_ctdb_service_reconfigure_flag" ]
1032 }
1033
1034 ctdb_service_set_reconfigure ()
1035 {
1036     _ctdb_service_reconfigure_common
1037     >"$_ctdb_service_reconfigure_flag"
1038 }
1039
1040 ctdb_service_unset_reconfigure ()
1041 {
1042     _ctdb_service_reconfigure_common
1043     rm -f "$_ctdb_service_reconfigure_flag"
1044 }
1045
1046 ctdb_service_reconfigure ()
1047 {
1048     echo "Reconfiguring service \"${service_name}\"..."
1049     ctdb_service_unset_reconfigure
1050     service_reconfigure || return $?
1051     ctdb_counter_init
1052 }
1053
1054 # Default service_reconfigure() function does nothing.
1055 service_reconfigure ()
1056 {
1057     :
1058 }
1059
1060 ctdb_reconfigure_take_lock ()
1061 {
1062     _ctdb_service_reconfigure_common
1063     _lock="${_d}/reconfigure_lock"
1064     mkdir -p "${_lock%/*}" # dirname
1065     touch "$_lock"
1066
1067     (
1068         flock 0
1069         # This is overkill but will work if we need to extend this to
1070         # allow certain events to run multiple times in parallel
1071         # (e.g. takeip) and write multiple PIDs to the file.
1072         read _locker_event
1073         if [ -n "$_locker_event" ] ; then
1074             while read _pid ; do
1075                 if [ -n "$_pid" -a "$_pid" != $$ ] && \
1076                     kill -0 "$_pid" 2>/dev/null ; then
1077                     exit 1
1078                 fi
1079             done
1080         fi
1081
1082         printf "%s\n%s\n" "$event_name" $$ >"$_lock"
1083         exit 0
1084     ) <"$_lock"
1085 }
1086
1087 ctdb_reconfigure_release_lock ()
1088 {
1089     _ctdb_service_reconfigure_common
1090     _lock="${_d}/reconfigure_lock"
1091
1092     rm -f "$_lock"
1093 }
1094
1095 ctdb_replay_monitor_status ()
1096 {
1097     echo "Replaying previous status for this script due to reconfigure..."
1098     # Leading colon (':') is missing in some versions...
1099     _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
1100     # Output looks like this:
1101     # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
1102     # This is the cheapest way of getting fields in the middle.
1103     set -- $(IFS=":" ; echo $_out)
1104     _code="$3"
1105     _status="$4"
1106     # The error output field can include colons so we'll try to
1107     # preserve them.  The weak checking at the beginning tries to make
1108     # this work for both broken (no leading ':') and fixed output.
1109     _out="${_out%:}"
1110     _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"
1111     case "$_status" in
1112         OK) : ;;  # Do nothing special.
1113         TIMEDOUT)
1114             # Recast this as an error, since we can't exit with the
1115             # correct negative number.
1116             _code=1
1117             _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
1118             ;;
1119         DISABLED)
1120             # Recast this as an OK, since we can't exit with the
1121             # correct negative number.
1122             _code=0
1123             _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
1124             ;;
1125         *) : ;;  # Must be ERROR, do nothing special.
1126     esac
1127     if [ -n "$_err_out" ] ; then
1128         echo "$_err_out"
1129     fi
1130     exit $_code
1131 }
1132
1133 ctdb_service_check_reconfigure ()
1134 {
1135     assert_service_name
1136
1137     # We only care about some events in this function.  For others we
1138     # return now.
1139     case "$event_name" in
1140         monitor|ipreallocated|reconfigure) : ;;
1141         *) return 0 ;;
1142     esac
1143
1144     if ctdb_reconfigure_take_lock ; then
1145         # No events covered by this function are running, so proceed
1146         # with gay abandon.
1147         case "$event_name" in
1148             reconfigure)
1149                 (ctdb_service_reconfigure)
1150                 exit $?
1151                 ;;
1152             ipreallocated)
1153                 if ctdb_service_needs_reconfigure ; then
1154                     ctdb_service_reconfigure
1155                 fi
1156                 ;;
1157         esac
1158
1159         ctdb_reconfigure_release_lock
1160     else
1161         # Somebody else is running an event we don't want to collide
1162         # with.  We proceed with caution.
1163         case "$event_name" in
1164             reconfigure)
1165                 # Tell whoever called us to retry.
1166                 exit 2
1167                 ;;
1168             ipreallocated)
1169                 # Defer any scheduled reconfigure and just run the
1170                 # rest of the ipreallocated event, as per the
1171                 # eventscript.  There's an assumption here that the
1172                 # event doesn't depend on any scheduled reconfigure.
1173                 # This is true in the current code.
1174                 return 0
1175                 ;;
1176             monitor)
1177                 # There is most likely a reconfigure in progress so
1178                 # the service is possibly unstable.  As above, we
1179                 # defer any scheduled reconfigured.  We also replay
1180                 # the previous monitor status since that's the best
1181                 # information we have.
1182                 ctdb_replay_monitor_status
1183                 ;;
1184         esac
1185     fi
1186 }
1187
1188 ##################################################################
1189 # Does CTDB manage this service? - and associated auto-start/stop
1190
1191 ctdb_compat_managed_service ()
1192 {
1193     if [ "$1" = "yes" -a "$2" = "$service_name" ] ; then
1194         CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
1195     fi
1196 }
1197
1198 is_ctdb_managed_service ()
1199 {
1200     assert_service_name
1201
1202     # $t is used just for readability and to allow better accurate
1203     # matching via leading/trailing spaces
1204     t=" $CTDB_MANAGED_SERVICES "
1205
1206     # Return 0 if "<space>$service_name<space>" appears in $t
1207     if [ "${t#* ${service_name} }" != "${t}" ] ; then
1208         return 0
1209     fi
1210
1211     # If above didn't match then update $CTDB_MANAGED_SERVICES for
1212     # backward compatibility and try again.
1213     ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
1214     ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
1215     ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
1216     ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
1217     ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
1218     ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
1219     ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
1220     ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
1221     ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"
1222
1223     t=" $CTDB_MANAGED_SERVICES "
1224
1225     # Return 0 if "<space>$service_name<space>" appears in $t
1226     [ "${t#* ${service_name} }" != "${t}" ]
1227 }
1228
1229 ctdb_start_stop_service ()
1230 {
1231     assert_service_name
1232
1233     # Allow service-start/service-stop pseudo-events to start/stop
1234     # services when we're not auto-starting/stopping and we're not
1235     # monitoring.
1236     case "$event_name" in
1237         service-start)
1238             if is_ctdb_managed_service ; then
1239                 die 'service-start event not permitted when service is managed'
1240             fi
1241             if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
1242                 die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
1243             fi
1244             ctdb_service_start
1245             exit $?
1246             ;;
1247         service-stop)
1248             if is_ctdb_managed_service ; then
1249                 die 'service-stop event not permitted when service is managed'
1250             fi
1251             if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
1252                 die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
1253             fi
1254             ctdb_service_stop
1255             exit $?
1256             ;;
1257     esac
1258
1259     # Do nothing unless configured to...
1260     [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0
1261
1262     [ "$event_name" = "monitor" ] || return 0
1263
1264     if is_ctdb_managed_service ; then
1265         if ! is_ctdb_previously_managed_service ; then
1266             echo "Starting service \"$service_name\" - now managed"
1267             background_with_logging ctdb_service_start
1268             exit $?
1269         fi
1270     else
1271         if is_ctdb_previously_managed_service ; then
1272             echo "Stopping service \"$service_name\" - no longer managed"
1273             background_with_logging ctdb_service_stop
1274             exit $?
1275         fi
1276     fi
1277 }
1278
1279 ctdb_service_start ()
1280 {
1281     # The service is marked managed if we've ever tried to start it.
1282     ctdb_service_managed
1283
1284     service_start || return $?
1285
1286     ctdb_counter_init
1287     ctdb_check_tcp_init
1288 }
1289
1290 ctdb_service_stop ()
1291 {
1292     ctdb_service_unmanaged
1293     service_stop
1294 }
1295
1296 # Default service_start() and service_stop() functions.
1297
1298 # These may be overridden in an eventscript.
1299 service_start ()
1300 {
1301     service "$service_name" start
1302 }
1303
1304 service_stop ()
1305 {
1306     service "$service_name" stop
1307 }
1308
1309 ##################################################################
1310
1311 ctdb_standard_event_handler ()
1312 {
1313     case "$1" in
1314         status)
1315             ctdb_checkstatus
1316             exit
1317             ;;
1318         setstatus)
1319             shift
1320             ctdb_setstatus "$@"
1321             exit
1322             ;;
1323     esac
1324 }
1325
1326 # iptables doesn't like being re-entered, so flock-wrap it.
1327 iptables()
1328 {
1329         flock -w 30 $CTDB_VARDIR/iptables-ctdb.flock /sbin/iptables "$@"
1330 }
1331
1332 # AIX (and perhaps others?) doesn't have mktemp
1333 if ! which mktemp >/dev/null 2>&1 ; then
1334     mktemp ()
1335     {
1336         _dir=false
1337         if [ "$1" = "-d" ] ; then
1338             _dir=true
1339             shift
1340         fi
1341         _d="${TMPDIR:-/tmp}"
1342         _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
1343             md5sum | \
1344             sed -e 's@\(..........\).*@\1@')
1345         _t="${_d}/tmp.${_hex10}"
1346         (
1347             umask 077
1348             if $_dir ; then
1349                 mkdir "$_t"
1350             else
1351                 >"$_t"
1352             fi
1353         )
1354         echo "$_t"
1355     }
1356 fi
1357
1358 ########################################################
1359 # tickle handling
1360 ########################################################
1361
1362 update_tickles ()
1363 {
1364         _port="$1"
1365
1366         tickledir="$CTDB_VARDIR/state/tickles"
1367         mkdir -p "$tickledir"
1368
1369         # Who am I?
1370         _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}
1371
1372         # What public IPs do I hold?
1373         _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')
1374
1375         # IPs as a regexp choice
1376         _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"
1377
1378         # Record connections to our public IPs in a temporary file
1379         _my_connections="${tickledir}/${_port}.connections"
1380         rm -f "$_my_connections"
1381         netstat -tn |
1382         awk -v destpat="^${_ipschoice}:${_port}\$" \
1383           '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
1384         sort >"$_my_connections"
1385
1386         # Record our current tickles in a temporary file
1387         _my_tickles="${tickledir}/${_port}.tickles"
1388         rm -f "$_my_tickles"
1389         for _i in $_ips ; do
1390                 ctdb -Y gettickles $_i $_port |
1391                 awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
1392         done |
1393         sort >"$_my_tickles"
1394
1395         # Add tickles for connections that we haven't already got tickles for
1396         comm -23 "$_my_connections" "$_my_tickles" |
1397         while read _src _dst ; do
1398                 ctdb addtickle $_src $_dst
1399         done
1400
1401         # Remove tickles for connections that are no longer there
1402         comm -13 "$_my_connections" "$_my_tickles" |
1403         while read _src _dst ; do
1404                 ctdb deltickle $_src $_dst
1405         done
1406
1407         rm -f "$_my_connections" "$_my_tickles"
1408 }
1409
1410 ########################################################
1411 # load a site local config file
1412 ########################################################
1413
1414 [ -n "$CTDB_RC_LOCAL" -a -x "$CTDB_RC_LOCAL" ] && {
1415         . "$CTDB_RC_LOCAL"
1416 }
1417
1418 [ -x $CTDB_BASE/rc.local ] && {
1419         . $CTDB_BASE/rc.local
1420 }
1421
1422 [ -d $CTDB_BASE/rc.local.d ] && {
1423         for i in $CTDB_BASE/rc.local.d/* ; do
1424                 [ -x "$i" ] && . "$i"
1425         done
1426 }
1427
1428 script_name="${0##*/}"       # basename
1429 service_fail_limit=1
1430 event_name="$1"