libpcp_web: JSON API coding consistency, small improvements and fixes
[pcp.git] / src / pmie / pmie_check.sh
blob40b1824287e39675845f71ee46a552da819f7727
1 #! /bin/sh
3 # Copyright (c) 2013-2016 Red Hat.
4 # Copyright (c) 1998-2000,2003 Silicon Graphics, Inc. All Rights Reserved.
5 #
6 # This program is free software; you can redistribute it and/or modify it
7 # under the terms of the GNU General Public License as published by the
8 # Free Software Foundation; either version 2 of the License, or (at your
9 # option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 # for more details.
16 # Administrative script to check pmie processes are alive, and restart
17 # them as required.
# Pull in the PCP environment (the $PCP_* variables used throughout) and the
# shared rc helper library.  Helpers called below but not defined in this
# file (for example _get_pids_by_name and is_chkconfig_on) presumably come
# from rc-proc.sh -- TODO confirm.
20 # Get standard environment
21 . $PCP_DIR/etc/pcp.env
22 . $PCP_SHARE_DIR/lib/rc-proc.sh
# Paths of the pmie and pmieconf executables run by this script.
24 PMIE="$PCP_BIN_DIR/pmie"
25 PMIECONF="$PCP_BIN_DIR/pmieconf"
27 # error messages should go to stderr, not the GUI notifiers
29 unset PCP_STDERR
31 # added to handle problem when /var/log/pcp is a symlink, as first
32 # reported by Micah_Altman@harvard.edu in Nov 2001
# Print a path with any symbolic links in its directory part resolved.
#
# Arguments: $1 - path to a file; only the directory part is resolved, the
#                 final component is appended unchanged
# Globals:   PWDCMND (read) - command that prints the physical working
#                 directory; may include an option (e.g. "/bin/pwd -P"),
#                 falls back to "pwd -P" when unset
# Outputs:   the resolved path on stdout; $1 unchanged when its directory
#            cannot be entered; nothing at all when $1 is empty
_unsymlink_path()
{
    [ -z "$1" ] && return
    __d=`dirname "$1"`
    __b=`basename "$1"`
    # $PWDCMND is deliberately left unquoted so that "cmd -P" splits into a
    # command and its option; everything derived from $1 is quoted so that
    # blanks and glob characters in the path survive.
    __real_d=`cd "$__d" 2>/dev/null && ${PWDCMND:-pwd -P}`
    if [ -z "$__real_d" ]
    then
        printf '%s\n' "$1"
    else
        printf '%s\n' "$__real_d/$__b"
    fi
}
47 # constant setup
# private scratch directory; removed again by _cleanup
49 tmp=`mktemp -d /tmp/pcp.XXXXXXXXX` || exit 1
# exit status handed to "exit" by the trap; set to 1 when something fails
50 status=0
# $tmp/lock holds the name of the lock file currently held (a blank line
# when none is held) so that _cleanup can remove a lock left behind
51 echo >$tmp/lock
52 prog=`basename $0`
# default log file, replaced by the argument of -l
53 PROGLOG=$PCP_LOG_DIR/pmie/$prog.log
# when true, _cleanup also reports a failure through $PCP_SYSLOG_PROG
54 USE_SYSLOG=true
# Exit-time housekeeping, invoked from the script's exit/signal trap.
# Reports a non-zero $status via syslog (unless USE_SYSLOG is false),
# removes $PROGLOG when it is empty, removes the lock file whose name is
# stored in $tmp/lock, and finally removes the scratch directory $tmp.
56 _cleanup()
58 $USE_SYSLOG && [ $status -ne 0 ] && \
59 $PCP_SYSLOG_PROG -p daemon.error "$prog failed - see $PROGLOG"
# do not leave a zero-length log file behind
60 [ -s "$PROGLOG" ] || rm -f "$PROGLOG"
# $tmp/lock contains a blank line when no lock is held, in which case
# $lockfile is empty and the rm below has nothing to remove
61 lockfile=`cat $tmp/lock 2>/dev/null`
62 rm -f "$lockfile"
63 rm -rf $tmp
# Run _cleanup on normal exit (0) and on signals 1, 2, 3 and 15, then exit
# with the value $status has at that moment (the dollar is escaped so the
# variable is expanded when the trap fires, not when it is installed).
65 trap "_cleanup; exit \$status" 0 1 2 3 15
67 # control files for pmie administration ... edit the entries in this
68 # file (and optional directory) to reflect your local configuration;
69 # see also -c option below.
71 CONTROL=$PCP_PMIECONTROL_PATH
72 CONTROLDIR=$PCP_PMIECONTROL_PATH.d
74 # determine path for pwd command to override shell built-in
# The awk filter passes the output of which(1) through until a line
# containing " not in " or " aliased to " is seen; that line and all
# following ones are suppressed.  If nothing usable is left, /bin/pwd is
# used.  When the chosen command accepts -P the option is appended, and
# $here records the directory the script was started from.
75 PWDCMND=`which pwd 2>/dev/null | $PCP_AWK_PROG '
76 BEGIN { i = 0 }
77 / not in / { i = 1 }
78 / aliased to / { i = 1 }
79 { if ( i == 0 ) print }
81 [ -z "$PWDCMND" ] && PWDCMND=/bin/pwd
82 eval $PWDCMND -P >/dev/null 2>&1
83 [ $? -eq 0 ] && PWDCMND="$PWDCMND -P"
84 here=`$PWDCMND`
86 # option parsing
# Defaults.  MV, RM, CP and KILL hold the commands used for side effects so
# that -N (dry run) can swap them for "echo + ..." equivalents; MV, CP and
# KILL are expanded later through eval (RM is set here but not used in the
# code visible in this file).
88 SHOWME=false
89 MV=mv
90 RM=rm
91 CP=cp
92 KILL=pmsignal
93 TERSE=false
94 VERBOSE=false
95 VERY_VERBOSE=false
96 CHECK_RUNLEVEL=false
97 START_PMIE=true
98 STOP_PMIE=false
# $tmp/usage is the option description handed to pmgetopt via --config,
# both for parsing the command line and for printing the usage message.
100 echo > $tmp/usage
101 cat >> $tmp/usage << EOF
102 Options:
103 -c=FILE,--control=FILE configuration of pmie instances to manage
104 -l=FILE,--logfile=FILE send important diagnostic messages to FILE
105 -C query system service runlevel information
106 -N,--showme perform a dry run, showing what would be done
107 -s,--stop stop pmie processes instead of starting them
108 -T,--terse produce a terser form of output
109 -V,--verbose increase diagnostic verbosity
110 --help
113 ARGS=`pmgetopt --progname=$prog --config=$tmp/usage -- "$@"`
114 [ $? != 0 ] && exit 1
# replace the positional parameters with pmgetopt's normalised output
116 eval set -- "$ARGS"
117 while [ $# -gt 0 ]
119 case "$1"
# -c: alternate control file; the matching control directory is FILE.d
121 -c) CONTROL="$2"
122 CONTROLDIR="$2.d"
123 shift
125 -C) CHECK_RUNLEVEL=true
# -l: explicit log file, which also turns off the syslog notification
127 -l) PROGLOG="$2"
128 USE_SYSLOG=false
129 shift
# -N: dry run, side-effect commands are echoed instead of executed
131 -N) SHOWME=true
132 USE_SYSLOG=false
133 MV="echo + mv"
134 RM="echo + rm"
135 CP="echo + cp"
136 KILL="echo + kill"
138 -s) START_PMIE=false
139 STOP_PMIE=true
141 -T) TERSE=true
# -V: the first occurrence sets VERBOSE, a second one sets VERY_VERBOSE
143 -V) if $VERBOSE
144 then
145 VERY_VERBOSE=true
146 else
147 VERBOSE=true
150 --) shift
151 break
# -?: print the usage message and exit with status 1 (via the trap)
153 -\?) pmgetopt --usage --progname=$prog --config=$tmp/usage
154 status=1
155 exit
157 esac
158 shift
159 done
# any argument left over after the options is a usage error
161 if [ $# -ne 0 ]
162 then
163 pmgetopt --usage --progname=$prog --config=$tmp/usage
164 status=1
165 exit
168 # after argument checking, everything must be logged to ensure no mail is
169 # accidentally sent from cron. Close stdout and stderr, then open stdout
170 # as our logfile and redirect stderr there too.
# The log directory is created on demand (failures are silenced) and an
# existing log file is kept as $PROGLOG.prev before being replaced.
172 PROGLOGDIR=`dirname "$PROGLOG"`
173 [ -d "$PROGLOGDIR" ] || mkdir -p -m 775 "$PROGLOGDIR" 2>/dev/null
174 [ -f "$PROGLOG" ] && mv "$PROGLOG" "$PROGLOG.prev"
175 exec 1>"$PROGLOG"
176 exec 2>&1
# Report an error for the control file record currently being processed
# and flag the whole run as failed.
#
# Arguments: $@ - words of the error message (joined with blanks)
# Globals:   prog, controlfile, line, host (read) - context for the report
#            tmp (read) - $tmp/err is created as the "an error happened"
#                 marker that is turned into exit status 1 at the end
# Outputs:   three lines on stdout (which is the log file by this point)
_error()
{
    echo "$prog: [$controlfile:$line]"
    # "$*" rather than "$@": the arguments are meant to form one message
    # string, not to be spliced into the word as separate arguments
    echo "Error: $*"
    echo "... automated performance reasoning for host \"$host\" unchanged"
    touch "$tmp/err"
}
# Report a warning for the control file record currently being processed;
# unlike _error this does not mark the run as failed.
#
# Arguments: $@ - words of the warning message (joined with blanks)
# Globals:   prog, controlfile, line (read) - context for the report
# Outputs:   two lines on stdout (which is the log file by this point)
_warning()
{
    echo "$prog [$controlfile:$line]"
    # "$*" rather than "$@": the arguments form one message string
    echo "Warning: $*"
}
# Print the "Restarting pmie for host ..." progress prefix for $host.
# $PCP_ECHO_N and $PCP_ECHO_C presumably hold the platform's way of
# suppressing the trailing newline (from pcp.env) -- TODO confirm.
192 _restarting()
194 $PCP_ECHO_PROG $PCP_ECHO_N "Restarting pmie for host \"$host\" ...""$PCP_ECHO_C"
# Acquire the per-logfile lock $logfile.lock with pmlock, retrying every
# 0.1 seconds for up to 200 attempts.  On success the lock file name is
# written to $tmp/lock so that _cleanup can remove it if the script dies.
# Reads the global $logfile; uses $tmp/stamp and $tmp/out as scratch files.
197 _lock()
199 # demand mutual exclusion
201 rm -f $tmp/stamp $tmp/out
202 delay=200 # tenths of a second
203 while [ $delay -ne 0 ]
205 if pmlock -v $logfile.lock >$tmp/out
206 then
207 echo $logfile.lock >$tmp/lock
208 break
209 else
# create (once) a reference file time-stamped 30 minutes in the past
210 if [ ! -f $tmp/stamp ]
211 then
212 touch -t `pmdate -30M %Y%m%d%H%M` $tmp/stamp
# a lock file that is not newer than the reference file is treated as
# stale and removed, so a later attempt can succeed
214 if [ -n "`find $logfile.lock ! -newer $tmp/stamp -print 2>/dev/null`" ]
215 then
216 _warning "removing lock file older than 30 minutes"
217 ls -l $logfile.lock
218 rm -f $logfile.lock
221 pmsleep 0.1
222 delay=`expr $delay - 1`
223 done
# delay only reaches 0 when every attempt failed (success breaks out of
# the loop before the decrement)
225 if [ $delay -eq 0 ]
226 then
227 # failed to gain mutex lock
229 if [ -f $logfile.lock ]
230 then
231 _warning "is another PCP cron job running concurrently?"
232 ls -l $logfile.lock
233 else
234 echo "$prog: `cat $tmp/out`"
236 _warning "failed to acquire exclusive lock ($logfile.lock) ..."
# NOTE(review): there is no loop inside this function at this point, so
# this "continue" presumably targets the caller's control-file loop
# (skip this record) -- that relies on shell-specific behaviour; confirm.
237 continue
# Release the lock taken by _lock: remove $logfile.lock and reset
# $tmp/lock to a blank line so _cleanup has no lock file left to remove.
241 _unlock()
243 rm -f $logfile.lock
244 echo >$tmp/lock
# Diagnostic helper used after a failed pmie start.  If $logfile does not
# exist, say so and (unless $TERSE) list the directory it should be in;
# otherwise dump the contents of $logfile.
247 _check_logfile()
249 if [ ! -f $logfile ]
250 then
251 echo "$prog: Error: cannot find pmie output file at \"$logfile\""
252 if $TERSE
253 then
255 else
256 logdir=`dirname $logfile`
257 echo "Directory (`cd $logdir; $PWDCMND`) contents:"
# LC_TIME=POSIX is set for this ls only, presumably to keep the date
# columns in a fixed format
258 LC_TIME=POSIX ls -la $logdir
260 else
261 echo "Contents of pmie output file \"$logfile\" ..."
262 cat $logfile
# Wait for a freshly started pmie to come up.
#
# Arguments: $1 - process id of the pmie that was just launched
# Globals:   logfile, VERBOSE, TERSE, PMCD_CONNECT_TIMEOUT,
#            PMCD_REQUEST_TIMEOUT, PCP_TMP_DIR (read)
# Returns:   0 once $logfile exists and $PCP_TMP_DIR/pmie/$1 can be listed;
#            1 when the process has exited or the wait timed out (in both
#            cases diagnostics are printed and _check_logfile is called)
266 _check_pmie()
268 $VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N " [process $1] ""$PCP_ECHO_C"
270 # wait until pmie process starts, or exits
272 delay=5
273 [ ! -z "$PMCD_CONNECT_TIMEOUT" ] && delay=$PMCD_CONNECT_TIMEOUT
# NOTE(review): no default assignment for x is visible in this text; the
# expr below needs x to be set -- confirm against the pristine source.
275 [ ! -z "$PMCD_REQUEST_TIMEOUT" ] && x=$PMCD_REQUEST_TIMEOUT
277 # wait for maximum time of a connection and 20 requests
279 delay=`expr \( $delay + 20 \* $x \) \* 10` # tenths of a second
280 while [ $delay -ne 0 ]
282 if [ -f $logfile ]
283 then
284 # $logfile was previously removed, if it has appeared again then
285 # we know pmie has started ... if not just sleep and try again
287 if ls "$PCP_TMP_DIR/pmie/$1" >$tmp/out 2>&1
288 then
289 if grep "No such file or directory" $tmp/out >/dev/null
290 then
292 else
293 $VERBOSE && echo " done"
294 return 0
# not up yet: check whether the process is still among the running pmies
298 _plist=`_get_pids_by_name pmie`
299 _found=false
300 for _p in `echo $_plist`
302 [ $_p -eq $1 ] && _found=true
303 done
305 if $_found
306 then
307 # process still here, just hasn't created its status file
308 # yet, try again
310 else
311 $VERBOSE || _restarting
312 echo " process exited!"
313 if $TERSE
314 then
316 else
317 echo "$prog: Error: failed to restart pmie"
318 echo "Current pmie processes:"
# print the ps header line, then the ps lines of the pmie pids found above
319 $PCP_PS_PROG $PCP_PS_ALL_FLAGS | tee $tmp/tmp | sed -n -e 1p
320 for _p in `echo $_plist`
322 sed -n -e "/^[ ]*[^ ]* [ ]*$_p /p" < $tmp/tmp
323 done
324 echo
326 _check_logfile
327 return 1
330 pmsleep 0.1
331 delay=`expr $delay - 1`
# in verbose mode print one progress dot per ten polls (about a second)
332 $VERBOSE && [ `expr $delay % 10` -eq 0 ] && \
333 $PCP_ECHO_PROG $PCP_ECHO_N ".""$PCP_ECHO_C"
334 done
# the loop ran out of attempts without pmie coming up or exiting
335 $VERBOSE || _restarting
336 echo " timed out waiting!"
337 if $TERSE
338 then
340 else
341 sed -e 's/^/ /' $tmp/out
343 _check_logfile
344 return 1
# Extract the pmie configuration file (the argument of -c) from a list of
# pmie command line arguments.
#
# Arguments: $@ - pmie arguments, e.g. -c config.default -v
# Outputs:   the value given to -c (either "-c FILE" or "-cFILE") on
#            stdout; nothing when no -c option is present
_get_configfile()
{
    # printf with "$*" instead of "echo $@": the arguments are neither
    # glob-expanded by the shell nor mistaken for echo options (a leading
    # -n or -e), they are just joined with blanks.
    #
    # sed steps: prefix a blank so a leading -c is matched like any other,
    # squeeze runs of white space, glue "-c FILE" into "-cFILE", then print
    # only what follows " -c" up to the next blank.
    printf '%s\n' "$*" | sed -n \
        -e 's/^/ /' \
        -e 's/[[:space:]][[:space:]]*/ /g' \
        -e 's/-c /-c/' \
        -e 's/.* -c\([^ ]*\).*/\1/p'
}
# Create or refresh a pmieconf-generated pmie configuration file.
#
# Arguments: $1 - path of the configuration file
#            $2 - not referenced in this function (the caller passes the
#                 host name)
#            $3 - "y" when this is the primary pmie, which additionally
#                 enables the pmieconf "primary" setting
# Only files carrying both pmieconf markers are regenerated; any other
# existing file is left untouched.  A missing file is generated with
# pmieconf (or, under -N, the command is only echoed).
358 _configure_pmie()
360 # update a pmie configuration file if it should be created/modified
362 tmpconfig="$1.tmp"
363 configfile="$1"
364 isprimary="$3"
366 if [ -f "$configfile" ]
367 then
368 # look for "magic" string at start of file, and ensure we created it
369 sed 1q "$configfile" | grep '^// pmieconf-pmie [0-9]' >/dev/null
370 magic=$?
371 grep '^// Auto-generated by pmieconf' "$configfile" >/dev/null
372 owned=$?
373 if [ $magic -eq 0 -a $owned -eq 0 ]
374 then
375 # pmieconf file, see if re-generation is needed
# regenerate into a scratch copy so the live file is only replaced when
# the result actually differs
376 cp "$configfile" "$tmpconfig"
377 if $PMIECONF -f "$tmpconfig" -cF >$tmp/diag 2>&1
378 then
379 [ $isprimary = y ] && $PMIECONF -f "$tmpconfig" modify primary enabled yes
# compare old and new while ignoring the "generated by pmieconf" lines
380 grep -v "generated by pmieconf" "$configfile" >$tmp/old
381 grep -v "generated by pmieconf" "$tmpconfig" >$tmp/new
382 if ! diff $tmp/old $tmp/new >/dev/null
383 then
384 if [ -w "$configfile" ]
385 then
386 $VERBOSE && echo "Reconfigured: \"$configfile\" (pmieconf)"
# $CP is "echo + cp" under -N, so a dry run only shows the copy
387 eval $CP "$tmpconfig" "$configfile"
388 else
389 _warning "no write access to pmieconf file \"$configfile\", skip reconfiguration"
390 ls -l "$configfile"
393 else
394 _warning "pmieconf failed to reconfigure \"$configfile\""
# show the diagnostics with the scratch file name replaced by the real one
395 sed -e "s;$tmpconfig;$configfile;g" $tmp/diag
396 echo "=== start pmieconf file ==="
397 cat "$tmpconfig"
398 echo "=== end pmieconf file ==="
401 elif [ ! -e "$configfile" ]
402 then
403 # file does not exist, generate it, if possible
404 if $SHOWME
405 then
406 echo "+ $PMIECONF -f $configfile -cF"
407 elif ! $PMIECONF -f "$configfile" -cF >$tmp/diag 2>&1
408 then
409 _warning "pmieconf failed to generate \"$configfile\""
410 cat $tmp/diag
411 echo "=== start pmieconf file ==="
412 cat "$configfile"
413 echo "=== end pmieconf file ==="
414 else
415 [ $isprimary = y ] && $PMIECONF -f "$configfile" modify primary enabled yes
# hand the new file to the PCP user; failure to chown is ignored
416 chown $PCP_USER:$PCP_GROUP "$configfile" >/dev/null 2>&1
# QUIETLY suppresses the pmpost notice when stopping; it is switched on
# whenever -C (run-level check) was given.
421 QUIETLY=false
422 if [ $CHECK_RUNLEVEL = true ]
423 then
424 # determine whether to start pmie based on runlevel settings - we
425 # need to do this when running unilaterally from cron, else we'll
426 # always start pmie up (even when we shouldn't).
428 QUIETLY=true
429 if is_chkconfig_on pmie
430 then
431 START_PMIE=true
432 else
433 START_PMIE=false
437 if [ $STOP_PMIE = true ]
438 then
439 # if pmie has never been started, there's no work to do to stop it
440 [ ! -d "$PCP_TMP_DIR/pmie" ] && exit
441 $QUIETLY || $PCP_BINADM_DIR/pmpost "stop pmie from $prog"
# neither stopping nor allowed to start: nothing to do
442 elif [ $START_PMIE = false ]
443 then
444 exit
447 if [ ! -f "$CONTROL" ]
448 then
449 echo "$prog: Error: cannot find control file ($CONTROL)"
450 status=1
451 exit
454 # note on control file format version
455 # 1.0 was the first release, and did not include the primary field
456 # [this is the default for backwards compatibility]
457 # 1.1 adds the primary field (ala pmlogger control file) indicating
458 # localhost-specific rules should be enabled
460 version=''
# start with no error marker and an empty list of signalled pmies
462 rm -f $tmp/err $tmp/pmies
# Process one control file: for every record, find the pmie writing to the
# record's log file and, depending on START_PMIE / STOP_PMIE, start a
# missing pmie or send a running one SIGTERM.
#
# Arguments: $1 - path of the control file
# Globals:   reads the option flags set during option parsing; sets
#            controlfile, line, host, primary, socks, logfile, args and
#            friends, which the helper functions rely on
# Side effects: $tmp/err is created on errors (via _error), pids sent
#            SIGTERM are appended to $tmp/pmies, in-line assignments are
#            appended to $tmp/cmd
464 _parse_control()
466 controlfile="$1"
467 line=0
# package-manager leftovers (*.rpmsave, *.rpmnew) are never processed
469 if echo "$controlfile" | grep -q -e '\.rpmsave' -e '\.rpmnew'
470 then
471 _warning "ignored backup control file \"$controlfile\""
472 return
# The literal text PCP_LOG_DIR in the control file is replaced by its
# value.  The record loop reads from a pipe, so (in shells that run pipeline
# stages in subshells) variables set inside it do not survive the loop --
# presumably why failures are signalled through the file $tmp/err.
475 sed -e "s;PCP_LOG_DIR;$PCP_LOG_DIR;g" $controlfile | \
476 while read host primary socks logfile args
478 # start in one place for each iteration (beware relative paths)
479 cd "$here"
480 line=`expr $line + 1`
482 case "$host"
484 \#*|'') # comment or empty
485 continue
487 \$*) # in-line variable assignment
# the sed script below reduces the line to a single NAME=value assignment
# (anything after the closing quote, or after a shell metacharacter for an
# unquoted value, is dropped) and rewrites it as "export NAME; NAME=value";
# lines that do not look like an assignment yield an empty $cmd
488 $SHOWME && echo "# $host $primary $socks $logfile $args"
489 cmd=`echo "$host $primary $socks $logfile $args" \
490 | sed -n \
491 -e "/='/s/\(='[^']*'\).*/\1/" \
492 -e '/="/s/\(="[^"]*"\).*/\1/' \
493 -e '/=[^"'"'"']/s/[;&<>|].*$//' \
494 -e '/^\\$[A-Za-z][A-Za-z0-9_]*=/{
495 s/^\\$//
496 /^\([A-Za-z][A-Za-z0-9_]*\)=/s//export \1; \1=/p
498 if [ -z "$cmd" ]
499 then
500 # in-line command, not a variable assignment
501 _warning "in-line command is not a variable assignment, line ignored"
502 else
503 case "$cmd"
# PATH and IFS may not be changed from a control file
505 'export PATH;'*)
506 _warning "cannot change \$PATH, line ignored"
508 'export IFS;'*)
509 _warning "cannot change \$IFS, line ignored"
# accepted assignments are evaluated now and also appended to $tmp/cmd,
# which is sourced again before every record is processed
512 $SHOWME && echo "+ $cmd"
513 echo eval $cmd >>$tmp/cmd
514 eval $cmd
516 esac
518 continue
520 esac
522 # set the version and other variables
524 if [ -f $tmp/cmd ]
525 then
526 . $tmp/cmd
527 if grep 'version=' $tmp/cmd >/dev/null
528 then
529 case "$version"
531 1.0|1.1)
534 _error "bad version ($version) in control file"
535 return
537 esac
541 if [ -z "$version" ]
542 then
543 _warning "missing \$version, assuming version 1.0 control format"
544 version=1.0
546 if [ "$version" = "1.0" ]
547 then
548 # handle backwards compatibility
549 # one less field and primary defaults to "n" for version 1.0
551 if [ -n "$logfile" ]
552 then
553 args="$logfile $args"
554 else
555 # missing "args" ... this is bad, but will be reported below
556 # ... guard avoids setting "args" to " " which would defeat
557 # the test below
559 args=""
# shift the fields one position to the right to make room for primary
561 logfile="$socks"
562 socks="$primary"
563 primary=n
566 if [ -z "$primary" -o -z "$socks" -o -z "$logfile" -o -z "$args" ]
567 then
568 _error "insufficient fields in control file record"
569 continue
571 if [ "$primary" != y -a "$primary" != n ]
572 then
573 _error "primary field in control file record must be y or n, not \"$primary\""
574 continue
576 if [ "$socks" != y -a "$socks" != n ]
577 then
578 _error "socks field in control file record must be y or n, not \"$socks\""
579 continue
582 # args should begin with a hyphen
584 case "$args"
589 _error "args field in control file must begin with a hyphen not \"$args\""
590 continue
592 esac
594 # substitute LOCALHOSTNAME marker in this config line
595 # (differently for logfile and pcp -h HOST arguments)
597 logfilehost=`hostname || echo localhost`
598 logfile=`echo $logfile | sed -e "s;LOCALHOSTNAME;$logfilehost;"`
599 logfile=`_unsymlink_path $logfile`
# the primary pmie and LOCALHOSTNAME records use the "local:" host spec
600 [ $primary = y -o "x$host" = xLOCALHOSTNAME ] && host=local:
602 dir=`dirname $logfile`
603 $VERY_VERBOSE && echo "Check pmie -h $host -l $logfile ..."
605 # make sure output directory exists
607 if [ ! -d "$dir" ]
608 then
# NOTE(review): mkdir's output is captured in $tmp/err, the same file whose
# mere existence is turned into exit status 1 at the end of the script; the
# redirection creates it even when mkdir succeeds -- confirm this is intended.
609 mkdir -p -m 755 "$dir" >$tmp/err 2>&1
610 if [ ! -d "$dir" ]
611 then
612 cat $tmp/err
613 _error "cannot create directory ($dir) for pmie log file"
614 continue
618 # and the user pcp can write there
620 chown $PCP_USER:$PCP_GROUP "$dir" >/dev/null 2>&1
622 # and the logfile is writeable, if it exists
624 [ -f "$logfile" ] && chown $PCP_USER:$PCP_GROUP "$logfile" >/dev/null 2>&1
626 if cd "$dir"
627 then
629 else
630 _error "cannot chdir to directory ($dir) for pmie log file"
631 continue
# from here on $dir is the directory as reported by $PWDCMND
633 dir=`$PWDCMND`
634 $SHOWME && echo "+ cd $dir"
636 if [ ! -w "$dir" ]
637 then
638 _warning "no write access in $dir, skip lock file processing"
639 ls -ld "$dir"
640 else
641 _lock
644 # match $logfile from control file to running pmies
645 pid=""
646 for pidfile in $PCP_TMP_DIR/pmie/[0-9]*
# an unmatched glob stays literal; skip it
648 [ "$pidfile" = "$PCP_TMP_DIR/pmie/[0-9]*" ] && continue
649 $VERY_VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N "... try $pidfile: ""$PCP_ECHO_C"
# the status file is named after the pid of the pmie that owns it
651 p_id=`echo $pidfile | sed -e 's,.*/,,'`
652 p_logfile=""
653 p_pmcd_host=""
# lines 2 and 3 of the pmiestatus output are taken as the log file and the
# pmcd host of that pmie
655 # throw away stderr in case $pidfile has been removed by now
656 eval `$PCP_BINADM_DIR/pmiestatus $pidfile 2>/dev/null | $PCP_AWK_PROG '
657 NR == 2 { printf "p_logfile=\"%s\"\n", $0; next }
658 NR == 3 { printf "p_pmcd_host=\"%s\"\n", $0; next }
659 { next }'`
661 p_logfile=`_unsymlink_path $p_logfile`
662 if [ "$p_logfile" != $logfile ]
663 then
664 $VERY_VERBOSE && echo "different logfile, skip"
665 $VERY_VERBOSE && echo " $p_logfile differs to $logfile"
# same log file: accept the pid only if such a pmie process is running
666 elif _get_pids_by_name pmie | grep "^$p_id\$" >/dev/null
667 then
668 $VERY_VERBOSE && echo "pmie process $p_id identified, OK"
669 pid=$p_id
670 break
671 else
672 $VERY_VERBOSE && echo "pmie process $p_id not running, skip"
673 $VERY_VERBOSE && _get_pids_by_name pmie
675 done
677 if $VERY_VERBOSE
678 then
679 if [ -z "$pid" ]
680 then
681 echo "No current pmie process exists for:"
682 else
683 echo "Found pmie process $pid monitoring:"
685 echo " host = $host"
686 echo " log file = $logfile"
# no pmie for this record and starting is wanted: (re)start one
689 if [ -z "$pid" -a $START_PMIE = true ]
690 then
691 configfile=`_get_configfile $args`
692 if [ ! -z "$configfile" ]
693 then
694 # if this is a relative path and not relative to cwd,
695 # substitute in the default pmie search location.
697 if [ ! -f "$configfile" -a "`basename $configfile`" = "$configfile" ]
698 then
699 configfile="$PCP_VAR_DIR/config/pmie/$configfile"
702 # check configuration file exists and is up to date
703 _configure_pmie "$configfile" "$host" "$primary"
706 args="-h $host -l $logfile $args"
708 $VERBOSE && _restarting
710 sock_me=''
711 if [ "$socks" = y ]
712 then
713 # only check for pmsocks if it's specified in the control file
714 have_pmsocks=false
715 if which pmsocks >/dev/null 2>&1
716 then
717 # check if pmsocks has been set up correctly
718 if pmsocks ls >/dev/null 2>&1
719 then
720 have_pmsocks=true
724 if $have_pmsocks
725 then
726 sock_me="pmsocks "
727 else
728 echo "$prog: Warning: no pmsocks available, would run without"
729 sock_me=""
# keep the previous pmie log as $logfile.prior ($MV only echoes under -N)
733 [ -f "$logfile" ] && eval $MV -f "$logfile" "$logfile.prior"
735 if $SHOWME
736 then
737 $VERBOSE && echo
738 echo "+ ${sock_me}$PMIE -b $args"
739 _unlock
740 continue
741 else
742 # since this is launched as a sort of daemon, any output should
743 # go on pmie's stderr, i.e. $logfile ... use -b for this
745 $VERY_VERBOSE && ( echo; $PCP_ECHO_PROG $PCP_ECHO_N "+ ${sock_me}$PMIE -b $args""$PCP_ECHO_C"; echo "..." )
746 $PCP_BINADM_DIR/pmpost "start pmie from $prog for host $host"
747 ${sock_me}$PMIE -b $args &
748 pid=$!
751 # wait for pmie to get started, and check on its health
752 _check_pmie $pid
754 elif [ ! -z "$pid" -a $STOP_PMIE = true ]
755 then
756 # Send pmie a SIGTERM, which is noted as a pending shutdown.
757 # Add pid to list of pmies sent SIGTERM - may need SIGKILL later.
759 $VERY_VERBOSE && echo "+ $KILL -s TERM $pid"
760 eval $KILL -s TERM $pid
761 $PCP_ECHO_PROG $PCP_ECHO_N "$pid ""$PCP_ECHO_C" >> $tmp/pmies
764 _unlock
765 done
# Process the main control file first, then every file found in the
# control directory, in POSIX collation order of their names.
768 _parse_control $CONTROL
769 append=`ls $CONTROLDIR 2>/dev/null | LC_COLLATE=POSIX sort`
770 for controlfile in $append
772 _parse_control $CONTROLDIR/$controlfile
773 done
775 # check all the SIGTERM'd pmies really died - if not, use a bigger hammer.
777 if $SHOWME
778 then
# only relevant when stopping and at least one pid was recorded
780 elif [ $STOP_PMIE = true -a -s $tmp/pmies ]
781 then
782 pmielist=`cat $tmp/pmies`
# ps succeeding here is taken to mean some of the listed pids still exist
783 if $PCP_PS_PROG -p "$pmielist" >/dev/null 2>&1
784 then
785 $VERY_VERBOSE && ( echo; $PCP_ECHO_PROG $PCP_ECHO_N "+ $KILL -KILL `cat $tmp/pmies` ...""$PCP_ECHO_C" )
786 eval $KILL -s KILL $pmielist >/dev/null 2>&1
# poll every 0.1 seconds, for up to 30 polls, until ps no longer finds them
787 delay=30 # tenths of a second
788 while $PCP_PS_PROG -f -p "$pmielist" >$tmp/alive 2>&1
790 if [ $delay -gt 0 ]
791 then
792 pmsleep 0.1
793 delay=`expr $delay - 1`
794 continue
796 echo "$prog: Error: pmie process(es) will not die"
797 cat $tmp/alive
798 status=1
799 break
800 done
# $tmp/err marks that an error was reported while processing control files;
# the exit trap then exits with $status
804 [ -f $tmp/err ] && status=1
805 exit