pmrep: update TODO, man page
[pcp.git] / qa / check
blobd1f5bb776cdb90b566bce522d4d1173fb6be1193
#! /bin/sh
#
# Control script for running PCP QA tests
#
# Copyright (c) 1997-2002 Silicon Graphics, Inc. All Rights Reserved.
#

# Run-wide state shared by the functions below and by the exit trap.
mypid=$$                # our pid; used as the lock owner id
status=0                # exit status reported by the exit trap
needwrap=true           # true until the end-of-run summary has been written
try=0                   # number of tests actually attempted
n_bad=0                 # number of failed tests
bad=""                  # space separated list of failed tests
notrun=""               # space separated list of tests not run
interrupt=true          # cleared only when the run completes normally
myname=`basename "$0"`
iam=$myname             # a synonym

# status and log files
CHECKLOCK=/tmp/check-LOCK
CHECKSTS=/tmp/check.sts             # If you change these, hangcheck.pcpqa
CHECKPID=/tmp/check.pid             # will need to change, too.
CHECKSLOG=/var/tmp/check-start.log  # A check.log already exists for
                                    # another reason.
# Print the current time of day as a number of seconds since midnight.
# Uses $PCP_AWK_PROG to do the arithmetic on date(1) output.
_wallclock()
{
    date "+%H %M %S" | $PCP_AWK_PROG '{ print $1*3600 + $2*60 + $3 }'
}
# Print " [mm/dd/yy-HH:MM:SS]" without a trailing newline, using the
# $PCP_ECHO_PROG / $PCP_ECHO_N / $PCP_ECHO_C no-newline echo convention.
_timestamp()
{
    now=`date "+%D-%T"`
    $PCP_ECHO_PROG $PCP_ECHO_N " [$now]""$PCP_ECHO_C"
}
# Remove $CHECKLOCK, but only when it records our own pid ($mypid).
# A lock owned by somebody else, or an unreadable lock, is left alone.
# Always returns 0.
_release_lock()
{
    if [ -f "$CHECKLOCK" ]
    then
        LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null` || return 0
        [ "$LOCKOWNER" = "$mypid" ] && rm -f "$CHECKLOCK"
    fi

    return 0
}
# End-of-run cleanup, run from the exit/signal trap.
#
# - removes the hangcheck status/pid files if they belong to this process
# - releases the check lock and removes the checksum backup directory
# - unless $showme is set or the summary was already produced
#   ($needwrap false): merges $tmp.time into check.time, appends a
#   summary to check.log and prints the not-run/failed/passed summary
# - removes the $tmp.* scratch files
#
# Reads: HANGCHECK USER CHECKPID CHECKSTS mypid tmp showme needwrap list
#        interrupt seq notrun n_bad bad try color PCP_AWK_PROG
# Writes: needwrap (set to false once the summary has been produced)
_wrapup()
{
    # for hangcheck ...
    # remove files that were used by hangcheck
    #
    if [ "$HANGCHECK" = true -a "$USER" = pcpqa ]
    then
        # the pid file may legitimately be missing; stay quiet if so
        checkpid=`cat "$CHECKPID" 2>/dev/null`
        [ "$checkpid" = "$mypid" -a -f "$CHECKSTS" ] && rm -f "$CHECKSTS"
        [ "$checkpid" = "$mypid" -a -f "$CHECKPID" ] && rm -f "$CHECKPID"
    fi

    if [ -z "$tmp" ]
    then
        # did not get very far into the initialization!
        :
    else
        # release the lock and remove backup files
        _release_lock
        [ -d "$tmp" ] && ( rm -rf "$tmp/checksums" ; rmdir "$tmp" )
    fi

    if $showme
    then
        :
    elif $needwrap
    then
        if [ -f check.time -a -f $tmp.time ]
        then
            # later entries win, so new timings replace old ones per test
            cat check.time $tmp.time \
            | $PCP_AWK_PROG '
        { t[$1] = $2 }
END     { if (NR > 0) {
            for (i in t) print i " " t[i]
          }
        }' \
            | sort -n >$tmp.out
            mv $tmp.out check.time
        fi

        echo "" >>check.log
        date >>check.log
        echo $list | fmt | sed -e 's/^/ /' >>check.log
        $interrupt && echo "Interrupted! [running $seq]" >>check.log

        if [ ! -z "$notrun" ]
        then
            [ "$color" = true ] && tput bold && tput setaf 4    # blue
            echo "Not run:$notrun"
            [ "$color" = true ] && tput sgr0                    # reset
            echo "Not run:$notrun" | fmt >>check.log
        fi
        if [ ! -z "$n_bad" -a "$n_bad" != 0 ]
        then
            [ "$color" = true ] && tput bold && tput setaf 1    # red
            echo "Failures:$bad"
            echo "Failed $n_bad of $try tests"
            [ "$color" = true ] && tput sgr0                    # reset
            echo "Failures:$bad" | fmt >>check.log
            echo "Failed $n_bad of $try tests" >>check.log
        else
            if [ "$try" != 0 ]
            then
                [ "$color" = true ] && tput bold && tput setaf 2    # green
                echo "Passed all $try tests"
                [ "$color" = true ] && tput sgr0                    # reset
                echo "Passed all $try tests" >>check.log
            fi
        fi
        needwrap=false
    fi

    # With $tmp empty the glob would be ".*" and would delete every
    # dotfile in the current directory, so only clean up a real prefix.
    if [ -n "$tmp" ]
    then
        rm -f $tmp.*
    fi
}
# Usage: _addfiles listfile name ...
#
# Append each name to listfile (created if missing) unless listfile
# already contains that exact line.  Returns 1 if no listfile is given,
# else 0.
_addfiles ()
{
    af=$1
    [ "$af" = "" ] && return 1
    [ ! -f "$af" ] && touch "$af"
    shift

    for fn in "$@"
    do
        # -x: whole-line match, so "foo" is still added when "foobar"
        # is already listed; -e protects names starting with "-"
        grep -F -x -e "$fn" "$af" >/dev/null 2>&1
        # status 1 is "no match"; on a grep error (2) we do not append
        [ $? = 1 ] && echo "$fn" >>"$af"
    done

    return 0
}
# Check that a check process of that process ID found in
# $CHECKLOCK exists, and if not, release the lock.
#
# Always returns 0.  Uses $sudo to remove a stale lock.
_check_lock() {
    [ ! -f "$CHECKLOCK" ] && return 0
    PID=`cat "$CHECKLOCK" 2>/dev/null` || return 0

    case "$PID"
    in
        ''|*[!0-9]*)
            # empty or garbage lock file: nobody can own it
            $sudo rm -f "$CHECKLOCK"
            return 0
            ;;
    esac

    # Compare the pid column exactly.  A substring match would also pick
    # up longer pids, or digits in the arguments of unrelated processes,
    # and the resulting multi-line answer made a live lock look stale.
    CCNT=`ps -e -o "pid args" | grep -v grep | grep check | \
        $PCP_AWK_PROG '$1 == "'"$PID"'" { print $1; exit }'`
    if [ "$PID" != "$CCNT" ]
    then
        # We can remove the lock; no check process found with that ID
        $sudo rm -f "$CHECKLOCK"
    fi

    return 0
}
# Acquire the check lock ($CHECKLOCK, containing the owner's pid).
#
# Returns 0 once this process ($mypid) owns the lock, 1 if it could not
# be obtained within the waiting schedule below.
_get_lock()
{
    # Does someone else have a lock on check at this time? If so, we
    # can't run a test until the lock is removed.
    #
    # NOTE: the use of check-LOCK rather than check.pid was done so that
    # people running check manually (rather than run.pcpqa running check)
    # can have tests running between themselves. This is better than
    # having people waiting on one long series of tests passed to check
    # and having spent 10 minutes waiting for nothing.
    #
    # Check that an instance of check who claims to have the lock actually
    # exists!
    _check_lock

    # Get (make) a lock
    echomessage=true
    havelock=false
    for sleeptime in \
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
        2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
        2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
        2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
        5 5 5 5 5 5 5 5 5 5 5 5 \
        5 5 5 5 5 5 5 5 5 5 5 5 \
        5 5 5 5 5 5 5 5 5 5 5 5 \
        5 5 5 5 5 5 5 5 5 5 5 5 \
        5 5 5 5 5 5 5 5 5 5 5 5 0   # 10 minutes waiting time per test...
    do
        if [ -f "$CHECKLOCK" ]
        then
            LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null` || continue

            if [ "$LOCKOWNER" != "$mypid" ]
            then
                # wait until lock disappears...
                if [ "$sleeptime" = 0 ]
                then
                    # We should leave... something's terribly wrong.
                    echo ""
                    return 1
                else
                    $echomessage && \
                    $PCP_ECHO_PROG $PCP_ECHO_N " waiting for lock [owner pid=$LOCKOWNER]... ""$PCP_ECHO_C" && \
                    echomessage=false
                    sleep $sleeptime
                fi
            else
                # already have lock
                havelock=true
                break
            fi
        else
            # make lock
            #
            # Write our pid to a private file and hard-link it into
            # place: ln fails if $CHECKLOCK already exists, so two
            # racing instances cannot both think they made the lock,
            # and the lock never exists without its owner pid in it.
            echo "$mypid" >"$CHECKLOCK.$mypid"
            if ln "$CHECKLOCK.$mypid" "$CHECKLOCK" 2>/dev/null
            then
                rm -f "$CHECKLOCK.$mypid"
                chmod a+r "$CHECKLOCK"
                havelock=true
                break
            fi
            # lost the race (or cannot create it); try again
            rm -f "$CHECKLOCK.$mypid"
        fi
    done

    if $havelock
    then
        :
    else
        # ran out of attempts without ever owning the lock
        echo ""
        return 1
    fi
    $echomessage || echo "got it; proceeding: $seq"

    return 0
}
# Create $tmp.checkfiles (once): the list, one path per line, of the
# configuration and control files whose contents _checksums guards.
# Also leaves the list in $_checkfiles and defaults the
# PCP_PMCDOPTIONS_PATH, PCP_PMLOGGERCONTROL_PATH and PCP_PMIECONTROL_PATH
# variables when they are not already set.
_make_checkfiles()
{
    if [ ! -f $tmp.checkfiles ]
    then
        [ -z "$PCP_PMCDOPTIONS_PATH" ] && \
            PCP_PMCDOPTIONS_PATH="$PCP_SYSCONF_DIR/pmcd/pmcd.options"
        [ -z "$PCP_PMLOGGERCONTROL_PATH" ] && \
            PCP_PMLOGGERCONTROL_PATH="$PCP_SYSCONF_DIR/pmlogger/control"
        [ -z "$PCP_PMIECONTROL_PATH" ] && \
            PCP_PMIECONTROL_PATH="$PCP_SYSCONF_DIR/pmie/control"
        # The pmlogger control file used to be listed twice while the
        # pmie control file (defaulted just above) was never listed;
        # the second entry is taken to have been meant for pmie.
        _checkfiles="$PCP_PMCDCONF_PATH \
            $PCP_PMLOGGERCONTROL_PATH \
            $PCP_VAR_DIR/config/pmlogger/config.default \
            $PCP_PMIECONTROL_PATH \
            $PCP_PMCDOPTIONS_PATH \
            $PCP_DIR/etc/init.d/pcp \
            $PCP_DIR/etc/pcp.conf $PCP_DIR/etc/pcp.env \
            $PCP_PMDAS_DIR/sample/dsohelp.dir \
            $PCP_PMDAS_DIR/sample/dsohelp.pag \
            $PCP_PMDAS_DIR/sample/help.dir \
            $PCP_PMDAS_DIR/sample/help.pag \
            $PCP_PMDAS_DIR/simple/simple.conf"
        # _checksums reads the list back from $tmp.checkfiles, so it has
        # to be written out; word splitting of $_checkfiles is intended.
        for _cf in $_checkfiles
        do
            echo "$_cf"
        done >$tmp.checkfiles
    fi
}
# Usage: _checksums get
#        _checksums check sumfile
#
# get:   print the sum(1) output for every existing file listed in
#        $tmp.checkfiles and keep a backup copy of each under
#        $tmp/checksums (path with "/" replaced by "+").
# check: compare the files against sumfile (the saved "get" output);
#        report files that are missing or changed on stdout and restore
#        them from the backup copy.  A changed file is first saved as
#        <file>.$seq.O.
#
# Returns 0, or 1 for an unknown sub-command.  Uses $sudo for the copies.
_checksums()
{
    cmd="$1"

    _make_checkfiles

    case "$cmd"
    in
        get)
            mkdir -p $tmp/checksums
            chmod a+w $tmp/checksums
            for f in `cat $tmp.checkfiles`
            do
                buf=`echo $f | sed -e 's;/;+;g'`
                buf=$tmp/checksums/$buf
                [ -f $f ] && sum $f
                [ -f $f -a ! -f $buf ] && $sudo cp $f $buf
            done
            ;;

        check)
            for f in `cat $tmp.checkfiles`
            do
                buf=`echo $f | sed -e 's;/;+;g'`
                buf=$tmp/checksums/$buf
                if [ ! -f $f ]
                then
                    # only a file that was recorded by "get" is missing
                    if grep -F -e "$f" $2 >/dev/null 2>&1
                    then
                        echo " Missing: \"$f\""
                        [ -f $buf ] && $sudo cp -f $buf $f
                    fi
                else
                    _cs=`sum $f`
                    if grep -F -e "$_cs" $2 >/dev/null 2>&1
                    then
                        # unchanged; drop any stale saved copy
                        $sudo rm -f $f.$seq.O
                    else
                        echo " Changed: \"$f\""
                        $sudo cp -f $f $f.$seq.O
                        [ -f $buf ] && $sudo cp -f $buf $f
                    fi
                fi
            done
            ;;

        *)
            # programming error in the caller; say so explicitly rather
            # than relying on a "command not found" message
            echo "_checksums: unknown command \"$cmd\"" >&2
            return 1
            ;;
    esac
    return 0
}
# Always run _wrapup on exit and on HUP/INT/QUIT/TERM, then exit with
# whatever $status holds at that point.
trap "_wrapup; exit \$status" 0 1 2 3 15

# by default don't output timestamps
timestamp=false

# extra stuff for tracing QA runs - off/on via $qatrace
qatrace=false
qadepot=mazur.melbourne
qasrc=`hostname`
# constants - meaningful as state transitions in qavis
qanotyet=1      # test not yet started
qarunning=2     # test still going
qafailed=3      # test failed
qapassed=4      # test passed

PCP_TRACE_TIMEOUT=15
export PCP_TRACE_TIMEOUT

# generic initialization... this may take a while to run, because (unless
# $quick is true) make is run.
. ./common
# we have to cheat a bit... but we need to create a check.[pid|sts] file
# to tell hangcheck that we are alive, but not ready to run yet.
if [ "$HANGCHECK" = true -a "$USER" = pcpqa ]
then
    # for hangcheck ...
    # Save pid of check in a well known place, so that hangcheck can be sure it
    # has the right pid (getting the pid from ps output is not reliable enough).
    #
    if [ -f "$CHECKPID" ]
    then
        # run ls directly: /bin/sh "ls -l file" looks for a script
        # called "ls -l file", so the owner always came back empty
        checkpidowner=`ls -l "$CHECKPID" | $PCP_AWK_PROG '{ print $3 }'`
        if [ "$checkpidowner" != pcpqa ]
        then
            $sudo rm -f "$CHECKPID"
        else
            # There should be a BIG FAT WARNING here if QA is trying to
            # run tests twice!
            echo "$myname: a check.pid file already exists... are you already running tests?!" >&2
            # the exit trap exits with $status, so record the failure there
            status=1
            exit 1
        fi
    fi
    [ ! -f "$CHECKPID" ] && echo "$mypid" >"$CHECKPID"

    # for hangcheck ...
    # Save the status of check in a well known place, so that hangcheck can be
    # sure to know where check is up to (getting test number from ps output is
    # not reliable enough since the trace stuff has been introduced).
    #
    if [ -f "$CHECKSTS" ]
    then
        checkpidowner=`ls -l "$CHECKSTS" | $PCP_AWK_PROG '{ print $3 }'`
        if [ "$checkpidowner" != pcpqa ]
        then
            $sudo rm -f "$CHECKSTS"
        else
            echo "$myname: a check.sts file already exists... are you already running tests?!" >&2
            status=1
            exit 1
        fi
    fi
    [ ! -f "$CHECKSTS" ] && echo "preamble" >"$CHECKSTS"
fi
# Timing history from earlier runs; the main loop reads it unconditionally.
[ -f check.time ] || touch check.time

# The tests need pmcd enabled (helpers come from ./common).
[ "`_get_config pmcd`" != on ] && _change_config pmcd on

# No tracing when we are only listing the tests.
if $showme
then
    qatrace=false
fi

if $qatrace
then
    # if tracing turned on, make sure trace agent running ok
    switchon=`pmprobe -h $qadepot trace.control.reset 2>&1 | $PCP_AWK_PROG '{ print $2 }'`
    [ "$switchon" != "1" ] && qatrace=false
fi

if $qatrace
then
    # mark every selected test as "not yet started" on the trace depot
    for seq in $list
    do
        $verbose && printf "Preparing pmtrace tags: %-.16s:%s\r" \
            "$qasrc" "$seq"
        pmtrace -qh $qadepot -v $qanotyet "$qasrc:$seq" 2>/dev/null
    done
    $verbose && printf "%68s\r" " "
fi

# number of tests selected, and how many have been started so far;
# used for the percentage shown by the main loop
torun=`echo $list | wc -w | sed -e 's/ //g'`
haverun=0
# Main loop: run every test in $list, one at a time, under the check lock.
# For each test the output is captured in $tmp.out and compared with
# $seq.out; failures leave $seq.out.bad behind and are collected in
# $bad / $n_bad, tests not run in $notrun, attempts counted in $try.
for seq in $list
do
    err=false
    if $showme
    then
        echo $seq
        continue
    fi
    if [ $torun -gt 9 ]
    then
        pct=`expr 100 \* $haverun / $torun`
        haverun=`expr $haverun + 1`
        $PCP_ECHO_PROG $PCP_ECHO_N "[$pct%] ""$PCP_ECHO_C"
    fi
    $PCP_ECHO_PROG $PCP_ECHO_N "$seq""$PCP_ECHO_C"
    if [ ! -f $seq ]
    then
        echo " [not run, missing]"
        notrun="$notrun $seq"
        continue
    else
        # really going to try and run this one
        #
        rm -f $seq.out.bad
        lasttime=`sed -n -e "/^$seq /s/.* //p" <check.time`
        [ "X$lasttime" != X ] && $PCP_ECHO_PROG $PCP_ECHO_N " ${lasttime}s ...""$PCP_ECHO_C"
        rm -f core $seq.notrun

        # acquire lock
        _get_lock
        if [ $? != 0 ]
        then
            # running without the lock would trample on whoever holds
            # it, so really do exit (the exit trap reports $status)
            echo "$myname: could not acquire lock; exiting" >&2
            status=1
            exit 1
        fi

        if $check_config
        then
            # save checksums for critical conf and control files
            [ ! -f $tmp.checksums ] && _checksums get >$tmp.checksums
        fi

        start=`_wallclock`
        $timestamp && _timestamp

        # for hangcheck ...
        [ "$HANGCHECK" = true -a "$USER" = pcpqa ] && echo "$seq" >"$CHECKSTS"

        if $qatrace
        then
            pmtrace -qh $qadepot -v $qarunning "$qasrc:$seq" 2>/dev/null
            pmtrace -qh $qadepot -e "./$seq" "$qasrc:$seq" >$tmp.out.1 2>&1
            sts=$?
            # check for trace errors on first line of test & blow them away
            $PCP_AWK_PROG '/pmtrace: / {if (NR != 1) print $0; next} {print $0}' $tmp.out.1 > $tmp.out
        else
            ./$seq >$tmp.out 2>&1
            sts=$?
        fi
        $timestamp && _timestamp
        stop=`_wallclock`

        # for hangcheck ...
        [ "$HANGCHECK" = true -a "$USER" = pcpqa ] && echo "working" >"$CHECKSTS"

        if $check_config
        then
            # check the saved checksums
            _checksums check $tmp.checksums >$tmp.check
            if [ -s $tmp.check ]
            then
                echo "$myname: $seq: ERROR: test failed to restore the following config files:" >>$tmp.out
                cat $tmp.check >>$tmp.out
                $PCP_ECHO_PROG $PCP_ECHO_N " [config not restored]""$PCP_ECHO_C"
            fi
        fi

        # remove the lock
        _release_lock

        if [ -f core ]
        then
            $PCP_ECHO_PROG $PCP_ECHO_N " [dumped core]""$PCP_ECHO_C"
            mv core $seq.core
            err=true
        fi

        if [ -f $seq.notrun ]
        then
            [ "$color" = true ] && tput bold && tput setaf 4    # blue
            echo " [not run] `cat $seq.notrun`"
            [ "$color" = true ] && tput sgr0                    # reset
            notrun="$notrun $seq"
        else
            if [ $sts -ne 0 ]
            then
                $PCP_ECHO_PROG $PCP_ECHO_N " [failed, exit status $sts]""$PCP_ECHO_C"
                err=true
            fi
            if [ ! -f $seq.out ]
            then
                $PCP_ECHO_PROG $PCP_ECHO_N " - no qualified output""$PCP_ECHO_C"
                mv $tmp.out $seq.out.bad
                err=true
            else
                if diff $seq.out $tmp.out >/dev/null 2>&1
                then
                    if $err
                    then
                        :
                    else
                        # _wallclock is seconds since midnight, so a
                        # test that ran across midnight needs a day
                        # added to get a sane elapsed time
                        elapsed=`expr $stop - $start`
                        [ "$elapsed" -lt 0 ] && elapsed=`expr $elapsed + 86400`
                        echo "$seq $elapsed" >>$tmp.time
                    fi
                else
                    [ "$color" = true ] && tput bold && tput setaf 1    # red
                    $PCP_ECHO_PROG $PCP_ECHO_N " - output mismatch (see $seq.out.bad)""$PCP_ECHO_C"
                    [ "$color" = true ] && tput sgr0                    # reset
                    mv $tmp.out $seq.out.bad
                    $PCP_ECHO_PROG
                    # $diff is the command used to show the differences
                    $diff $seq.out $seq.out.bad
                    err=true
                fi
            fi

            # make sure this test did not muck up the permissions or
            # ownership of key installed files and directories
            #
            sh 994 --fix >$tmp.out
            if [ -s $tmp.out ]
            then
                $PCP_ECHO_PROG $PCP_ECHO_N " - failed permissions check""$PCP_ECHO_C"
                echo >$tmp.head
                echo "*** Failed permissions/ownership checks ***" >>$tmp.head
                if [ -f $seq.out.bad ]
                then
                    cat $tmp.head $tmp.out >>$seq.out.bad
                elif [ -f $seq.out ]
                then
                    cp $seq.out $seq.out.bad
                    cat $tmp.head $tmp.out >>$seq.out.bad
                else
                    cat $tmp.head $tmp.out >$seq.out.bad
                fi
                err=true
            fi
            $PCP_ECHO_PROG ""
        fi
        # really tried to run the test, update the state
        #
        if $qatrace
        then
            # NOTE(review): $qaown is not set anywhere in this file and
            # the earlier tags are "$qasrc:$seq" - confirm it is meant
            if $err
            then
                pmtrace -qh $qadepot -v $qafailed "$qasrc:$seq:$qaown" 2>/dev/null
            else
                pmtrace -qh $qadepot -v $qapassed "$qasrc:$seq:$qaown" 2>/dev/null
            fi
        fi
    fi

    #
    # come here for each test, except when $showme is true
    #
    if $err
    then
        bad="$bad $seq"
        n_bad=`expr $n_bad + 1`
        quick=false
        [ $diff = true ] || echo "Check local PMCD is still alive ..."
        $OPTION_AGENTS && _haveagents
        $OPTION_LOGGER && _havelogger
    fi
    [ -f $seq.notrun ] || try=`expr $try + 1`
    rm -f $seq.notrun

    # optional callback
    #
    # the -x test looks in the current directory, so run it from there
    # too instead of depending on "." being in $PATH
    [ -x ./check.callback ] && ./check.callback $seq
done

# normal completion: not interrupted, exit status is the failure count
interrupt=false
status=$n_bad
exit