Refactoring: Changed all check parameters starting with an 'o' to the new rulespec...
[check_mk.git] / agents / plugins / mk_db2.aix
blobed59cc37a3fbcbb3696a3fb50fb45520143b13d3
#!/usr/bin/ksh
# Monitor DB/2 databases on AIX
# $HOME/sqllib/db2profile
#
# This script can be called in two ways
# Without any arguments:
#   Checks if the cache of each instance is up to date and prints it.
#   If it is outdated the script calls itself in the background as
#   'mk_db2.aix query {instance}' to refresh it.
# With 'query {instance}' as argument:
#   Does the actual queries to the db2 instance and writes this info
#   to stdout (the caller redirects it into the cache file).
# Each instance has its own cache file and all of them are filled in parallel

# MK_CONFDIR is mandatory; without it there is no fallback location for
# the cache directory either.
if [ ! "$MK_CONFDIR" ] ; then
    echo "MK_CONFDIR not set!" >&2
    exit 1
fi

# MK_VARDIR holds the cache files; fall back to MK_CONFDIR when it is unset.
if [ ! "$MK_VARDIR" ] ; then
    export MK_VARDIR="$MK_CONFDIR"
fi
# waitmax SECONDS [COMMAND...]
#
# Runs COMMAND (or, when no command is given, the script text read from
# stdin) in a background 'ksh -c' and polls every 0.1 seconds until it is
# gone or SECONDS have elapsed.
#
# Returns: 0   as soon as the background process has disappeared
#              (its own exit status is NOT propagated)
#          255 if it was still running after the timeout and had to be killed
function waitmax
{
    # The loop below sleeps 0.1s per round, so the number of rounds is
    # ten times the number of seconds: appending a "0" multiplies by 10.
    TIMEOUT=${1}0
    SIGNAL=9
    shift

    # Run command in background
    if [ "${#}" -ge 1 ] ; then
        ksh -c "$*" &
    else
        # No command on the command line: take the script from stdin
        TEST=$(cat)
        ksh -c "$TEST" &
    fi

    PID=$!

    # Wait for termination within TIMEOUT tenths of a second
    while [ "$TIMEOUT" -gt 0 ]
    do
        TIMEOUT=$((TIMEOUT - 1))
        if [ ! -e "/proc/$PID" ] ; then
            return 0
        fi
        # Sub-second sleep; plain 'sleep' only offers whole seconds here
        perl -e "select(undef, undef, undef, 0.1);"
    done

    # Process did not terminate in time. Kill and
    # return with an error
    kill -"$SIGNAL" "$PID"
    return 255
}
# query_instance INSTANCE
#
# Queries every database of the DB2 instance owned by the user INSTANCE and
# prints the agent sections (db2_version, db2_tablespaces, db2_counters,
# db2_logsizes, db2_connections, db2_bp_hitratios, db2_sort_overflow,
# db2_backup) to stdout. The whole query runs as that user via 'su' and is
# limited to 200 seconds by waitmax.
#
# Returns the exit status of waitmax (0 = finished, 255 = killed on timeout).
function query_instance {
    INSTANCE=$1
    # find home directory: exact match on the user name field, so that
    # e.g. "db2inst1" does not also pick up "db2inst10"
    HOMEDIR=$(awk -F: -v user="$INSTANCE" '$1 == user { print $6; exit }' /etc/passwd)
    NOW=$(perl -e "print time();")

    # Quoting levels of the nested here-documents below:
    #   $VAR      is expanded right here by this shell (INSTANCE, HOMEDIR, NOW)
    #   \\\$VAR   survives the WAITMAX and the EOF here-document and is
    #             expanded last, by the shell that 'su' starts
    # Anything that must be evaluated at query time therefore needs the
    # triple backslash. The terminators EOF and WAITMAX must stay in column 0.
    waitmax 200 << WAITMAX
su $INSTANCE << EOF

    if [ ! -f $HOMEDIR/sqllib/db2profile ] ;
    then
        exit 0
    fi

    . $HOMEDIR/sqllib/db2profile >/dev/null 2>&1 ;


    function compare_version_greater_equal {
        GREATER_ONE=\\\$(echo "\\\$1 \\\$2" | awk "{if (\\\$1 >= \\\$2) print \\\$1; else print \\\$2}")
        if [ \\\$GREATER_ONE == \\\$1 ] ; then
            return 0
        else
            return 1
        fi
    }

    echo '<<<db2_version:sep(1)>>>'
    DBVERSION=\\\$(db2 get snapshot for dbm | grep -e 'Product name' -e 'Service level' | awk -v FS='=' '{print \\\$2}' | sed 'N;s/\n/,/g' | sed 's/ //g')
    echo $INSTANCE \\\$DBVERSION
    VERSION_NUMBER=\\\$(echo \\\$DBVERSION | sed -e 's/DB2v\\\(.*\),.*/\\\1/' | awk -v FS="." '{print \\\$1"."\\\$2}')

    DBS=\\\$(db2 list database directory on $HOMEDIR | grep 'Database name' | awk '{ print \\\$NF }')

    GET_PORT=1
    DB_PORT='port 0'
    for DB in \\\$DBS; do
        db2 connect to \\\$DB > /dev/null;
        # The exit status has to be read at query time, hence the escaping
        if [ \\\$? -ne 0 ] ; then
            exit 1
        fi

        if [ 1 -eq \\\$GET_PORT ] ; then
            # Each database in an instance has the same port information
            db2_tcp_service=\\\$(db2 -x get dbm cfg | grep $INSTANCE | grep "TCP/IP Service" | awk -v FS='=' '{print \\\$2}'|tr -d ' ')
            if ( grep \\\$db2_tcp_service /etc/services | grep -q "^\\\$db2_tcp_service " ); then
                DB_PORT='port '\\\$(grep \\\$db2_tcp_service /etc/services | grep "^\\\$db2_tcp_service " | awk '{print \\\$2}' | awk -v FS="/" '{print \\\$1}')
            fi
            GET_PORT=0
        fi

        echo "<<<db2_tablespaces>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        db2 "SELECT tbsp_name, tbsp_type, tbsp_state, tbsp_usable_size_kb, tbsp_total_size_kb, tbsp_used_size_kb, tbsp_free_size_kb FROM sysibmadm.tbsp_utilization WHERE tbsp_type = 'DMS' UNION ALL SELECT tu.tbsp_name, tu.tbsp_type, tu.tbsp_state, tu.tbsp_usable_size_kb, tu.tbsp_total_size_kb, tu.tbsp_used_size_kb, (cu.fs_total_size_kb - cu.fs_used_size_kb) AS tbsp_free_size_kb FROM sysibmadm.tbsp_utilization tu INNER JOIN ( SELECT tbsp_id, 1 AS fs_total_size_kb, 0 AS fs_used_size_kb FROM sysibmadm.container_utilization WHERE (fs_total_size_kb IS NULL OR fs_used_size_kb IS NULL) GROUP BY tbsp_id) cu ON (tu.tbsp_type = 'SMS' AND tu.tbsp_id = cu.tbsp_id) UNION ALL SELECT tu.tbsp_name, tu.tbsp_type, tu.tbsp_state, tu.tbsp_usable_size_kb, tu.tbsp_total_size_kb, tu.tbsp_used_size_kb, (cu.fs_total_size_kb - cu.fs_used_size_kb) AS tbsp_free_size_kb FROM sysibmadm.tbsp_utilization tu INNER JOIN ( SELECT tbsp_id, SUM(fs_total_size_kb) AS fs_total_size_kb, SUM(fs_used_size_kb) AS fs_used_size_kb FROM sysibmadm.container_utilization WHERE (fs_total_size_kb IS NOT NULL AND fs_used_size_kb IS NOT NULL) GROUP BY tbsp_id) cu ON (tu.tbsp_type = 'SMS' AND tu.tbsp_id = cu.tbsp_id)" | awk '{print \\\$1" "\\\$2" "\\\$3" "\\\$4" "\\\$5" "\\\$6" "\\\$7}' | sed -e '/^[ ]*$/d' -e '/^-/d' -e '/selected/d'

        echo "<<<db2_counters>>>"
        echo "TIMESTAMP $NOW"
        cat \\\$(db2 get dbm cfg|grep "Default database path"|awk -v FS="=" '{print \\\$2"/sqllib/db2nodes.cfg"}'|tr -d ' ') | sed "s/\(.*\)/$INSTANCE:\\\$DB node \1/"
        db2 -x "SELECT deadlocks from sysibmadm.snapdb" | tr -d ' ' | sed "s/\(.*\)/$INSTANCE:\\\$DB deadlocks \1/"
        db2 -x "SELECT lock_waits from sysibmadm.snapdb" | tr -d ' ' | sed "s/\(.*\)/$INSTANCE:\\\$DB lockwaits \1/"
        db2 -x "SELECT sort_overflows from sysibmadm.snapdb" | tr -d ' ' | sed "s/\(.*\)/$INSTANCE:\\\$DB sortoverflows \1/"

        echo "<<<db2_logsizes>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        echo "TIMESTAMP $NOW"
        cat \\\$(db2 get dbm cfg|grep "Default database path"|awk -v FS="=" '{print \\\$2"/sqllib/db2nodes.cfg"}'|tr -d ' ') | sed 's/\(.*\)/node \1/'
        db2 -x "SELECT 'usedspace', total_log_used from sysibmadm.snapdb" | awk '{print \\\$1" "\\\$2}'
        db2 -x "SELECT NAME, VALUE FROM SYSIBMADM.DBCFG WHERE NAME IN ('logfilsiz','logprimary','logsecond')"| awk '{print \\\$1" "\\\$2}'

        echo "<<<db2_connections>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        echo \\\$DB_PORT
        echo "connections " | tr -d '\n'
        db2 list applications | grep -v Auth | grep -v Name | sed -e '/^$/d' | wc -l | tr -d ' '
        # TODO: the time command seems to be broken and outputs 1 second steps
        ksh -c "time db2 connect to \\\$DB > /dev/null" 2>&1 | grep real | awk '{print "latency "\\\$2}'| sed -e 's/m/:/' -e 's/s//'

        echo "<<<db2_bp_hitratios>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        cat \\\$(db2 get dbm cfg|grep "Default database path"|awk -v FS="=" '{print \\\$2"/sqllib/db2nodes.cfg"}'|tr -d ' ') | sed "s/\(.*\)/node \1/"
        db2 "SELECT SUBSTR(BP_NAME,1,14) AS BP_NAME, TOTAL_HIT_RATIO_PERCENT, DATA_HIT_RATIO_PERCENT, INDEX_HIT_RATIO_PERCENT, XDA_HIT_RATIO_PERCENT FROM SYSIBMADM.BP_HITRATIO" | grep -v "selected." | sed -e '/^$/d' -e '/^-/d'

        echo "<<<db2_sort_overflow>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        db2 -x "get snapshot for database on \\\$DB" | grep -e "^Total sorts" -e "^Sort overflows" | tr -d '='

        echo "<<<db2_backup>>>"
        echo "[[[$INSTANCE:\\\$DB]]]"
        if compare_version_greater_equal \\\$VERSION_NUMBER 10.5; then
            # MON_GET_DATABASE(-2) gets information of all active members
            db2 -x "select LAST_BACKUP from TABLE (MON_GET_DATABASE(-2))" | grep -v "selected." | tail -n 1
        else
            db2 -x "select SQLM_ELM_LAST_BACKUP from table(SNAPSHOT_DATABASE( cast( null as VARCHAR(255)), cast(null as int))) as ref" | grep -v "selected." | tail -n 1
        fi

        # disconnect from database
        db2 connect reset > /dev/null
    done
EOF
WAITMAX
    return $?
}
# Entry point.
#   'query INSTANCE' : run the live queries for one instance and exit with
#                      the status of query_instance
#   no arguments     : print the cached data of every running instance and
#                      start a background refresh where the cache is outdated
if [ "$1" = "query" ]; then
    query_instance "$2"
    exit $?
else
    #### RUN CACHED #####

    # file_age FILE
    # Prints the age of FILE in seconds (script start time minus the file's
    # modification time). Prints nothing on stdout if FILE does not exist.
    function file_age {
        /usr/bin/perl -e 'if (! -f $ARGV[0]){die "0000000"};$mtime=(stat($ARGV[0]))[9];print ($^T-$mtime);' "$1"
    }

    if [ ! -d "$MK_VARDIR/cache" ]; then mkdir -p "$MK_VARDIR/cache" ; fi

    # Remove the single cache file without an instance suffix; the cache is
    # kept per instance (mk_db2.aix.cache.INSTANCE) below.
    if [ -e "$MK_VARDIR/cache/mk_db2.aix.cache" ] ; then
        rm "$MK_VARDIR/cache/mk_db2.aix.cache"
    fi

    # Owners of the running db2sysc processes are the instance names.
    # The [d] keeps grep from matching itself; sort -u avoids handling an
    # instance twice when it runs more than one db2sysc process.
    INSTANCES=$(ps -ef | grep '[d]b2sysc' | awk '{print $1 }' | sort -u)

    # Output any section headers
    # If no data is available there will be at least the section headers
    # This happens when a database is down. In this scenario the db2_version check
    # should go CRIT and the other checks go stale
    echo "<<<db2_version:sep(1)>>>"
    echo "<<<db2_tablespaces>>>"
    echo "<<<db2_counters>>>"
    echo "<<<db2_logsizes>>>"
    echo "<<<db2_connections>>>"
    echo "<<<db2_bp_hitratios>>>"
    echo "<<<db2_sort_overflow>>>"
    echo "<<<db2_backup>>>"

    for INSTANCE in $INSTANCES; do
        CACHEFILE="$MK_VARDIR/cache/mk_db2.aix.cache.$INSTANCE"
        MAXAGE=300

        if [ -e "$CACHEFILE.new" ] ; then
            AGE=$(file_age "$CACHEFILE.new")
            if [ "$AGE" -ge $((MAXAGE * 10)) ] ; then
                # The creation of the cache takes way too long: delete the
                # file, the process might have crashed...
                # Since the processes are called with waitmax it is very
                # unlikely that there are still unwanted processes soiling
                # the system.
                rm "$CACHEFILE.new"
            elif [ "$AGE" -ge $((MAXAGE * 2)) ] ; then
                # The creation of the cache takes suspiciously long (the
                # modification time of $CACHEFILE.new is older than twice
                # MAXAGE): output nothing for this instance, but still
                # handle the remaining instances.
                continue
            fi
        fi

        # Check if cache file exists and is recent enough
        USE_CACHEFILE=""
        if [ -s "$CACHEFILE" ] ; then
            AGE=$(file_age "$CACHEFILE")
            if [ "$AGE" -le "$MAXAGE" ] ; then USE_CACHEFILE=1 ; fi
            # Output the file in any case, even if it is
            # outdated. The new file will not yet be available
            cat "$CACHEFILE"
        fi

        # Cache file outdated and new job not yet running? Start it
        # (noclobber makes the redirection fail if $CACHEFILE.new already
        # exists, so two refresh jobs cannot write the same file)
        if [ -z "$USE_CACHEFILE" ] && [ ! -e "$CACHEFILE.new" ] ; then
            echo "set -o noclobber ; exec > \"$CACHEFILE.new\" || exit 1 ; ./$0 query $INSTANCE && mv \"$CACHEFILE.new\" \"$CACHEFILE\" || rm -f \"$CACHEFILE\" \"$CACHEFILE.new\"" | nohup ksh 2>/dev/null &
        fi

    done

fi

exit 0