Improved check_ctdb
[Samba.git] / ctdb / utils / nagios / check_ctdb
blob837a0a4539a37221602590085533b0625a68f24e
1 #!/usr/bin/perl -w
2 # Nagios plugin to monitor CTDB (Clustered Trivial Database)
4 # License: GPL
5 # Copyright (c) 2011 Nantes Metropole
6 # Author: Mathieu Parent <math.parent@gmail.com>
7 # Contributor(s): -
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License version 2 as
11 # published by the Free Software Foundation.
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
22 use strict;
23 use warnings;
24 use vars qw($PROGNAME $VERSION $output $values $result);
25 use Nagios::Plugin;
26 use File::Basename;
# Name and version reported by the plugin; the name is taken from the path
# the script was invoked as.
$PROGNAME = basename($0);
$VERSION  = '0.4';

# Usage synopsis; %s is left as a placeholder for Nagios::Plugin to fill in.
my $usage_text = join('',
    "Usage: %s -i <info>\n",
    " [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]\n",
    " [ -H <host> ] [-s] [ -l <login_name> ]\n",
    ' [ -V ] [ -h ]',
);

# Free-form help text describing the values accepted by --info.
my $extra_text = join('',
    "Supported commands:\n",
    " * scriptstatus :\n",
    " check the ctdb scriptstatus command and return CRITICAL if one of the\n",
    " scripts fails.\n",
    " Perfdata count the number of scripts by state (ok, disabled, error,\n",
    " total).\n",
    " * ping :\n",
    " check the ctdb ping command.\n",
    " Perfdata count the number of nodes, the total ping time and the number\n",
    " of clients.\n",
    " Thresholds are checked against the number of nodes.\n",
    "\n\nCopyright (c) 2011 Nantes Metropole",
);

# The plugin object used for option parsing, thresholds, perfdata and exit.
my $np = Nagios::Plugin->new(
    usage     => $usage_text,
    version   => $VERSION,
    plugin    => $PROGNAME,
    shortname => uc($PROGNAME),
    blurb     => 'CTDB plugin',
    extra     => $extra_text,
    timeout   => 30,
);
# Command-line options, registered with the plugin in declaration order.
# Only --info is mandatory.
# The <host>/<login_name> placeholders match the usage synopsis
# ("-H <host>", "-l <login_name>").
my @option_specs = (
    {
        spec     => 'info|i=s',
        help     => "-i, --info=<info>\n"
                  . ' Information: One of scriptstatus or ping.',
        required => 1,
    },
    {
        spec     => 'hostname|H=s',
        help     => "-H, --hostname=<host>\n"
                  . ' Host name or IP Address.',
        required => 0,
    },
    {
        spec     => 'sudo|s',
        help     => "-s, --sudo\n"
                  . ' Use sudo.',
        required => 0,
    },
    {
        spec     => 'login|l=s',
        help     => "-l, --login=<login_name>\n"
                  . ' The user to log in as on the remote machine.',
        required => 0,
    },
    {
        spec     => 'warning|w=s',
        help     => "-w, --warning=THRESHOLD\n"
                  . " Warning threshold. See\n"
                  . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
                  . ' for the threshold format.',
        required => 0,
    },
    {
        spec     => 'critical|c=s',
        help     => "-c, --critical=THRESHOLD\n"
                  . " Critical threshold. See\n"
                  . " http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
                  . ' for the threshold format.',
        required => 0,
    },
);

foreach my $option (@option_specs) {
    $np->add_arg(%{$option});
}
# Parse the command line and copy the option values into file-level variables
# that the helper subs and the main section read.
$np->getopts;

my $opts     = $np->opts;
my $info     = $opts->info;
my $hostname = $opts->hostname;
my $login    = $opts->login;
my $sudo     = $opts->sudo;
my $warning  = $opts->warning;
my $critical = $opts->critical;

# True when the corresponding threshold contained a '%' sign.
my $percw;
my $percc;

# Status text accumulated by the checks and passed to nagios_exit().
$output = "";

# Strip a '%' from each threshold that was given; a threshold that is empty
# after stripping is treated as not given at all.
if (defined($critical)) {
    ($percc, $critical) = check_percantage($critical);
    if ($critical eq '') {
        $critical = undef;
    }
}

if (defined($warning)) {
    ($percw, $warning) = check_percantage($warning);
    if ($warning eq '') {
        $warning = undef;
    }
}

$np->set_thresholds(critical => $critical, warning => $warning);
# In-memory buffer that STDERR is redirected to while a command is open.
my $stderr;

# Open the global PIPE handle for reading the output of a command.
#
# Arguments: the command and its arguments, as a list (no shell is involved).
# The command is prefixed with "sudo" when --sudo was given, and wrapped in
# "ssh [-l <login>] <hostname>" when --hostname was given.
#
# STDERR is saved in OLDERR and redirected to $stderr; it stays redirected
# until safe_close_command() restores it. If the pipe cannot be opened,
# $result is set to CRITICAL, a message is appended to $output and STDERR is
# restored immediately.
sub safe_open_command {
    my @command = @_;

    unshift @command, "sudo" if $sudo;
    if ($hostname) {
        unshift @command, $hostname;
        unshift @command, "-l", $login if $login;
        unshift @command, "ssh";
    }

    # Save the real STDERR, then point STDERR at the in-memory buffer.
    open(OLDERR, ">&", \*STDERR) or die "Can't dup STDERR: $!";
    $stderr = "";
    close STDERR;
    open(STDERR, ">>", \$stderr) or die "Can't open STDERR: $!";

    if ($np->opts->verbose) {
        print "Executing: @command\n";
    }

    if (!open(PIPE, '-|', @command)) {
        $result = CRITICAL;
        $output .= "Cannot open command '@command': $! ($stderr). ";
        # restore STDERR; ">&" duplicates the saved handle (plain ">" with a
        # glob reference would not reattach STDERR to it)
        open(STDERR, ">&", \*OLDERR) or die "Can't dup OLDERR: $!";
    }
}
# Close the PIPE handle opened by safe_open_command() and translate the
# child's wait status into $result / $output:
#   - could not execute, killed by a signal, or exit code other than 0/255:
#     CRITICAL, with a message appended to $output;
#   - exit code 255: WARNING (never downgrading an existing CRITICAL), with a
#     message appended only when $output is still empty;
#   - exit code 0: $result and $output are left untouched.
# Finally STDERR is restored from the OLDERR handle saved on open.
sub safe_close_command {
    close(PIPE);

    # Snapshot the wait status set by close() on the piped command.
    my $wait_status = $?;

    if ($wait_status == -1) {
        $result = CRITICAL;
        $output .= "failed to execute: $!. ";
    } elsif ($wait_status & 127) {
        $result = CRITICAL;
        $output .= sprintf("child died with signal %d, %s coredump. ",
            ($wait_status & 127), ($wait_status & 128) ? 'with' : 'without');
    } elsif ($wait_status >> 8) {
        my $exit_code = $wait_status >> 8;
        if ($exit_code == 255) {
            # ctdb returns -1=255 if any node is disconnected
            # Keep a CRITICAL raised while parsing the command output.
            $result = WARNING if $result != CRITICAL;
            $output .= sprintf("child exited with value %d. ", $exit_code) if $output eq "";
        } else {
            $result = CRITICAL;
            $output .= sprintf("child exited with value %d. ", $exit_code);
        }
    }

    # restore STDERR
    open(STDERR, ">&", \*OLDERR) or die "Can't dup OLDERR: $!";
}
# main :
# Dispatch on --info, run the matching ctdb command through
# safe_open_command()/safe_close_command(), record perfdata and exit through
# nagios_exit() with the accumulated $result and $output.

if ($info eq "scriptstatus") {
    $result = OK;
    safe_open_command('ctdb', '-Y', 'scriptstatus');
    if ($result == OK) {
        my $script_count = 0;
        my $ok_script_count = 0;
        my $disabled_script_count = 0;
        my $error_script_count = 0;
        while (<PIPE>) {
            next if $. == 1; # Header
            $script_count++;
            # chomp (not chop) so a final line without a newline is not truncated
            chomp;
            my ($col0, $type, $name, $code, $status, $start, $end, @error) = split(":");
            if ($col0 ne '') {
                # Old version, before 30 Aug 2011 and commit a779d83a6213
                # (no leading empty column: shift every field one to the right)
                ($type, $name, $code, $status, $start, $end, @error) = ($col0, $type, $name, $code, $status, $start, $end, @error);
            }
            # The error text may itself contain ':' and was split apart above.
            my $error = join(':', @error);
            if ($error ne "") {
                $output = "$output ;; " if $output;
                $output = "$output$name ($status=$code): $error ";
                if ($result != CRITICAL) {
                    $result = WARNING;
                }
            }
            if ($status eq "OK") {
                $ok_script_count++;
                next;
            }
            if ($status eq "DISABLED") {
                $disabled_script_count++;
                next;
            }
            # Any status other than OK or DISABLED counts as an error.
            $error_script_count++;
            $result = WARNING;
        }
        safe_close_command();
        $np->add_perfdata(label => "ok", value => $ok_script_count, uom => '',
            min => 0, max => $script_count);
        $np->add_perfdata(label => "disabled", value => $disabled_script_count, uom => '',
            min => 0, max => $script_count);
        $np->add_perfdata(label => "error", value => $error_script_count, uom => '',
            min => 0, max => $script_count, warning => '0', critical => '0');
        $np->add_perfdata(label => "total", value => $script_count, uom => '',
            min => 0, max => $script_count);
        if ($result == OK) {
            $result = $np->check_threshold(check => $error_script_count, warning => '0', critical => '0');
        }
    }
    $np->nagios_exit($result, $output);
} elsif ($info eq "ping") {
    # Get expected nodes count
    $result = OK;
    safe_open_command('cat', '/etc/ctdb/nodes');
    # Drain the pipe; $. then holds the number of lines read from it.
    1 while( <PIPE> );
    my $max_nodes_count = $.;
    safe_close_command();
    # ctdb ping
    # (the status of the node-count command above is deliberately reset)
    $result = OK;
    safe_open_command('ctdb', '-n', 'all', 'ping');
    if ($result == OK) {
        my $nodes_count = 0;
        my $time_total = 0.0;
        my $clients_count = 0;
        while (<PIPE>) {
            # chomp (not chop) so a final line without a newline still matches
            chomp;
            # One or more spaces are accepted before "(N clients)".
            if ($_ =~ /^response from (\d+) time=([0-9.]+) sec +\((\d+) clients\)$/) {
                my ($time, $clients) = ($2, $3);
                $nodes_count += 1;
                $time_total += $time;
                $clients_count += $clients;
            } elsif ($_ =~ /^Unable to get ping response from node (\d+)$/) {
                # Unreachable node: it is simply not counted, and shows up in
                # the "missing nodes" message below.
            } else {
                $result = CRITICAL;
                $output .= "'$_' doesn't match regexp. ";
            }
        }
        $output .= sprintf("%d missing nodes. ", $max_nodes_count - $nodes_count) if $nodes_count < $max_nodes_count;
        safe_close_command();
        $np->add_perfdata(label => "nodes", value => $nodes_count, uom => '',
            min => 0, max => $max_nodes_count, warning => $warning, critical => $critical);
        $np->add_perfdata(label => "ping_time", value => $time_total, uom => 's',
            min => 0, max => undef);
        $np->add_perfdata(label => "clients", value => $clients_count, uom => '',
            min => 0, max => undef);
        if ($result == OK) {
            # Uses the thresholds registered earlier with set_thresholds().
            $result = $np->check_threshold(check => $nodes_count);
        }
    }
    $np->nagios_exit($result, $output);
} else {
    $np->nagios_exit(UNKNOWN, "Unknown command: '$info'");
}
# Remove the first '%' sign, if any, from a threshold string.
#
# Argument: the threshold string.
# Returns a two-element list: a flag that is true when a '%' was found and
# removed (false otherwise), and the threshold with that '%' removed.
sub check_percantage
{
    my ($threshold) = @_;
    my $percent_pos = index($threshold, '%');
    my $has_percent = ($percent_pos >= 0);
    if ($has_percent) {
        # Cut out just the one character at the position found.
        substr($threshold, $percent_pos, 1, '');
    }
    return ($has_percent, $threshold);
}