2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# How to discover diskstat services: "rule" consults the diskstat_inventory
# rule set below; the other values are legacy fixed modes.
diskstat_inventory_mode = "rule"  # "summary", "single", "legacy"

# Default parameters for discovered services. All levels are disabled by
# default; the commented entries document the supported keys and units.
diskstat_default_levels = {
    # "read" : (10, 20),           # MB/sec
    # "write" : (20, 40),          # MB/sec
    # "average" : 15,              # min
    # "latency" : (10, 20),        # ms
    # "latency_perfdata" : True,
}

# Rule for controlling diskstat inventory more fine grained
diskstat_inventory = []
# Example:
# diskstat_inventory = [
#   ( [], [ 'linux' ], ALL_HOSTS ),  --> No diskstat on this host
#   ( [ 'summary', 'physical', 'lvm', 'vxvm' ], ALL_HOSTS ),
# ]

# Matches partition devices (a trailing digit means a partition such as
# "xvda1" or "sda2", not a whole disk).
diskstat_diskless_pattern = re.compile("x?[shv]d[a-z]*[0-9]+")
def inventory_diskstat_generic(parsed):
    # Discovery function shared by the list-based diskstat checks.
    # Returns a list of (item, default-levels-varname) tuples according to
    # the configured inventory mode(s), or None on empty agent data.
    #
    # NOTE(review): this block was reconstructed from a mangled source with
    # dropped lines; restored following the upstream Check_MK
    # diskstat.include structure — confirm against the project history.

    # Skip over on empty data
    if not parsed:
        return

    # New style: use rule based configuration, defaulting to summary mode
    if diskstat_inventory_mode == "rule":
        hits = host_extra_conf(host_name(), diskstat_inventory)
        if len(hits) > 0:
            modes = hits[0]
        else:
            modes = ["summary"]

    elif diskstat_inventory_mode == "single":
        modes = ["physical"]

    elif diskstat_inventory_mode == "summary":
        modes = ["summary"]

    else:
        modes = ["legacy"]

    inventory = []
    if "summary" in modes:
        inventory.append(("SUMMARY", "diskstat_default_levels"))

    if "legacy" in modes:
        inventory += [("read", None), ("write", None)]

    for line in parsed:
        name = line[1]
        # Physical disks: no spaces in the name (rules out "LVM x" etc.)
        # and not a partition of a disk.
        if "physical" in modes and \
           ' ' not in name and \
           not diskstat_diskless_pattern.match(name):
            inventory.append((name, "diskstat_default_levels"))

        if "lvm" in modes and \
           name.startswith("LVM "):
            inventory.append((name, "diskstat_default_levels"))

        if "vxvm" in modes and \
           name.startswith("VxVM "):
            inventory.append((name, "diskstat_default_levels"))

        if "diskless" in modes and \
           diskstat_diskless_pattern.match(name):
            # Sort of partitions with disks - typical in XEN virtual setups.
            # Eg. there are xvda1, xvda2, but no xvda...
            inventory.append((name, "diskstat_default_levels"))

    return inventory
def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    # Check one diskstat data row (list-based API). Expected row layout:
    #   line = [ node, devicename, read_ctr, write_ctr,
    #            (read_ios, write_ios, (time_ms, (read_ql_ctr, write_ql_ctr))) ]
    # where trailing elements are optional. Returns the classical
    # (status, infotext, perfdata) tuple.
    #
    # NOTE(review): this block was reconstructed from a mangled source with
    # dropped lines; restored following the upstream Check_MK
    # diskstat.include structure — confirm against the project history.
    average_range = params.get("average")
    if average_range == 0:
        average_range = None  # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    if node is not None and node != "":
        infos.append("Node %s" % node)

    prediction_perf = []
    for what, ctr in [("read", line[2]), ("write", line[3])]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if isinstance(levels, tuple):
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec (Linux sectors are 512 bytes)
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        infos.append("%s/sec %s" % (get_bytes_human_readable(bytes_per_sec), what))
        perfdata.append((what, bytes_per_sec, warn, crit))
        dsname = what

        # compute average of the rate over ___ minutes
        if average_range is not None:
            avg = get_average(countername + ".avg", this_time, bytes_per_sec, average_range)
            dsname = what + ".avg"
            perfdata.append((dsname, avg))
            # levels are then applied to the averaged value
            bytes_per_sec = avg

        # check levels
        state, text, extraperf = check_levels(
            bytes_per_sec, dsname, levels, unit="MB/s", scale=1048576, statemarkers=True)
        if text:
            infos.append(text)
        status = max(state, status)
        prediction_perf += extraperf

    # Add performance data for averaged IO
    if average_range is not None:
        # regroup to: read, write, read.avg, write.avg
        perfdata = [perfdata[0], perfdata[2], perfdata[1], perfdata[3]]

    # Process IOs when available
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        if "read_ios" in params:
            warn, crit = params["read_ios"]
            if reads >= crit:
                infos.append('Read operations: %d (!!)' % (reads))
                status = 2
            elif reads >= warn:
                infos.append('Read operations: %d (!)' % (reads))
                status = max(status, 1)
        else:
            warn, crit = None, None
        if "write_ios" in params:
            warn, crit = params["write_ios"]
            if writes >= crit:
                infos.append('Write operations: %d (!!)' % (writes))
                status = 2
            elif writes >= warn:
                infos.append('Write operations: %d (!)' % (writes))
                status = max(status, 1)
        else:
            warn, crit = None, None

        ios = reads + writes
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0  # no IO info -> avoid division by zero
        else:
            latency = timems_per_sec / ios_per_sec
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += " (!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += " (!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [("read", line[7]), ("write", line[8])]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if isinstance(levels, tuple):
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            ql = qlx / 10000000.0  # counter is in 100ns units
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels is not None:
                if ql >= crit:
                    status = 2
                    infos[-1] += " (!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += " (!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    perfdata += prediction_perf

    return (status, ", ".join(infos), perfdata)
def check_diskstat_generic(item, params, this_time, info, mode='sectors'):
    # Dispatcher for the list-based diskstat checks: legacy read/write items
    # go to _check_diskstat_old, everything else is summed up per device (or
    # over all physical devices for "SUMMARY") and checked by
    # check_diskstat_line. Returns the classical check tuple.
    #
    # NOTE(review): reconstructed from a mangled source with dropped lines;
    # the length of the counter accumulator follows upstream — confirm it
    # covers all counter columns the agents provide.

    # legacy version if item is "read" or "write"
    if item in ['read', 'write']:
        return _check_diskstat_old(item, params, this_time, info)

    # Sum up either all physical disks (if item is "SUMMARY") or
    # all entries matching the item in question. It is not a bug if
    # a disk appears more than once. This can for example happen in
    # Windows clusters - even if they are no Check_MK clusters.
    summed_up = [0] * 13
    matching = 0

    for line in info:
        if item == 'SUMMARY' and line[0] is not None:
            return 3, "summary mode not supported in a cluster"

        elif item == 'SUMMARY' and ' ' in line[1]:
            continue  # skip non-physical disks

        elif item == 'SUMMARY' or line[1] == item:
            matching += 1
            summed_up = [x + int(y) for x, y in zip(summed_up, line[2:])]

    if not matching:
        return 3, "No matching disk found"
    return check_diskstat_line(this_time, item, params, [None, ''] + summed_up, mode)
270 # This is the legacy version of diskstat as used in <= 1.1.10.
271 # We keep it here for a while in order to be compatible with
273 def _check_diskstat_old(item
, params
, this_time
, info
):
274 # sum up over all devices
276 index
= 2 # sectors read
277 elif item
== 'write':
278 index
= 3 # sectors written
280 return (3, "invalid item %s" % (item
,))
284 if line
[0] is not None:
285 return 3, "read/write mode not supported in a cluster"
286 if ' ' not in line
[1]:
287 this_val
+= int(line
[index
])
289 per_sec
= get_rate("diskstat." + item
, this_time
, this_val
)
290 mb_per_s
= per_sec
/ 2048.0 # Diskstat output is in sectors a 512 Byte
291 kb_per_s
= per_sec
/ 2.0
292 perfdata
= [(item
, "%f" % kb_per_s
)]
293 return (0, "%.1f MB/s" % mb_per_s
, perfdata
)
297 # .--Dict based API------------------------------------------------------.
298 # | ____ _ _ _ _ _ ____ ___ |
299 # | | _ \(_) ___| |_ | |__ __ _ ___ ___ __| | / \ | _ \_ _| |
300 # | | | | | |/ __| __| | '_ \ / _` / __|/ _ \/ _` | / _ \ | |_) | | |
301 # | | |_| | | (__| |_ | |_) | (_| \__ \ __/ (_| | / ___ \| __/| | |
302 # | |____/|_|\___|\__| |_.__/ \__,_|___/\___|\__,_| /_/ \_\_| |___| |
304 # +----------------------------------------------------------------------+
305 # | The newest generation of Disk IO checks parses all information into |
306 # | a dictionary, where counters are already resolved. Look at diskstat |
307 # | (the Linux diskstat check) for an example. |
308 # '----------------------------------------------------------------------'
def diskstat_select_disk(disks, item):
    # Select the stats dict for one item from "disks" (device name -> dict
    # of resolved rates). For item "SUMMARY" a synthetic dict is built by
    # adding up throughput-like values and averaging the rest; returns None
    # if the item is not present.
    #
    # NOTE(review): reconstructed from a mangled source with dropped lines;
    # restored following the upstream Check_MK diskstat.include structure.

    # In summary mode we add up the throughput values, but
    # we average the other values for disks that have a throughput
    # > 0. Note: This is not very precise. Strictly spoken
    # we would need to do the summarization directly in the
    # parse function. But there we do not have information about
    # the physical multipath devices and would add up the traffic
    # of the paths with the traffic of the device itself....
    if item == "SUMMARY":
        summarized = {
            "node": None,
            # We do not set these settings explicitly because some
            # devices may not provide all of them.
            # "read_throughput"            : 0.0,
            # "write_throughput"           : 0.0,
            # "utilization"                : 0.0,
            # "latency"                    : 0.0,
            # "average_request_size"       : 0.0,
            # "average_wait"               : 0.0,
            # "average_read_wait"          : 0.0,
            # "average_read_request_size"  : 0.0,
            # "average_write_wait"         : 0.0,
            # "average_write_request_size" : 0.0,
            # "queue_length"               : 0.0,
        }

        num_averaged = 0
        for device, disk in disks.items():
            # If all disks are idle the summarized dict would have no keys
            # So we take care that at least all keys of this disk are set
            for key in disk.keys():
                if key != "node":
                    summarized.setdefault(key, 0.0)

            if device.startswith("LVM "):
                continue  # skip LVM devices for summary

            if True or disk["read_throughput"] + disk["write_throughput"] > 0:  # skip idle disks
                num_averaged += 1
                for key, value in disk.items():
                    if key != "node":
                        summarized[key] += value

        if num_averaged > 0:
            for key, value in summarized.items():
                # Throughput-like values stay summed; rate/ratio values
                # are averaged over the participating disks.
                if key.startswith("ave") or key in ("utilization", "latency", "queue_length"):
                    summarized[key] /= num_averaged

        return summarized

    elif item not in disks:
        return None

    else:
        return disks[item]
375 # New version for this diskstat checks that use the new dict
376 # format. The first one is "diskstat" - the Linux version of
377 # this check. Look there for examples of the format of the
378 # dictionary "disks". Example:
379 # disks = { "sda" : {
381 # 'average_read_request_size' : 0.0,
382 # 'average_read_wait' : 0.0,
383 # 'average_request_size' : 40569.90476190476,
384 # 'average_wait' : 0.761904761904762,
385 # 'average_write_request_size' : 40569.90476190476,
386 # 'average_write_wait' : 0.0007619047619047619,
388 # 'read_throughput' : 0.0,
389 # 'latency' : 0.00038095238095238096,
390 # 'utilization' : 0.0006153846153846154,
391 # 'write_ios' : 1.6153846153846154,
392 # 'write_throughput' : 65536.0,
393 # 'queue_length' : 0.0,
def check_diskstat_dict(item, params, disks):
    # Dict-based check function. "disks" maps device name -> dict of
    # already-resolved rates (see the Linux "diskstat" check for the format).
    # Yields (state, infotext[, perfdata]) subresults.
    #
    # NOTE(review): reconstructed from a mangled source with dropped lines;
    # restored following the upstream Check_MK diskstat.include structure.
    # The params keys passed to some check_levels() calls could not be fully
    # verified — confirm against the matching WATO rule set.

    # Take care of previously discovered services
    if item in ("read", "write"):
        yield 3, "Sorry, the new version of this check does not " \
                 "support one service for read and one for write anymore."
        return

    this_time = time.time()
    disk = diskstat_select_disk(disks, item)
    if not disk:
        return

    # Note: this check uses a simple method of averaging: As soon as averaging
    # is turned on the actual metrics are *replaced* by the averaged ones. No
    # duplication of performance data or check output here. This is because we
    # have so many metrics...
    averaging = params.get("average")  # in seconds here!
    if averaging:
        avg_disk = {}  # Do not modify our arguments!!
        for key, value in disk.items():
            if isinstance(value, (int, float)):
                # get_average expects the backlog in minutes
                avg_disk[key] = get_average("diskstat.%s.%s.avg" % (item, key), this_time, value,
                                            averaging / 60.0)
            else:
                avg_disk[key] = value
        disk = avg_disk
        prefix = "%s average: " % get_age_human_readable(averaging)
    else:
        prefix = ""

    # Utilization
    if "utilization" in disk:
        util = disk.pop("utilization")
        yield check_levels(util * 100.0,
                           "disk_utilization",
                           params.get("utilization"),
                           unit="%",
                           statemarkers=False,
                           infoname=prefix + "Utilization")

    # Throughput
    for what in "read", "write":
        if what + "_throughput" in disk:
            throughput = disk.pop(what + "_throughput")
            state, infotext, extraperf = check_levels(
                throughput,
                "disk_" + what + "_throughput",
                params.get(what + "_throughput"),
                statemarkers=False,
                human_readable_func=lambda x: get_bytes_human_readable(x) + '/s',
                infoname=what.title())
            yield state, infotext, extraperf

    # Average wait from end to end
    for what in ["wait", "read_wait", "write_wait"]:
        if "average_" + what in disk:
            wait = disk.pop("average_" + what)
            yield check_levels(wait * 1000.0,
                               "disk_average_" + what,
                               params.get("average_" + what),
                               unit="ms",
                               scale=0.001,
                               statemarkers=False,
                               infoname="Average %s" % what.title().replace("_", " "))

    # Average disk latency
    if "latency" in disk:
        latency = disk.pop("latency")
        yield check_levels(latency * 1000.0,
                           "disk_latency",
                           params.get("latency"),
                           unit="ms",
                           scale=0.001,
                           statemarkers=False,
                           infoname=prefix + "Latency")

    # Queue lengths
    for what, plugin_text in [
        ("queue_length", "Queue Length"),
        ("read_ql", "Read Queue Length"),
        ("write_ql", "Write Queue Length"),
    ]:
        if what in disk:
            qlen = disk.pop(what)
            yield check_levels(qlen,
                               "disk_" + what,
                               params.get(what),
                               statemarkers=False,
                               infoname="Average %s" % plugin_text)

    # IO operation rates
    for what in "read", "write":
        if what + "_ios" in disk:
            ios = disk.pop(what + "_ios")
            yield check_levels(ios,
                               "disk_" + what + "_ios",
                               params.get(what + "_ios"),
                               unit="1/s",
                               statemarkers=False,
                               infoname="%s operations" % what.title(),
                               )

    # All the other metrics are currently not output in the plugin output - simply because
    # of their amount. They are present as performance data and will shown in graphs.

    # Send everything as performance data now. Sort keys alphabetically
    perfdata = []
    for key in sorted(disk.keys()):
        value = disk[key]
        if isinstance(value, (int, float)):
            # Currently the levels are not shown in the perfdata
            perfdata.append(("disk_" + key, value))

    if perfdata:
        yield 0, '', perfdata