Refactoring: Changed all check parameters starting with an 'o' to the new rulespec...
[check_mk.git] / checks / diskstat.include
blob8abb2de5146aaf4099691e96397aeacd966b2edc
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# How diskstat services are discovered. "rule" uses the
# diskstat_inventory ruleset below; the other values are legacy modes.
diskstat_inventory_mode = "rule"  # "summary", "single", "legacy"

# Default parameters for discovered services. All levels are optional.
diskstat_default_levels = {
    # "read" :    (10, 20),   # MB/sec
    # "write" :   (20, 40),   # MB/sec
    # "average" : 15,         # min
    # "latency" : (10, 20),   # ms
    # "latency_perfdata" : True,
}

# Rule for controlling diskstat inventory more fine grained
diskstat_inventory = []
# Example
# diskstat_inventory = [
#   ( [], [ 'linux' ], ALL_HOSTS ), --> No diskstat on this host
#   ( [ 'summary', 'physical', 'lvm', 'vxvm' ], ALL_HOSTS ),
# ]

# Matches partition device names like "sda1" or "xvda2" - used to tell
# partitions apart from whole disks during discovery.
diskstat_diskless_pattern = re.compile("x?[shv]d[a-z]*[0-9]+")
def inventory_diskstat_generic(parsed):
    """Discover diskstat services according to the configured inventory mode.

    Returns a list of (item, default_params) tuples or None on empty data.
    """
    if not parsed:
        return  # no agent data - nothing to discover

    # New style: use rule based configuration, defaulting to summary mode
    if diskstat_inventory_mode == "rule":
        rule_hits = host_extra_conf(host_name(), diskstat_inventory)
        modes = rule_hits[0] if rule_hits else ["summary"]
    elif diskstat_inventory_mode == "single":
        modes = ["physical"]
    elif diskstat_inventory_mode == "summary":
        modes = ["summary"]
    else:
        modes = ["legacy"]

    discovered = []
    if "summary" in modes:
        discovered.append(("SUMMARY", "diskstat_default_levels"))
    if "legacy" in modes:
        discovered.append(("read", None))
        discovered.append(("write", None))

    for row in parsed:
        device = row[1]
        looks_like_partition = diskstat_diskless_pattern.match(device)

        if "physical" in modes and " " not in device and not looks_like_partition:
            discovered.append((device, "diskstat_default_levels"))

        if "lvm" in modes and device.startswith("LVM "):
            discovered.append((device, "diskstat_default_levels"))

        if "vxvm" in modes and device.startswith("VxVM "):
            discovered.append((device, "diskstat_default_levels"))

        if "diskless" in modes and looks_like_partition:
            # Sort of partitions with disks - typical in XEN virtual setups.
            # Eg. there are xvda1, xvda2, but no xvda...
            discovered.append((device, "diskstat_default_levels"))

    return discovered
def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    """Check one diskstat data line (list based API).

    this_time: timestamp used for all rate/average computations
    item:      service item ("SUMMARY" or a device name)
    params:    check parameters, e.g. {"read": (10, 20), "average": 15}
    line:      [node, name, read_ctr, write_ctr, read_ios, write_ios,
                time_ms, read_ql_ctr, write_ql_ctr] - trailing fields
                are optional
    mode:      'sectors' (counters count 512 byte sectors) or 'bytes'

    Returns a classic (status, infotext, perfdata) tuple.
    """
    average_range = params.get("average")
    if average_range == 0:
        average_range = None  # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    # On clusters the first field carries the node name
    if node is not None and node != "":
        infos.append("Node %s" % node)
    prediction_perf = []
    for what, ctr in [("read", line[2]), ("write", line[3])]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if isinstance(levels, tuple):
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        infos.append("%s/sec %s" % (get_bytes_human_readable(bytes_per_sec), what))
        perfdata.append((what, bytes_per_sec, warn, crit))
        dsname = what

        # compute average of the rate over ___ minutes
        if average_range is not None:
            avg = get_average(countername + ".avg", this_time, bytes_per_sec, average_range)
            dsname = what + ".avg"
            perfdata.append((dsname, avg))
            bytes_per_sec = avg  # levels below are applied to the averaged value

        # check levels
        state, text, extraperf = check_levels(
            bytes_per_sec, dsname, levels, unit="MB/s", scale=1048576, statemarkers=True)
        if text:
            infos.append(text)
        status = max(state, status)
        prediction_perf += extraperf

    # Add performance data for averaged IO
    # Reorder [read, read.avg, write, write.avg] into
    # [read, write, read.avg, write.avg]
    if average_range is not None:
        perfdata = [perfdata[0], perfdata[2], perfdata[1], perfdata[3]]

    # Process IOs when available
    # NOTE(review): line[4]/line[5] may still be strings at this point; the
    # >= 0 / > 0 comparisons rely on Python 2 mixed-type ordering - confirm.
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        if "read_ios" in params:
            warn, crit = params["read_ios"]
            if reads >= crit:
                infos.append('Read operations: %d (!!)' % (reads))
                status = 2
            elif reads >= warn:
                infos.append('Read operations: %d (!)' % (reads))
                status = max(status, 1)
        else:
            warn, crit = None, None
        if "write_ios" in params:
            warn, crit = params["write_ios"]
            if writes >= crit:
                infos.append('Write operations: %d (!!)' % (writes))
                status = 2
            elif writes >= warn:
                infos.append('Write operations: %d (!)' % (writes))
                status = max(status, 1)
        else:
            warn, crit = None, None
        ios = reads + writes
        # NOTE(review): countername still holds the "write" name from the
        # loop above, so the IO counter is stored under that key - confirm
        # this is intended.
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0  # no IOs measured -> avoid division by zero
        else:
            latency = timems_per_sec / ios_per_sec
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += "(!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += "(!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [("read", line[7]), ("write", line[8])]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if levels:
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            # scale the raw counter rate; factor presumably stems from the
            # Windows performance counter tick - TODO confirm units
            ql = qlx / 10000000.0
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels is not None:
                if ql >= crit:
                    status = 2
                    infos[-1] += "(!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += "(!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    perfdata += prediction_perf

    return (status, ", ".join(infos), perfdata)
def check_diskstat_generic(item, params, this_time, info, mode='sectors'):
    """Sum up the matching counter lines and hand them to check_diskstat_line.

    Either all physical disks are added up (item "SUMMARY") or all lines
    matching the item. A disk appearing more than once is not a bug - this
    happens e.g. on Windows clusters, even outside of Check_MK clusters.
    """
    # Items "read"/"write" stem from the legacy discovery mode
    if item in ('read', 'write'):
        return _check_diskstat_old(item, params, this_time, info)

    totals = [0] * 13
    num_matched = 0
    for row in info:
        if item == 'SUMMARY':
            if row[0] is not None:
                return 3, "summary mode not supported in a cluster"
            if ' ' in row[1]:
                continue  # skip non-physical disks
        elif row[1] != item:
            continue
        num_matched += 1
        totals = [acc + int(val) for acc, val in zip(totals, row[2:])]

    if not num_matched:
        return 3, "No matching disk found"

    return check_diskstat_line(this_time, item, params, [None, ''] + totals, mode)
def _check_diskstat_old(item, params, this_time, info):
    """Legacy diskstat check as used in <= 1.1.10.

    Kept for a while in order to stay compatible with old installations:
    one service per direction ("read"/"write"), summed over all disks.
    """
    # Column of the sectors-read / sectors-written counter
    column_of = {'read': 2, 'write': 3}
    if item not in column_of:
        return (3, "invalid item %s" % (item,))
    index = column_of[item]

    # sum up over all devices
    total = 0
    for row in info:
        if row[0] is not None:
            return 3, "read/write mode not supported in a cluster"
        if ' ' not in row[1]:
            total += int(row[index])

    per_sec = get_rate("diskstat." + item, this_time, total)
    mb_per_s = per_sec / 2048.0  # Diskstat output is in sectors a 512 Byte
    kb_per_s = per_sec / 2.0
    return (0, "%.1f MB/s" % mb_per_s, [(item, "%f" % kb_per_s)])
297 # .--Dict based API------------------------------------------------------.
298 # | ____ _ _ _ _ _ ____ ___ |
299 # | | _ \(_) ___| |_ | |__ __ _ ___ ___ __| | / \ | _ \_ _| |
300 # | | | | | |/ __| __| | '_ \ / _` / __|/ _ \/ _` | / _ \ | |_) | | |
301 # | | |_| | | (__| |_ | |_) | (_| \__ \ __/ (_| | / ___ \| __/| | |
302 # | |____/|_|\___|\__| |_.__/ \__,_|___/\___|\__,_| /_/ \_\_| |___| |
303 # | |
304 # +----------------------------------------------------------------------+
305 # | The newest generation of Disk IO checks parse all informatin info |
306 # | a dictionary, where counters are aleady resolved. Look at diskstat |
307 # | (the Linux diskstat check) for an example. |
308 # '----------------------------------------------------------------------'
def diskstat_select_disk(disks, item):
    """Pick the disk dict for *item*, or build the "SUMMARY" pseudo disk.

    disks: dict of device name -> dict of metric name -> value
    item:  device name or "SUMMARY"

    Returns the selected metric dict, or None if the item is unknown.

    In summary mode we add up the throughput values, but we average the
    other values over the summarized disks. Note: This is not very precise.
    Strictly spoken we would need to do the summarization directly in the
    parse function. But there we do not have information about the physical
    multipath devices and would add up the traffic of the paths with the
    traffic of the device itself....
    """
    if item == "SUMMARY":
        summarized = {
            "node": None,
            # Further keys (read_ios, write_ios, read_throughput,
            # write_throughput, utilization, latency, average_*,
            # queue_length, read_ql, write_ql, ...) are not set explicitly
            # because some devices may not provide all of them. They are
            # created below from the keys the disks actually have.
        }

        if disks:
            num_averaged = 0
            for device, disk in disks.items():
                # If all disks were skipped the summarized dict would have no
                # keys, so make sure at least all keys of this disk exist.
                for key in disk.keys():
                    if key != "node":
                        summarized.setdefault(key, 0.0)

                if device.startswith("LVM "):
                    continue  # skip LVM devices for summary

                # Deliberately include idle disks in the summary as well
                num_averaged += 1
                for key, value in disk.items():
                    if key != "node":
                        summarized[key] += value

            if num_averaged:
                for key, value in summarized.items():
                    # Throughput style metrics stay summed, the rest is averaged
                    if key.startswith("ave") or key in ("utilization", "latency", "queue_length"):
                        summarized[key] /= num_averaged

        return summarized

    elif item not in disks:
        return None

    else:
        return disks[item]
375 # New version for this diskstat checks that use the new dict
376 # format. The first one is "diskstat" - the Linux version of
377 # this check. Look there for examples of the format of the
378 # dictionary "disks". Example:
379 # disks = { "sda" : {
380 # 'node' : None,
381 # 'average_read_request_size' : 0.0,
382 # 'average_read_wait' : 0.0,
383 # 'average_request_size' : 40569.90476190476,
384 # 'average_wait' : 0.761904761904762,
385 # 'average_write_request_size' : 40569.90476190476,
386 # 'average_write_wait' : 0.0007619047619047619,
387 # 'read_ios' : 0.0,
388 # 'read_throughput' : 0.0,
389 # 'latency' : 0.00038095238095238096,
390 # 'utilization' : 0.0006153846153846154,
391 # 'write_ios' : 1.6153846153846154,
392 # 'write_throughput' : 65536.0,
393 # 'queue_length' : 0.0,
394 # 'read_ql' : 0.0,
395 # 'write_ql' : 0.0,
396 # }}
def check_diskstat_dict(item, params, disks):
    """Check disk IO using the dict based API (generator check function).

    item:   "SUMMARY" or a device name
    params: check parameters (levels per metric; "average" is in seconds)
    disks:  dict of device name -> dict of already resolved rates, see the
            example above this function

    Yields (status, infotext[, perfdata]) subresults.
    """
    # Take care of previously discovered services
    if item in ("read", "write"):
        yield 3, "Sorry, the new version of this check does not " \
            "support one service for read and one for write anymore."
        return

    this_time = time.time()
    disk = diskstat_select_disk(disks, item)
    if not disk:
        return

    # Averaging
    # Note: this check uses a simple method of averaging: As soon as averaging
    # is turned on the actual metrics are *replaced* by the averaged ones. No
    # duplication of performance data or check output here. This is because we
    # have so many metrics...
    prefix = ""
    averaging = params.get("average")  # in seconds here!
    if averaging:
        avg_disk = {}  # Do not modify our arguments!!
        for key, value in disk.items():
            if isinstance(value, (int, float)):
                avg_disk[key] = get_average("diskstat.%s.%s.avg" % (item, key), this_time, value,
                                            averaging / 60.0)
            else:
                avg_disk[key] = value
        disk = avg_disk
        prefix = "%s average: " % get_age_human_readable(averaging)

    # Utilization
    if "utilization" in disk:
        util = disk.pop("utilization")
        yield check_levels(
            util,
            "disk_utilization",
            params.get("utilization"),
            unit="%",
            scale=0.01,
            statemarkers=False,
            infoname=prefix + "Utilization")

    # Throughput
    for what in "read", "write":
        if what + "_throughput" in disk:
            throughput = disk.pop(what + "_throughput")
            state, infotext, extraperf = check_levels(
                throughput,
                "disk_" + what + "_throughput",
                params.get(what),
                unit="MB/s",
                scale=1048576,
                statemarkers=False,
                human_readable_func=lambda x: get_bytes_human_readable(x) + '/s',
                infoname=what.title())
            yield state, infotext, extraperf

    # Average wait from end to end
    for what in ["wait", "read_wait", "write_wait"]:
        if "average_" + what in disk:
            wait = disk.pop("average_" + what)
            yield check_levels(
                wait,
                "disk_average_" + what,
                params.get(what),
                unit="ms",
                scale=0.001,
                statemarkers=False,
                infoname="Average %s" % what.title().replace("_", " "))

    # Average disk latency
    if "latency" in disk:
        latency = disk.pop("latency")
        yield check_levels(
            latency,
            "disk_latency",
            params.get("latency"),
            unit="ms",
            scale=0.001,
            statemarkers=False,
            infoname='Latency')

    # Queue lengths
    for what, plugin_text in [
        ("queue_length", "Queue Length"),
        ("read_ql", "Read Queue Length"),
        ("write_ql", "Write Queue Length"),
    ]:
        if what in disk:
            ql = disk.pop(what)
            yield check_levels(
                ql,
                "disk_" + what,
                params.get(what),
                statemarkers=False,
                infoname="Average %s" % plugin_text)

    # I/O operations
    for what in "read", "write":
        if what + "_ios" in disk:
            ios = disk.pop(what + "_ios")
            yield check_levels(
                ios,
                "disk_" + what + "_ios",
                params.get(what + "_ios"),
                unit="1/s",
                statemarkers=False,
                infoname="%s operations" % what.title())

    # All the other metrics are currently not output in the plugin output - simply because
    # of their amount. They are present as performance data and will shown in graphs.

    # Send everything as performance data now. Sort keys alphabetically
    perfdata = []
    for key in sorted(disk.keys()):
        value = disk[key]
        if isinstance(value, (int, float)):
            # Currently the levels are not shown in the perfdata
            perfdata.append(("disk_" + key, value))

    if perfdata:
        yield 0, '', perfdata