Refactoring: Changed all check parameters starting with an 'o' to the new rulespec...
[check_mk.git] / checks / diskstat.include
blob8abb2de5146aaf4099691e96397aeacd966b2edc
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
# How diskstat services are discovered. "rule" uses the
# diskstat_inventory ruleset below; the other values are legacy modes.
diskstat_inventory_mode = "rule"  # "summary", "single", "legacy"

# Default parameters for discovered services. All levels are optional.
diskstat_default_levels = {
    # "read" :    (10, 20),   # MB/sec
    # "write" :   (20, 40),   # MB/sec
    # "average" : 15,         # min
    # "latency" : (10, 20),   # ms
    # "latency_perfdata" : True,
}

# Rule for controlling diskstat inventory more fine grained
diskstat_inventory = []
# Example
# diskstat_inventory = [
#   ( [], [ 'linux' ], ALL_HOSTS ), --> No diskstat on this host
#   ( [ 'summary', 'physical', 'lvm', 'vxvm' ], ALL_HOSTS ),
# ]

# Matches partition device names like "sda1" or "xvda2" - used to tell
# partitions apart from whole disks during discovery.
diskstat_diskless_pattern = re.compile("x?[shv]d[a-z]*[0-9]+")
def inventory_diskstat_generic(parsed):
    """Discover diskstat services according to the configured inventory mode.

    Returns a list of (item, default_params) tuples or None on empty data.
    """
    if not parsed:
        return  # no agent data - nothing to discover

    # New style: use rule based configuration, defaulting to summary mode
    if diskstat_inventory_mode == "rule":
        rule_hits = host_extra_conf(host_name(), diskstat_inventory)
        modes = rule_hits[0] if rule_hits else ["summary"]
    elif diskstat_inventory_mode == "single":
        modes = ["physical"]
    elif diskstat_inventory_mode == "summary":
        modes = ["summary"]
    else:
        modes = ["legacy"]

    discovered = []
    if "summary" in modes:
        discovered.append(("SUMMARY", "diskstat_default_levels"))
    if "legacy" in modes:
        discovered.append(("read", None))
        discovered.append(("write", None))

    for row in parsed:
        device = row[1]
        looks_like_partition = diskstat_diskless_pattern.match(device)

        if "physical" in modes and " " not in device and not looks_like_partition:
            discovered.append((device, "diskstat_default_levels"))

        if "lvm" in modes and device.startswith("LVM "):
            discovered.append((device, "diskstat_default_levels"))

        if "vxvm" in modes and device.startswith("VxVM "):
            discovered.append((device, "diskstat_default_levels"))

        if "diskless" in modes and looks_like_partition:
            # Sort of partitions with disks - typical in XEN virtual setups.
            # Eg. there are xvda1, xvda2, but no xvda...
            discovered.append((device, "diskstat_default_levels"))

    return discovered
def check_diskstat_line(this_time, item, params, line, mode='sectors'):
    """Check one diskstat data line (list based API).

    this_time: timestamp used for all rate/average computations
    item:      service item ("SUMMARY" or a device name)
    params:    check parameters, e.g. {"read": (10, 20), "average": 15}
    line:      [node, name, read_ctr, write_ctr, read_ios, write_ios,
                time_ms, read_ql_ctr, write_ql_ctr] - trailing fields
                are optional
    mode:      'sectors' (counters count 512 byte sectors) or 'bytes'

    Returns a classic (status, infotext, perfdata) tuple.
    """
    average_range = params.get("average")
    if average_range == 0:
        average_range = None  # disable averaging when 0 is set

    perfdata = []
    infos = []
    status = 0
    node = line[0]
    # On clusters the first field carries the node name
    if node is not None and node != "":
        infos.append("Node %s" % node)
    prediction_perf = []
    for what, ctr in [("read", line[2]), ("write", line[3])]:
        if node:
            countername = "diskstat.%s.%s.%s" % (node, item, what)
        else:
            countername = "diskstat.%s.%s" % (item, what)

        # unpack levels now, need also for perfdata
        levels = params.get(what)
        if isinstance(levels, tuple):
            warn, crit = levels
        else:
            warn, crit = None, None

        per_sec = get_rate(countername, this_time, int(ctr))
        if mode == 'sectors':
            # compute IO rate in bytes/sec
            bytes_per_sec = per_sec * 512
        elif mode == 'bytes':
            bytes_per_sec = per_sec

        infos.append("%s/sec %s" % (get_bytes_human_readable(bytes_per_sec), what))
        perfdata.append((what, bytes_per_sec, warn, crit))
        dsname = what

        # compute average of the rate over ___ minutes
        if average_range is not None:
            avg = get_average(countername + ".avg", this_time, bytes_per_sec, average_range)
            dsname = what + ".avg"
            perfdata.append((dsname, avg))
            bytes_per_sec = avg  # levels below are applied to the averaged value

        # check levels
        state, text, extraperf = check_levels(
            bytes_per_sec, dsname, levels, unit="MB/s", scale=1048576, statemarkers=True)
        if text:
            infos.append(text)
        status = max(state, status)
        prediction_perf += extraperf

    # Add performance data for averaged IO
    # Reorder [read, read.avg, write, write.avg] into
    # [read, write, read.avg, write.avg]
    if average_range is not None:
        perfdata = [perfdata[0], perfdata[2], perfdata[1], perfdata[3]]

    # Process IOs when available
    # NOTE(review): line[4]/line[5] may still be strings at this point; the
    # >= 0 / > 0 comparisons rely on Python 2 mixed-type ordering - confirm.
    ios_per_sec = None
    if len(line) >= 6 and line[4] >= 0 and line[5] > 0:
        reads, writes = map(int, line[4:6])
        if "read_ios" in params:
            warn, crit = params["read_ios"]
            if reads >= crit:
                infos.append('Read operations: %d (!!)' % (reads))
                status = 2
            elif reads >= warn:
                infos.append('Read operations: %d (!)' % (reads))
                status = max(status, 1)
        else:
            warn, crit = None, None
        if "write_ios" in params:
            warn, crit = params["write_ios"]
            if writes >= crit:
                infos.append('Write operations: %d (!!)' % (writes))
                status = 2
            elif writes >= warn:
                infos.append('Write operations: %d (!)' % (writes))
                status = max(status, 1)
        else:
            warn, crit = None, None
        ios = reads + writes
        # NOTE(review): countername still holds the "write" name from the
        # loop above, so the IO counter is stored under that key - confirm
        # this is intended.
        ios_per_sec = get_rate(countername + ".ios", this_time, ios)
        infos.append("IOs: %.2f/sec" % ios_per_sec)

        if params.get("latency_perfdata"):
            perfdata.append(("ios", ios_per_sec))

    # Do Latency computation if this information is available:
    if len(line) >= 7 and line[6] >= 0:
        timems = int(line[6])
        timems_per_sec = get_rate(countername + ".time", this_time, timems)
        if not ios_per_sec:
            latency = 0.0  # no IOs measured -> avoid division by zero
        else:
            latency = timems_per_sec / ios_per_sec
        infos.append("Latency: %.2fms" % latency)
        if "latency" in params:
            warn, crit = params["latency"]
            if latency >= crit:
                status = 2
                infos[-1] += "(!!)"
            elif latency >= warn:
                status = max(status, 1)
                infos[-1] += "(!)"
        else:
            warn, crit = None, None

        if params.get("latency_perfdata"):
            perfdata.append(("latency", latency, warn, crit))

    # Queue Lengths (currently only Windows). Windows uses counters here.
    # I have not understood, why....
    if len(line) >= 9:
        for what, ctr in [("read", line[7]), ("write", line[8])]:
            countername = "diskstat.%s.ql.%s" % (item, what)
            levels = params.get(what + "_ql")
            if levels:
                warn, crit = levels
            else:
                warn, crit = None, None

            qlx = get_rate(countername, this_time, int(ctr))
            # scale the raw counter rate; factor presumably stems from the
            # Windows performance counter tick - TODO confirm units
            ql = qlx / 10000000.0
            infos.append(what.title() + " Queue: %.2f" % ql)

            # check levels
            if levels is not None:
                if ql >= crit:
                    status = 2
                    infos[-1] += "(!!)"
                elif ql >= warn:
                    status = max(status, 1)
                    infos[-1] += "(!)"

            if params.get("ql_perfdata"):
                perfdata.append((what + "_ql", ql))

    perfdata += prediction_perf

    return (status, ", ".join(infos), perfdata)
def check_diskstat_generic(item, params, this_time, info, mode='sectors'):
    """Sum up the matching counter lines and hand them to check_diskstat_line.

    Either all physical disks are added up (item "SUMMARY") or all lines
    matching the item. A disk appearing more than once is not a bug - this
    happens e.g. on Windows clusters, even outside of Check_MK clusters.
    """
    # Items "read"/"write" stem from the legacy discovery mode
    if item in ('read', 'write'):
        return _check_diskstat_old(item, params, this_time, info)

    totals = [0] * 13
    num_matched = 0
    for row in info:
        if item == 'SUMMARY':
            if row[0] is not None:
                return 3, "summary mode not supported in a cluster"
            if ' ' in row[1]:
                continue  # skip non-physical disks
        elif row[1] != item:
            continue
        num_matched += 1
        totals = [acc + int(val) for acc, val in zip(totals, row[2:])]

    if not num_matched:
        return 3, "No matching disk found"

    return check_diskstat_line(this_time, item, params, [None, ''] + totals, mode)
def _check_diskstat_old(item, params, this_time, info):
    """Legacy diskstat check as used in <= 1.1.10.

    Kept for a while in order to stay compatible with old installations:
    one service per direction ("read"/"write"), summed over all disks.
    """
    # Column of the sectors-read / sectors-written counter
    column_of = {'read': 2, 'write': 3}
    if item not in column_of:
        return (3, "invalid item %s" % (item,))
    index = column_of[item]

    # sum up over all devices
    total = 0
    for row in info:
        if row[0] is not None:
            return 3, "read/write mode not supported in a cluster"
        if ' ' not in row[1]:
            total += int(row[index])

    per_sec = get_rate("diskstat." + item, this_time, total)
    mb_per_s = per_sec / 2048.0  # Diskstat output is in sectors a 512 Byte
    kb_per_s = per_sec / 2.0
    return (0, "%.1f MB/s" % mb_per_s, [(item, "%f" % kb_per_s)])
297 # .--Dict based API------------------------------------------------------.
298 # | ____ _ _ _ _ _ ____ ___ |
299 # | | _ \(_) ___| |_ | |__ __ _ ___ ___ __| | / \ | _ \_ _| |
300 # | | | | | |/ __| __| | '_ \ / _` / __|/ _ \/ _` | / _ \ | |_) | | |
301 # | | |_| | | (__| |_ | |_) | (_| \__ \ __/ (_| | / ___ \| __/| | |
302 # | |____/|_|\___|\__| |_.__/ \__,_|___/\___|\__,_| /_/ \_\_| |___| |
303 # | |
304 # +----------------------------------------------------------------------+
305 # | The newest generation of Disk IO checks parse all informatin info |
306 # | a dictionary, where counters are aleady resolved. Look at diskstat |
307 # | (the Linux diskstat check) for an example. |
308 # '----------------------------------------------------------------------'
def diskstat_select_disk(disks, item):
    """Pick the disk dict for *item*, or build the "SUMMARY" pseudo disk.

    disks: dict of device name -> dict of metric name -> value
    item:  device name or "SUMMARY"

    Returns the selected metric dict, or None if the item is unknown.

    In summary mode we add up the throughput values, but we average the
    other values over the summarized disks. Note: This is not very precise.
    Strictly spoken we would need to do the summarization directly in the
    parse function. But there we do not have information about the physical
    multipath devices and would add up the traffic of the paths with the
    traffic of the device itself....
    """
    if item == "SUMMARY":
        summarized = {
            "node": None,
            # Further keys (read_ios, write_ios, read_throughput,
            # write_throughput, utilization, latency, average_*,
            # queue_length, read_ql, write_ql, ...) are not set explicitly
            # because some devices may not provide all of them. They are
            # created below from the keys the disks actually have.
        }

        if disks:
            num_averaged = 0
            for device, disk in disks.items():
                # If all disks were skipped the summarized dict would have no
                # keys, so make sure at least all keys of this disk exist.
                for key in disk.keys():
                    if key != "node":
                        summarized.setdefault(key, 0.0)

                if device.startswith("LVM "):
                    continue  # skip LVM devices for summary

                # Deliberately include idle disks in the summary as well
                num_averaged += 1
                for key, value in disk.items():
                    if key != "node":
                        summarized[key] += value

            if num_averaged:
                for key, value in summarized.items():
                    # Throughput style metrics stay summed, the rest is averaged
                    if key.startswith("ave") or key in ("utilization", "latency", "queue_length"):
                        summarized[key] /= num_averaged

        return summarized

    elif item not in disks:
        return None

    else:
        return disks[item]
375 # New version for this diskstat checks that use the new dict
376 # format. The first one is "diskstat" - the Linux version of
377 # this check. Look there for examples of the format of the
378 # dictionary "disks". Example:
379 # disks = { "sda" : {
380 # 'node' : None,
381 # 'average_read_request_size' : 0.0,
382 # 'average_read_wait' : 0.0,
383 # 'average_request_size' : 40569.90476190476,
384 # 'average_wait' : 0.761904761904762,
385 # 'average_write_request_size' : 40569.90476190476,
386 # 'average_write_wait' : 0.0007619047619047619,
387 # 'read_ios' : 0.0,
388 # 'read_throughput' : 0.0,
389 # 'latency' : 0.00038095238095238096,
390 # 'utilization' : 0.0006153846153846154,
391 # 'write_ios' : 1.6153846153846154,
392 # 'write_throughput' : 65536.0,
393 # 'queue_length' : 0.0,
394 # 'read_ql' : 0.0,
395 # 'write_ql' : 0.0,
396 # }}
def check_diskstat_dict(item, params, disks):
    """Check disk IO using the dict based API (generator check function).

    item:   "SUMMARY" or a device name
    params: check parameters (levels per metric; "average" is in seconds)
    disks:  dict of device name -> dict of already resolved rates, see the
            example above this function

    Yields (status, infotext[, perfdata]) subresults.
    """
    # Take care of previously discovered services
    if item in ("read", "write"):
        yield 3, "Sorry, the new version of this check does not " \
            "support one service for read and one for write anymore."
        return

    this_time = time.time()
    disk = diskstat_select_disk(disks, item)
    if not disk:
        return

    # Averaging
    # Note: this check uses a simple method of averaging: As soon as averaging
    # is turned on the actual metrics are *replaced* by the averaged ones. No
    # duplication of performance data or check output here. This is because we
    # have so many metrics...
    prefix = ""
    averaging = params.get("average")  # in seconds here!
    if averaging:
        avg_disk = {}  # Do not modify our arguments!!
        for key, value in disk.items():
            if isinstance(value, (int, float)):
                avg_disk[key] = get_average("diskstat.%s.%s.avg" % (item, key), this_time, value,
                                            averaging / 60.0)
            else:
                avg_disk[key] = value
        disk = avg_disk
        prefix = "%s average: " % get_age_human_readable(averaging)

    # Utilization
    if "utilization" in disk:
        util = disk.pop("utilization")
        yield check_levels(
            util,
            "disk_utilization",
            params.get("utilization"),
            unit="%",
            scale=0.01,
            statemarkers=False,
            infoname=prefix + "Utilization")

    # Throughput
    for what in "read", "write":
        if what + "_throughput" in disk:
            throughput = disk.pop(what + "_throughput")
            state, infotext, extraperf = check_levels(
                throughput,
                "disk_" + what + "_throughput",
                params.get(what),
                unit="MB/s",
                scale=1048576,
                statemarkers=False,
                human_readable_func=lambda x: get_bytes_human_readable(x) + '/s',
                infoname=what.title())
            yield state, infotext, extraperf

    # Average wait from end to end
    for what in ["wait", "read_wait", "write_wait"]:
        if "average_" + what in disk:
            wait = disk.pop("average_" + what)
            yield check_levels(
                wait,
                "disk_average_" + what,
                params.get(what),
                unit="ms",
                scale=0.001,
                statemarkers=False,
                infoname="Average %s" % what.title().replace("_", " "))

    # Average disk latency
    if "latency" in disk:
        latency = disk.pop("latency")
        yield check_levels(
            latency,
            "disk_latency",
            params.get("latency"),
            unit="ms",
            scale=0.001,
            statemarkers=False,
            infoname='Latency')

    # Queue lengths
    for what, plugin_text in [
        ("queue_length", "Queue Length"),
        ("read_ql", "Read Queue Length"),
        ("write_ql", "Write Queue Length"),
    ]:
        if what in disk:
            ql = disk.pop(what)
            yield check_levels(
                ql,
                "disk_" + what,
                params.get(what),
                statemarkers=False,
                infoname="Average %s" % plugin_text)

    # I/O operations
    for what in "read", "write":
        if what + "_ios" in disk:
            ios = disk.pop(what + "_ios")
            yield check_levels(
                ios,
                "disk_" + what + "_ios",
                params.get(what + "_ios"),
                unit="1/s",
                statemarkers=False,
                infoname="%s operations" % what.title())

    # All the other metrics are currently not output in the plugin output - simply because
    # of their amount. They are present as performance data and will shown in graphs.

    # Send everything as performance data now. Sort keys alphabetically
    perfdata = []
    for key in sorted(disk.keys()):
        value = disk[key]
        if isinstance(value, (int, float)):
            # Currently the levels are not shown in the perfdata
            perfdata.append(("disk_" + key, value))

    if perfdata:
        yield 0, '', perfdata