checks/diskstat

   1 #!/usr/bin/python
   2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
   3 # +------------------------------------------------------------------+
   4 # |             ____ _               _        __  __ _  __           |
   5 # |            / ___| |__   ___  ___| | __   |  \/  | |/ /           |
   6 # |           | |   | '_ \ / _ \/ __| |/ /   | |\/| | ' /            |
   7 # |           | |___| | | |  __/ (__|   <    | |  | | . \            |
   8 # |            \____|_| |_|\___|\___|_|\_\___|_|  |_|_|\_\           |
   9 # |                                                                  |
  10 # | Copyright Mathias Kettner 2014             mk@mathias-kettner.de |
  11 # +------------------------------------------------------------------+
  12 #
  13 # This file is part of Check_MK.
  14 # The official homepage is at http://mathias-kettner.de/check_mk.
  15 #
  16 # check_mk is free software;  you can redistribute it and/or modify it
  17 # under the  terms of the  GNU General Public License  as published by
  18 # the Free Software Foundation in version 2.  check_mk is  distributed
  19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY;  with-
  20 # out even the implied warranty of  MERCHANTABILITY  or  FITNESS FOR A
  21 # PARTICULAR PURPOSE. See the  GNU General Public License for more de-
  22 # tails. You should have  received  a copy of the  GNU  General Public
  23 # License along with GNU Make; see the file  COPYING.  If  not,  write
  24 # to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
  25 # Boston, MA 02110-1301 USA.
  26
  27 # <<<diskstat>>>
  28 # 1300264105
  29 #    8       0 sda 691860 951191 13559915 491748 234686 197346 3359512 94944 0 56844 586312
  30 #    8      32 sdb 791860 91191 23589915 491748 234686 197346 3359512 94944 0 56844 586312
  31
# Newer agents also output dm-* and Veritas devices and, if available,
# the following additional information for name rewriting:
  34
  35 # <<<diskstat>>>
  36 # 1338931242
  37 #    8       0 sda 6142 327 219612 2244 3190 6233 74075 8206 0 6523 10446
  38 #  253       0 dm-0 4579 0 181754 2343 9249 0 73960 259491 0 1208 261833
  39 #  253       1 dm-1 342 0 2736 47 3 0 11796464 5016 0 5063 5063
  40 #  253       2 dm-2 160 0 1274 27 11 0 56 3 0 27 30
  41 #    8      16 sdb 464 858 7717 336 1033 0 311454 3899 0 3007 4231
  42 #    8      32 sdc 855 13352 106777 1172 915 0 154467 2798 0 3012 3967
  43 #    8      48 sdd 1217 861 109802 1646 118 0 56151 1775 0 2736 3420
  44 #    8      80 sdf 359 1244 58323 792 66 0 4793 388 0 765 1178
  45 #    8      64 sde 310 1242 6964 268 118 0 56151 1607 0 1307 1872
  46 #    8      96 sdg 1393 1242 314835 3759 129 0 56172 1867 0 4027 5619
  47 #  199   27000 VxVM27000 131 0 990 61 11 0 21 29 0 89 90
  48 #  199   27001 VxVM27001 0 0 0 0 0 0 0 0 0 0 0
  49 # [dmsetup_info]
  50 # vg_zwei-lv_home 253:2 vg_zwei lv_home
  51 # vg_zwei-lv_swap 253:1 vg_zwei lv_swap
  52 # vg_zwei-lv_root 253:0 vg_zwei lv_root
  53 # [vx_dsk]
  54 # c7 6978 /dev/vx/dsk/datadg/lalavol
  55 # c7 6979 /dev/vx/dsk/datadg/oravol
  56
  57 # output may have zeros appended
  58 #
  59 # 8 0 sda 111918756 929875 3960367050 349083041 20142495 1149711 1021234448 851284769 0 233177192 1197549009 0 0 0 0
  60 # 8 1 sda1 226 0 27481 3388 381 3 31472 35862 0 8123 39260 0 0 0 0
  61 # 8 2 sda2 111918500 929875 3960337473 349079568 20142114 1149708 1021202976 851248906 0 233176504 1197492420 0 0 0 0
  62 # 253 0 dm-0 883953 0 92124097 10287533 108572 0 2251672 809814 0 7545567 11097424 0 0 0 0
  63 # 253 1 dm-1 21046 0 172072 157766 164020 0 1312160 29292970 0 124138 29451007 0 0 0 0
  64 # 253 2 dm-2 750714 0 19747073 7702216 1445987 0 36811608 9817313 0 7159271 17520030 0 0 0 0
  65
  66 # Fields in /proc/diskstats
  67 #  Index 0 -- major number
  68 #  Index 1 -- minor number
  69 #  Index 2 -- device name                        --> used by check
  70 #  Index 3 -- # of reads issued
  71 #  Index 4 -- # of reads merged
  72 #  Index 5 -- # of sectors read (a 512 Byte)     --> used by check
  73 #  Index 6 -- # of milliseconds spent reading
  74 #  Index 7 -- # of writes completed
  75 #  Index 8 -- # of writes merged
  76 #  Index 9 -- # of sectors written (a 512 Byte)  --> used by check
  77 #  Index 10 -- # of milliseconds spent writing
  78 #  Index 11 -- # of I/Os currently in progress
  79 #  Index 12 -- # of milliseconds spent doing I/Os
  80 #  Index 13 -- weighted # of milliseconds spent doing I/Os
  81
  82 # Convert information to generic format also generated
  83 # by winperf_phydisk
  84 # [ now, [( disk, readctr, writectr ), ... ]]
  85 # where counters are in sectors (512 bytes)
  86
  87 # Parse /proc/diskstat and additional information into a nice canonical
  88 # dictionary of the form:
  89 # disks = {
  90 #     "hda" : {
  91 #       'average_read_request_size'  : 0.0,
  92 #       'average_read_wait'          : 0.0,
  93 #       'average_request_size'       : 40569.90476190476,
  94 #       'average_wait'               : 0.761904761904762,
  95 #       'average_write_request_size' : 40569.90476190476,
  96 #       'average_write_wait'         : 0.0007619047619047619,
  97 #       'node'                       : None,
  98 #       'read_ios'                   : 0.0,
  99 #       'read_throughput'            : 0.0,
 100 #       'latency'                    : 0.00038095238095238096,
 101 #       'utilization'                : 0.0006153846153846154,
 102 #       'write_ios'                  : 1.6153846153846154,
 103 #       'write_throughput'           : 65536.0,
 104 #     },
 105 #     "LVM foobar" : {
 106 #         ...
 107 #     }
 108 # }
 109 #
# parse_diskstat() returns only that dictionary; the timestamp of the
# section is used for the rate computation but is not handed back:
# parsed = disks
 112
 113
 114 # Consideration for debugging purposes:
 115 # Due to check_info['diskstat']['extra_sections']: ["multipath"])
 116 # each info list is prefixed with '<node_name>'.
 117 def parse_diskstat(info):
 118     timestamp_str, proc_diskstat, name_info = diskstat_extract_name_info(info)
 119     # limit diskstat to first elements before actual parsing
 120     proc_diskstat = [ds[:15] for ds in proc_diskstat]
 121     timestamp = int(timestamp_str)
 122
 123     # Here we discover real partitions and exclude them:
 124     # Sort of partitions with disks - typical in XEN virtual setups.
 125     # Eg. there are xvda1, xvda2, but no xvda...
 126     device_names = [line[3] for line in proc_diskstat]
 127     real_partitions = {
 128         device_name for device_name in device_names
 129         if diskstat_diskless_pattern.match(device_name) and re.sub('[0-9]+$', '', device_name)
 130     }
 131     disks = {}
 132     for line in proc_diskstat:
 133         if line[3] in real_partitions:
 134             continue
 135
 136         node_name, major, minor, device, \
 137             read_ios, _read_merges, read_sectors, read_ticks, \
 138             write_ios, _write_merges, write_sectors, write_ticks, \
 139             ios_in_prog, total_ticks, _rq_ticks = line
 140
 141         if (node_name, int(major), int(minor)) in name_info:
 142             device = name_info[(node_name, int(major), int(minor))]
 143
 144         counter_base = "diskstat.%s." % device
 145
 146         # Some of the following computations were learned from Munin. Thanks
 147         # to that project!
 148
 149         # There are 1000 ticks per second
 150         # Note: we use onwrap=0.0 here because the parse function is being used also during
 151         # service discovery. If we raise a counter wrap exception here, then nothing will
 152         # be inventorized.
 153         read_ticks_rate = get_rate(
 154             counter_base + "read_ticks", timestamp, int(read_ticks), onwrap=0.0)
 155         write_ticks_rate = get_rate(
 156             counter_base + "write_ticks", timestamp, int(write_ticks), onwrap=0.0)
 157         total_ticks_rate = get_rate(
 158             counter_base + "total_ticks", timestamp, int(total_ticks), onwrap=0.0)
 159         read_ios_rate = get_rate(counter_base + "read_ios", timestamp, int(read_ios), onwrap=0.0)
 160         write_ios_rate = get_rate(counter_base + "write_ios", timestamp, int(write_ios), onwrap=0.0)
 161         total_ios_rate = read_ios_rate + write_ios_rate
 162         utilization = total_ticks_rate / 1000  # not percent, but 0...1
 163         read_bytes_rate = get_rate(
 164             counter_base + "read_sectors", timestamp, int(read_sectors), onwrap=0.0) * 512
 165         write_bytes_rate = get_rate(
 166             counter_base + "write_sectors", timestamp, int(write_sectors), onwrap=0.0) * 512
 167         total_bytes_rate = read_bytes_rate + write_bytes_rate
 168
 169         # The service time is computed from the utilization. If we work
 170         # e.g. 0.34 (34%) of the time and we can do 17 operations in that
 171         # time then the average latency is time * 0.34 / 17
 172         if total_ios_rate:
 173             latency = utilization / total_ios_rate
 174             average_wait = (read_ticks_rate + write_ticks_rate) / total_ios_rate / 1000.0
 175             average_request_size = total_bytes_rate / total_ios_rate
 176         else:
 177             latency = 0.0
 178             average_wait = 0.0
 179             average_request_size = 0.0
 180
 181         # Average read and write rate, from end to end, including queuing, etc.
 182         # and average size of one request
 183         if read_ticks_rate and read_ios_rate > 0:
 184             average_read_wait = read_ticks_rate / read_ios_rate / 1000.0
 185             average_read_size = read_bytes_rate / read_ios_rate
 186         else:
 187             average_read_wait = 0.0
 188             average_read_size = 0.0
 189
 190         if write_ticks_rate and write_ios_rate > 0:
 191             average_write_wait = write_ticks_rate / write_ios_rate / 1000.0
 192             average_write_size = write_bytes_rate / write_ios_rate
 193         else:
 194             average_write_wait = 0.0
 195             average_write_size = 0.0
 196
 197         disks[device] = {
 198             "node": node_name,
 199             "read_ios": read_ios_rate,
 200             "write_ios": write_ios_rate,
 201             "read_throughput": read_bytes_rate,
 202             "write_throughput": write_bytes_rate,
 203             "utilization": utilization,
 204             "latency": latency,
 205             "average_request_size": average_request_size,
 206             "average_wait": average_wait,
 207             "average_read_wait": average_read_wait,
 208             "average_read_request_size": average_read_size,
 209             "average_write_wait": average_write_wait,
 210             "average_write_request_size": average_write_size,
 211             "queue_length": int(ios_in_prog),
 212         }
 213
 214     return disks
 215
 216
 217 ### #  Index 0 -- major number
 218 ### #  Index 1 -- minor number
 219 ### #  Index 2 -- device name                        --> used by check
 220 ### #  Index 3 -- # of reads issued
 221 ### #  Index 4 -- # of reads merged
 222 ### #  Index 5 -- # of sectors read (a 512 Byte)     --> used by check
 223 ### #  Index 6 -- # of milliseconds spent reading
 224 ### #  Index 7 -- # of writes completed
 225 ### #  Index 8 -- # of writes merged
 226 ### #  Index 9 -- # of sectors written (a 512 Byte)  --> used by check
 227 ### #  Index 10 -- # of milliseconds spent writing
 228 ### #  Index 11 -- # of I/Os currently in progress
 229 ### #  Index 12 -- # of milliseconds spent doing I/Os
 230 ### #  Index 13 -- weighted # of milliseconds spent doing I/Os
 231 ###     for line in proc_diskstat:
 232 ###         node = line[0]
 233 ###
 234 ###
 235 ###
 236 ###     # For multipath devices use the entries for dm-?? and rename
 237 ###     # them with their multipath UUID/alias - and drop the according
 238 ###     # sdXY that belong to the paths.
 239 ###     multipath_name_info = {}
 240 ###     skipped_devices = set([])
 241 ###
 242 ###     # The generic function takes the following values per line:
 243 ###     #  0: None or node name
 244 ###     #  1: devname
 245 ###     #  2: read bytes counter
 246 ###     #  3: write bytes counter
 247 ###     # Optional ones:
 248 ###     #  4: number of reads
 249 ###     #  5: number of writes
 250 ###     #  6: timems
 251 ###     #  7: read queue length *counters*
 252 ###     #  8: write queue length *counters*
 253 ###     rewritten = [
 254 ###         ( l[0], # node name or None
 255 ###         diskstat_rewrite_device(name_info, multipath_name_info, l[0:4]),
 256 ###         int(l[6]),
 257 ###         int(l[10]),
 258 ###         int(l[4]),
 259 ###         int(l[8]),
 260 ###         # int(l[13])
 261 ###         ) for l in info[1:] if len(l) >= 14
 262 ###     ]
 263 ###
 264 ###     # Remove device mapper devices without a translated name
 265 ###     return [ line for line in rewritten
 266 ###              if not line[1].startswith("dm-")
 267 ###                 and not line[1] in skipped_devices ]
 268
 269
# Extract additional information from the diskstat section about
# LVM and DM devices. This information is encapsulated
# in the [dmsetup_info] and [vx_dsk] subsections. Example for
# name_info:
 274 # {
 275 #     (None, 253, 0): 'LVM vg00-rootvol',
 276 #     (None, 253, 1): 'LVM vg00-tmpvol',
 277 #     (None, 253, 2): 'LVM vg00-varvol',
 278 #     (None, 253, 3): 'LVM vg00-optvol',
 279 #     (None, 253, 4): 'LVM vg00-usrvol',
 280 #     (None, 253, 5): 'LVM vg00-swapvol',
 281 #     (None, 253, 6): 'LVM vgappl-applvol',
 282 # }
 283 def diskstat_extract_name_info(info):
 284     name_info = {}  # dict from (node, major, minor) to itemname
 285     timestamp = None
 286
 287     info_plain = []
 288     phase = 'info'
 289     node = None
 290     for line in info:
 291         if node is None:
 292             node = line[0]
 293
 294         if line[1] == '[dmsetup_info]':
 295             phase = 'dmsetup_info'
 296         elif line[1] == '[vx_dsk]':
 297             phase = 'vx_dsk'
 298         # new node in case of a cluster, restart with info phase
 299         elif line[0] != node:
 300             phase = 'info'
 301             node = line[0]
 302         else:
 303             if phase == 'info':
 304                 if len(line) == 2:
 305                     timestamp = int(line[1])
 306                 else:
 307                     info_plain.append(line)
 308             elif phase == 'dmsetup_info':
 309                 try:
 310                     major, minor = map(int, line[2].split(':'))
 311                     if len(line) == 5:
 312                         name = "LVM %s" % line[1]
 313                     else:
 314                         name = "DM %s" % line[1]
 315                     name_info[node, major, minor] = name
 316                 except:
 317                     pass  # ignore such crap as "No Devices Found"
 318             elif phase == 'vx_dsk':
 319                 major = int(line[1], 16)
 320                 minor = int(line[2], 16)
 321                 group, disk = line[3].split('/')[-2:]
 322                 name = "VxVM %s-%s" % (group, disk)
 323                 name_info[(node, major, minor)] = name
 324
 325     return timestamp, info_plain, name_info
 326
 327
 328 def diskstat_convert_info(parsed):
 329     disks, multipath_info = parsed
 330     converted_disks = dict(disks.items())  # we must not modify info!
 331
 332     # If we have information about multipathing, then remove the
 333     # physical path devices from the disks array. But only do this,
 334     # when there are information for the multipath device available.
 335     #
 336     # For multipath entries: Rename the generic names like "dm-8"
 337     # with multipath names like "SDataCoreSANsymphony_DAT07-fscl"
 338     if multipath_info:
 339         for uuid, multipath in multipath_info.items():
 340             if "alias" not in multipath:
 341                 multipath["alias"] = ""
 342
 343             if multipath["device"] in converted_disks or \
 344                "DM %s" % multipath["alias"] in converted_disks:
 345                 for path in multipath["paths"]:
 346                     if path in converted_disks:
 347                         del converted_disks[path]
 348
 349             if multipath["device"] in converted_disks:
 350                 converted_disks[uuid] = converted_disks[multipath["device"]]
 351                 del converted_disks[multipath["device"]]
 352
 353             if "DM %s" % multipath["alias"] in converted_disks:
 354                 alias = "DM %s" % multipath["alias"]
 355                 converted_disks[uuid] = converted_disks[alias]
 356                 del converted_disks[alias]
 357
 358     # Remove any left-over device mapper devices that are not part of a
 359     # known multipath device, LVM device or whatever
 360     for device in converted_disks.keys():
 361         if device.startswith("dm-"):
 362             del converted_disks[device]
 363
 364     return converted_disks
 365
 366
 367 def inventory_diskstat(parsed):
 368     converted_disks = diskstat_convert_info(parsed)
 369
 370     # Use generic diskstat inventory function that is used also for other
 371     # Disk IO checks. That expects a table of (node, device, ...)
 372     return inventory_diskstat_generic(
 373         [(disk["node"], device) for device, disk in converted_disks.items()])
 374
 375
 376 def check_diskstat(item, params, parsed):
 377     return check_diskstat_dict(item, params, diskstat_convert_info(parsed))
 378
 379
 380 check_info["diskstat"] = {
 381     'parse_function': parse_diskstat,
 382     'inventory_function': inventory_diskstat,
 383     'check_function': check_diskstat,
 384     'service_description': 'Disk IO %s',
 385     'has_perfdata': True,
 386     'group': 'diskstat',
 387     "node_info": True,  # add first column with actual host name
 388     'includes': ["diskstat.include"],
 389     'extra_sections': ["multipath"],
 390 }