Refactoring: Changed all check parameters starting with an 'o' to the new rulespec...
[check_mk.git] / checks / drbd
blobaa0e01c151a07bb3c2efee20bf61ce3415a2c53c
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
27 # Author: Lars Michelsen <lm@mathias-kettner.de>
29 # Example outputs from agent:
31 # While syncing:
32 # <<<drbd>>>
33 # version: 8.3.8 (api:88/proto:86-94)
34 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
35 # 0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----
36 # ns:12031428 nr:0 dw:12031364 dr:1175992347 al:2179 bm:71877 lo:37 pe:0 ua:37 ap:0 ep:1 wo:b oos:301729988
37 # [=======>............] sync'ed: 42.4% (294656/510908)M delay_probe: 145637
38 # finish: 1:23:28 speed: 60,172 (51,448) K/sec
40 # Sync stalled:
41 # <<<drbd>>>
42 # b01srv05:~ # cat /proc/drbd
43 # version: 8.3.8 (api:88/proto:86-94)
44 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
45 # 0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----
46 # ns:11545876 nr:0 dw:11545900 dr:954551211 al:1955 bm:58360 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:523171100
47 # [>....................] sync'ed: 0.1% (510908/510908)M delay_probe: 135599
48 # stalled
50 # Synced:
51 # <<<drbd>>>
52 # version: 8.3.8 (api:88/proto:86-94)
53 # GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by cssint@erzc20, 2010-06-17 14:47:26
54 # 0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----
55 # ns:12227928 nr:0 dw:12227864 dr:1477722351 al:2300 bm:90294 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
57 # Description of the /proc/drbd output:
58 # http://www.drbd.org/users-guide/ch-admin.html#s-proc-drbd
# The information from /proc/drbd is grouped as follows (extracted from the doc above)
62 # General:
63 # cs (connection state). Status of the network connection. See the section called
64 # “Connection states” for details about the various connection states.
65 # Available States:
66 # StandAlone. No network configuration available. The resource has not yet been connected,
67 # or has been administratively disconnected (using drbdadm disconnect),
68 # or has dropped its connection due to failed authentication or split brain.
69 # Disconnecting. Temporary state during disconnection. The next state is StandAlone.
70 # Unconnected. Temporary state, prior to a connection attempt.
71 # Possible next states: WFConnection and WFReportParams.
72 # Timeout. Temporary state following a timeout in the communication with the peer. Next state: Unconnected.
73 # BrokenPipe. Temporary state after the connection to the peer was lost. Next state: Unconnected.
74 # NetworkFailure. Temporary state after the connection to the partner was lost. Next state: Unconnected.
75 # ProtocolError. Temporary state after the connection to the partner was lost. Next state: Unconnected.
76 # TearDown. Temporary state. The peer is closing the connection. Next state: Unconnected.
77 # WFConnection. This node is waiting until the peer node becomes visible on the network.
78 # WFReportParams. TCP connection has been established, this node waits for the first network packet from the peer.
79 # Connected. A DRBD connection has been established, data mirroring is now active. This is the normal state.
80 # StartingSyncS. Full synchronization, initiated by the administrator, is just starting.
81 # The next possible states are: SyncSource or PausedSyncS.
82 # StartingSyncT. Full synchronization, initiated by the administrator, is just starting. Next state: WFSyncUUID.
83 # WFBitMapS. Partial synchronization is just starting. Next possible states: SyncSource or PausedSyncS.
84 # WFBitMapT. Partial synchronization is just starting. Next possible state: WFSyncUUID.
85 # WFSyncUUID. Synchronization is about to begin. Next possible states: SyncTarget or PausedSyncT.
86 # SyncSource. Synchronization is currently running, with the local node being the source of synchronization.
87 # SyncTarget. Synchronization is currently running, with the local node being the target of synchronization.
88 # PausedSyncS. The local node is the source of an ongoing synchronization, but synchronization is currently paused.
89 # This may be due to a dependency on the completion of another synchronization process,
90 # or due to synchronization having been manually interrupted by drbdadm pause-sync.
91 # PausedSyncT. The local node is the target of an ongoing synchronization, but synchronization
92 # is currently paused. This may be due to a dependency on the completion of another
93 # synchronization process, or due to synchronization having been manually interrupted by drbdadm pause-sync.
94 # VerifyS. On-line device verification is currently running, with the local node being the source of verification.
95 # VerifyT. On-line device verification is currently running, with the local node being the target of verification.
97 # ro (roles). Roles of the nodes. The role of the local node is displayed first, followed by the role of the partner
98 # node shown after the slash. See the section called “Resource roles” for details about the possible resource roles.
99 # Available Roles:
100 # Primary. The resource is currently in the primary role, and may be read from and written to.
101 # This role only occurs on one of the two nodes, unless dual-primary node is enabled.
102 # Secondary. The resource is currently in the secondary role. It normally receives updates
103 # from its peer (unless running in disconnected mode), but may neither be read from
104 # nor written to. This role may occur on one node or both nodes.
105 # Unknown. The resource's role is currently unknown. The local resource role never has this status.
106 # It is only displayed for the peer's resource role, and only in disconnected mode.
108 # ds (disk states). State of the hard disks. Prior to the slash the state of the local node is displayed,
109 # after the slash the state of the hard disk of the partner node is shown.
110 # See the section called “Disk states” for details about the various disk states.
111 # Disk States:
112 # Diskless. No local block device has been assigned to the DRBD driver. This may mean that the resource
113 # has never attached to its backing device, that it has been manually detached using drbdadm detach
114 # or that it automatically detached after a lower-level I/O error.
115 # Attaching. Transient state while reading meta data.
116 # Failed. Transient state following an I/O failure report by the local block device. Next state: Diskless.
117 # Negotiating. Transient state when an Attach is carried out on an already-connected DRBD device.
118 # Inconsistent. The data is inconsistent. This status occurs immediately upon creation of a new resource,
119 # on both nodes (before the initial full sync). Also, this status is found in one node
120 # (the synchronization target) during synchronization.
121 # Outdated. Resource data is consistent, but outdated.
122 # DUnknown. This state is used for the peer disk if no network connection is available.
123 # Consistent. Consistent data of a node without connection. When the connection
124 # is established, it is decided whether the data are UpToDate or Outdated.
125 # UpToDate. Consistent, up-to-date state of the data. This is the normal state.
127 # Network:
128 # ns (network send). Volume of net data sent to the partner via the network connection; in Kibyte.
129 # nr (network receive). Volume of net data received by the partner via the network connection; in Kibyte.
130 # Disk:
131 # dw (disk write). Net data written on local hard disk; in Kibyte.
132 # dr (disk read). Net data read from local hard disk; in Kibyte.
133 # Stats:
134 # al (activity log). Number of updates of the activity log area of the meta data.
135 # bm (bit map). Number of updates of the bitmap area of the meta data.
136 # lo (local count). Number of open requests to the local I/O sub-system issued by DRBD.
137 # pe (pending). Number of requests sent to the partner, but that have not yet been answered by the latter.
138 # ua (unacknowledged). Number of requests received by the partner via the network connection, but that have not yet been answered.
139 # ap (application pending). Number of block I/O requests forwarded to DRBD, but not yet answered by DRBD.
140 # ep (epochs). Number of epoch objects. Usually 1. Might increase under I/O load
141 # when using either the barrier or the none write ordering method. Since 8.2.7.
142 # wo (write order). Currently used write ordering method: b (barrier), f (flush), d (drain) or n (none). Since 8.2.7.
143 # oos (out of sync). Amount of storage currently out of sync; in Kibibytes. Since 8.2.6.
# Default thresholds for drbd checks. Every entry is None; the net and disk
# check functions carry a FIXME saying thresholds may be handled in the future.
drbd_net_default_levels = (None, None)
drbd_disk_default_levels = (None, None)
drbd_stats_default_levels = (
    None, None, None,
    None, None, None,
    None, None, None,
)

# Matches the first token of a device line in /proc/drbd, e.g. "0:"
_drbd_block_start_match = re.compile("^[0-9]+:")

# Field names (the part before the ':') each sub check extracts from a block
drbd_general_map = ["cs", "ro", "ds"]
drbd_net_map = ["cs", "ns", "nr"]
drbd_disk_map = ["cs", "dw", "dr"]
drbd_stats_map = [
    "cs",
    "al", "bm", "lo", "pe", "ua", "ap", "ep", "wo", "oos",
]
# Maps each known DRBD connection state to the check state it contributes.
# Connection states missing from this table are reported as state 3 by the
# general check.
drbd_cs_map = {
    # state 0
    "Connected": 0,
    "VerifyS": 0,
    "VerifyT": 0,
    # state 1
    "StandAlone": 1,
    "Disconnecting": 1,
    "WFReportParams": 1,
    "StartingSyncS": 1,
    "StartingSyncT": 1,
    "WFBitMapS": 1,
    "WFBitMapT": 1,
    "WFSyncUUID": 1,
    "SyncSource": 1,
    "SyncTarget": 1,
    "PausedSyncS": 1,
    "PausedSyncT": 1,
    "Ahead": 1,
    "Behind": 1,
    # state 2
    "Unconnected": 2,
    "Timeout": 2,
    "BrokenPipe": 2,
    "NetworkFailure": 2,
    "ProtocolError": 2,
    "TearDown": 2,
    "WFConnection": 2,
}
# Default check state per "<role>_<disk state>" combination. The general check
# falls back to this table when no rule supplies a state for the combination.
drbd_ds_map = {
    "primary_Diskless": 2,     "secondary_Diskless": 2,
    "primary_Attaching": 2,    "secondary_Attaching": 2,
    "primary_Failed": 2,       "secondary_Failed": 2,
    "primary_Negotiating": 2,  "secondary_Negotiating": 2,
    "primary_Inconsistent": 1, "secondary_Inconsistent": 1,
    "primary_Outdated": 2,     "secondary_Outdated": 2,
    "primary_DUnknown": 2,     "secondary_DUnknown": 2,
    "primary_Consistent": 2,   "secondary_Consistent": 2,
    "primary_UpToDate": 0,     "secondary_UpToDate": 0,
}
def inventory_drbd(info, checktype):
    """Discover one service per configured DRBD device found in the agent output.

    info is the agent section as a list of lines, each line being a list of
    tokens. checktype selects the sub check: 'drbd', 'drbd.net', 'drbd.disk'
    or 'drbd.stats'.

    Returns a list of (item, parameters) tuples where item is 'drbd<N>'. For
    the general check the parameters record the roles and disk states seen at
    discovery time; for the other sub checks they are the name of the matching
    default levels variable.
    """
    inventory = []
    # The first two lines contain only drbd version information
    for line in info[2:]:
        # search() yields a match object or None. Test its truthiness instead
        # of comparing it with an integer, which only worked by accident on
        # Python 2 and raises a TypeError on Python 3.
        if not _drbd_block_start_match.search(line[0]):
            continue

        item = 'drbd%s' % line[0][:-1]
        parsed = drbd_parse_block(drbd_extract_block(item, info), checktype)

        # Skip unconfigured drbd devices, and devices whose connection state
        # could not be parsed (every check function relies on 'cs')
        if parsed.get('cs') in (None, 'Unconfigured'):
            continue

        if checktype == 'drbd':
            if 'ro' not in parsed or 'ds' not in parsed:
                continue
            levels = {
                "roles_inventory": parsed['ro'],
                "diskstates_inventory": parsed['ds'],
            }
        elif checktype == 'drbd.net':
            levels = "drbd_net_default_levels"
        elif checktype == 'drbd.disk':
            levels = "drbd_disk_default_levels"
        elif checktype == 'drbd.stats':
            levels = "drbd_stats_default_levels"
        else:
            # Unknown sub check: nothing sensible to register
            continue

        inventory.append((item, levels))
    return inventory
def drbd_parse_block(block, to_parse):
    """Extract the 'key:value' fields relevant for one sub check from a block.

    block is a list of lines (each a list of tokens) belonging to one device.
    to_parse names the sub check and thereby the set of keys to keep.

    Returns a dict mapping field name to its value. For the general check the
    'ro' and 'ds' values are split at '/' into lists; all other values stay
    strings.
    """
    # Only the fields of the requested sub check are parsed
    wanted_keys = {
        'drbd': drbd_general_map,
        'drbd.net': drbd_net_map,
        'drbd.disk': drbd_disk_map,
        'drbd.stats': drbd_stats_map,
    }.get(to_parse, ())
    split_pairs = to_parse == 'drbd'

    result = {}
    for tokens in block:
        for token in tokens:
            pieces = token.split(':')
            if len(pieces) < 2 or pieces[0] not in wanted_keys:
                continue
            key, value = pieces[0], pieces[1]
            if split_pairs and key in ['ro', 'ds']:
                value = value.split('/')
            result[key] = value
    return result
def drbd_extract_block(item, info):
    """Return the lines of the agent output that belong to the device `item`.

    item is the service item ('drbd<N>'); info is the agent section as a list
    of lines, each a list of tokens. The block starts at the line whose first
    token (minus its last character) matches the item and ends right before
    the next device line. An empty list is returned if the device is absent.
    """
    block = []
    in_block = False
    # Ignore the first two lines since they contain drbd version information
    for line in info[2:]:
        if "drbd" + line[0][:-1] == item:
            in_block = True
        elif in_block and _drbd_block_start_match.search(line[0]):
            # Another block starts, so the requested block is finished.
            # The match object is tested for truthiness; comparing it with an
            # integer raises a TypeError on Python 3.
            break

        # Everything before the wanted block is skipped
        if in_block:
            block.append(line)

    return block
def drbd_get_block(item, info, checktype):
    """Return the parsed fields of device `item` for `checktype`, or None if
    the device has no block in the agent output."""
    lines = drbd_extract_block(item, info)
    if not lines:
        return None
    return drbd_parse_block(lines, checktype)
def check_drbd_general(item, params, info):
    """Check connection state, roles and disk states of one DRBD device.

    item is the service item ('drbd<N>'), info the agent section. params is
    either a dict (keys used here: 'roles', 'roles_inventory', 'diskstates',
    'diskstates_inventory') or a legacy tuple (roles, diskstates).

    Returns a (state, text) tuple. The state is the maximum of the state of
    the connection state (drbd_cs_map), the role evaluation and the disk
    state evaluation.
    """
    parsed = drbd_get_block(item, info, 'drbd')

    # Convert the legacy tuple parameters to the dict format
    if isinstance(params, tuple):
        params = {
            "roles_inventory": params[0] or None,
            "diskstates_inventory": (params[0] and params[1]) or None,
        }

    if parsed is None:
        return (3, "Undefined state")

    cs = parsed.get('cs')
    if cs == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')
    if cs not in drbd_cs_map:
        return (3, 'Undefined "connection state" in drbd output')

    # Roles and disk states are formatted as local/peer pairs below. Report a
    # clean result instead of failing with a KeyError/TypeError if the output
    # does not provide both of them as pairs.
    for key in ('ro', 'ds'):
        if len(parsed.get(key, [])) != 2:
            return (3, 'Connection State: %s, missing or malformed "%s" in drbd output'
                    % (cs, key))

    # Weight of connection state is calculated by the drbd_cs_map.
    # The roles and disk states are calculated using the expected values
    state = drbd_cs_map[cs]
    output = 'Connection State: %s' % cs

    # Roles
    output += ', Roles: %s/%s' % tuple(parsed['ro'])
    current_roles = "_".join(str(a).lower() for a in parsed["ro"])

    found_role_match = False
    if "roles" in params:
        roles = params.get("roles")
        if roles:
            for roles_entry, roles_state in roles:
                if roles_entry == current_roles:
                    found_role_match = True
                    state = max(state, roles_state)
                    output += ' %s' % state_markers[roles_state]
                    break
        else:  # Ignore roles if set to None
            found_role_match = True

    if not found_role_match:
        if "roles_inventory" in params:
            # Fall back to the roles remembered by the service discovery
            roles_inventory = params.get("roles_inventory")
            if roles_inventory and parsed["ro"] != roles_inventory:
                state = max(2, state)
                output += ' (Expected: %s/%s)' % tuple(roles_inventory)
        else:
            state = max(3, state)
            output += ' (Check requires a new service discovery)'

    output += ', Diskstates: %s/%s' % tuple(parsed['ds'])
    # Do not evaluate diskstates. Either set by rule or through the
    # legacy configuration option None in the check parameters tuple
    if ("diskstates" in params and params["diskstates"] is None) or \
       ("diskstates_inventory" in params and params["diskstates_inventory"] is None):
        return (state, output)

    params_diskstates_dict = dict(params.get("diskstates", []))
    # A list keeps the notes in local/peer order; a set would make the order
    # of the output depend on string hashing. Duplicates are still suppressed.
    diskstates_info = []
    for ro, ds in zip(parsed["ro"], parsed["ds"]):
        diskstate = "%s_%s" % (ro.lower(), ds)
        diskstate_result = params_diskstates_dict.get(diskstate)

        if diskstate_result is not None:
            # States configured by rule are always mentioned
            annotate = True
        else:
            # Defaults are only mentioned when they are not state 0;
            # unknown combinations count as state 3
            diskstate_result = drbd_ds_map.get(diskstate, 3)
            annotate = diskstate_result > 0

        state = max(state, diskstate_result)
        if annotate:
            note = '%s/%s is %s' % (ro, ds, state_markers[diskstate_result])
            if note not in diskstates_info:
                diskstates_info.append(note)

    if diskstates_info:
        output += " (%s)" % ", ".join(diskstates_info)

    return (state, output)
# Registration of the general DRBD status check
check_info["drbd"] = {
    "service_description": "DRBD %s status",
    "check_function": check_drbd_general,
    "inventory_function": lambda info: inventory_drbd(info, "drbd"),
    "has_perfdata": True,
    "group": "drbd",
}
def drbd_get_rates(list_):
    """Turn counter values into rates.

    list_ contains (type, name, item, value, unit) tuples. For each entry the
    rate is obtained from get_rate() using the counter key
    '<type>.<name>.<item>'.

    Returns (output, perfdata): the output text with one ' <name>/sec: ...'
    fragment per entry and a list of (name, rate) tuples.
    """
    timestamp = time.time()
    fragments = []
    perfdata = []
    for counter_type, name, item, value, unit in list_:
        counter_key = "%s.%s.%s" % (counter_type, name, item)
        rate = get_rate(counter_key, timestamp, value)
        perfdata.append((name, rate))
        fragments.append(' %s/sec: %s%s' % (name, rate, unit))
    return ("".join(fragments), perfdata)
def check_drbd_net(item, params, info):
    """Report the network receive ('nr') and send ('ns') rates of one device.

    Returns (3, text) if the device is missing from the agent output,
    (2, text) if it is unconfigured, otherwise (0, text, perfdata).
    """
    parsed = drbd_get_block(item, info, 'drbd.net')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    counters = [
        ('drbd.net', 'in', item, int(parsed['nr']), 'kb'),
        ('drbd.net', 'out', item, int(parsed['ns']), 'kb'),
    ]
    output, perfdata = drbd_get_rates(counters)
    # FIXME: Maybe handle thresholds in the future
    return (0, output, perfdata)
# Registration of the DRBD network traffic check
check_info["drbd.net"] = {
    "service_description": "DRBD %s net",
    "check_function": check_drbd_net,
    "inventory_function": lambda info: inventory_drbd(info, "drbd.net"),
    "has_perfdata": True,
    "group": "drbd.net",
}
def check_drbd_disk(item, params, info):
    """Report the disk write ('dw') and read ('dr') rates of one device.

    Returns (3, text) if the device is missing from the agent output,
    (2, text) if it is unconfigured, otherwise (0, text, perfdata).
    """
    parsed = drbd_get_block(item, info, 'drbd.disk')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    counters = [
        ('drbd.disk', 'write', item, int(parsed['dw']), 'kb'),
        ('drbd.disk', 'read', item, int(parsed['dr']), 'kb'),
    ]
    output, perfdata = drbd_get_rates(counters)
    # FIXME: Maybe handle thresholds in the future
    return (0, output, perfdata)
# Registration of the DRBD disk traffic check
check_info["drbd.disk"] = {
    "service_description": "DRBD %s disk",
    "check_function": check_drbd_disk,
    "inventory_function": lambda info: inventory_drbd(info, "drbd.disk"),
    "has_perfdata": True,
    "group": "drbd.disk",
}
def check_drbd_stats(item, params, info):
    """Report the statistic fields of one device as text and perfdata.

    Returns (3, text) if the device is missing from the agent output,
    (2, text) if it is unconfigured, otherwise (0, text, perfdata). Fields
    present in the output are listed in the text; every field whose value
    consists of digits only is added to the perfdata, with the blanks of its
    label replaced by underscores.
    """
    parsed = drbd_get_block(item, info, 'drbd.stats')
    if parsed is None:
        return (3, "Undefined state")

    if parsed['cs'] == 'Unconfigured':
        return (2, 'The device is "Unconfigured"')

    labelled_fields = (
        ('al', 'activity log updates'),
        ('bm', 'bit map updates'),
        ('lo', 'local count requests'),
        ('pe', 'pending requests'),
        ('ua', 'unacknowledged requests'),
        ('ap', 'application pending requests'),
        ('ep', 'epoch objects'),
        ('wo', 'write order'),
        ('oos', 'kb out of sync'),
    )

    fragments = []
    perfdata = []
    for key, label in labelled_fields:
        value = parsed.get(key)
        if value is None:
            # perfdata must always have same number of entries
            value = '0'
        else:
            fragments.append('%s: %s, ' % (label, value))

        if value.isdigit():
            perfdata.append((label.replace(" ", "_"), int(value)))

    return (0, "".join(fragments).rstrip(', '), perfdata)
# Registration of the DRBD statistics check
check_info["drbd.stats"] = {
    "service_description": "DRBD %s stats",
    "check_function": check_drbd_stats,
    "inventory_function": lambda info: inventory_drbd(info, "drbd.stats"),
    "has_perfdata": True,
    "group": "drbd.stats",
}