Refactoring: Changed all check parameters starting with an 'o' to the new rulespec...
[check_mk.git] / checks / cpu_util.include
blob1ae4c6345fe0d8b701038239cc6a92b023b87e5e
1 #!/usr/bin/python
2 # -*- encoding: utf-8; py-indent-offset: 4 -*-
3 # +------------------------------------------------------------------+
4 # | ____ _ _ __ __ _ __ |
5 # | / ___| |__ ___ ___| | __ | \/ | |/ / |
6 # | | | | '_ \ / _ \/ __| |/ / | |\/| | ' / |
7 # | | |___| | | | __/ (__| < | | | | . \ |
8 # | \____|_| |_|\___|\___|_|\_\___|_| |_|_|\_\ |
9 # | |
10 # | Copyright Mathias Kettner 2014 mk@mathias-kettner.de |
11 # +------------------------------------------------------------------+
13 # This file is part of Check_MK.
14 # The official homepage is at http://mathias-kettner.de/check_mk.
16 # check_mk is free software; you can redistribute it and/or modify it
17 # under the terms of the GNU General Public License as published by
18 # the Free Software Foundation in version 2. check_mk is distributed
19 # in the hope that it will be useful, but WITHOUT ANY WARRANTY; with-
20 # out even the implied warranty of MERCHANTABILITY or FITNESS FOR A
21 # PARTICULAR PURPOSE. See the GNU General Public License for more de-
22 # tails. You should have received a copy of the GNU General Public
23 # License along with GNU Make; see the file COPYING. If not, write
24 # to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
25 # Boston, MA 02110-1301 USA.
27 # Common file for all (modern) checks that check CPU utilization (not load!)
29 # Example for check parameters:
30 # 1. Variant: Tuple (warn, crit). This is legacy style
31 # 2. Variant: dictionary:
33 # param = {
34 # "util" : .... --> compatible with check_levels(), optional
35 # "average" : 15 # -> compute average for 15 minutes, optional
36 # }
39 # This one can handle user, system and wait. values is a list of:
40 # - 0 - name: name of core
41 # - 1 - user: normal processes executing in user mode
42 # - 2 - nice: niced processes executing in user mode
43 # - 3 - system: processes executing in kernel mode
44 # - 4 - idle: twiddling thumbs
45 # - 5 - iowait: waiting for I/O to complete
46 # - 6 - irq: servicing interrupts
47 # - 7 - softirq: servicing softirqs
48 # - 8 - steal: involuntary wait
49 # - 9 - guest: time spent in guest OK, also counted in 0 (user)
50 # -10 - guest_nice: time spent in niced guest OK, also counted in 1 (nice)
51 class CpuInfo(
52 collections.namedtuple("CPU_utilization",
53 ('name', 'user', 'nice', 'system', 'idle', 'iowait', 'irq',
54 'softirq', 'steal', 'guest', 'guest_nice'))):
55 __slots__ = ()
57 @property
58 def util_total(self):
59 return self.user + self.nice + self.system + self.iowait + self.irq + self.softirq + self.steal
61 @property
62 def total_sum(self):
63 return self.util_total + self.idle
65 @property
66 def utils_perc(self):
67 # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/kernel/sched/cputime.c
68 # see 'account_guest_time'
69 # if task_nice(p) <= 0:
70 # cpustat[CPUTIME_USER] += cputime;
71 # cpustat[CPUTIME_GUEST] += cputime;
72 guest = self.guest + self.guest_nice
73 user = self.user + self.nice - guest
75 system = self.system + self.irq + self.softirq
76 wait = self.iowait
77 steal = self.steal
78 total_sum = self.total_sum
80 perc = [
81 100.0 * float(x) / float(total_sum)
82 for x in [user, system, wait, steal, guest, self.util_total]
84 return perc
87 def cpu_info(elements, caster=int):
88 entries = [elements[0]] + map(caster, elements[1:])
89 entries.extend([0] * (11 - len(entries)))
90 return CpuInfo(*entries)
93 def util_counter(stats, this_time):
94 # Compute jiffi-differences of all relevant counters
95 diff_values = []
96 for n, v in enumerate(stats[1:], start=1):
97 countername = "cpu.util.%d" % n
98 last_val = get_item_state(countername, (0, 0))[1]
99 diff_values.append(v - last_val)
100 set_item_state(countername, (this_time, v))
102 return cpu_info([stats.name] + diff_values)
105 # normalize name of a cpu core so that the perfdata-template
106 # recognizes it. If the input name doesn't end on a number, this
107 # returns consecutive numbers per call so this function has to be
108 # called exactly once per core
109 def cpu_util_core_name(orig, core_index):
110 expr = regex(r"\d+$")
111 match = expr.search(orig)
112 if match is not None:
113 num = match.group(0)
114 else:
115 # fallback: if the cores have odd names, use
116 # consecutive numbers for each call
117 num = core_index
118 return "cpu_core_util_%s" % num
121 def check_cpu_util(util, params, this_time=None, cores=None, perf_max=100):
122 # Convert legacy param style to new dict style
123 if params is None:
124 params = {}
125 elif isinstance(params, tuple):
126 params = {"util": params}
128 if this_time is None:
129 this_time = time.time()
131 levels = params.get("util")
132 if levels is None: # legacy rules before 1.6
133 levels = params.get("levels")
135 warn, crit = levels if isinstance(levels, tuple) else (None, None) # only for perfdata
136 perfdata = [("util", util, warn, crit, 0, perf_max)]
138 # Averaging
139 if "average" in params:
140 util_avg = get_average("cpu_utilization.avg", this_time, util, params["average"])
141 perfdata.append(("util_average", util_avg, warn, crit, 0, perf_max))
142 state, infotext, extraperf = check_levels(
143 util_avg,
144 "util_average",
145 levels,
146 human_readable_func=get_percent_human_readable,
147 infoname="%dmin average" % params["average"])
148 else:
149 state, infotext, extraperf = check_levels(
150 util,
151 "util",
152 levels,
153 human_readable_func=get_percent_human_readable,
154 infoname="total cpu")
156 perfdata += extraperf[1:] # reference curve for predictive levels
157 yield state, infotext, perfdata
159 if "core_util_time_total" in params:
160 threshold, warn, crit = params["core_util_time_total"]
161 yield cpu_util_time(this_time, "total", util, threshold, warn, crit)
163 if cores and any([x in params for x in ["core_util_graph", "core_util_time", "levels_single"]]):
164 for core_index, (core, total_perc) in enumerate(cores):
165 for perfdata in util_perfdata(core, total_perc, core_index, this_time, params):
166 yield perfdata
169 def check_cpu_util_unix(values, params, cores=None, values_counter=True):
170 this_time = time.time()
171 if values_counter:
172 diff_values = util_counter(values, this_time)
173 sum_jiffies = diff_values.total_sum
174 if sum_jiffies == 0:
175 raise MKCounterWrapped("Too short time difference since last check")
176 user_perc, system_perc, wait_perc, steal_perc, guest_perc, util_total_perc = diff_values.utils_perc
177 else:
178 user_perc = values.user
179 system_perc = values.system
180 wait_perc = values.iowait
181 util_total_perc = values.util_total
183 yield check_levels(user_perc, 'user', None, "%")
184 yield check_levels(system_perc, 'system', None, "%")
185 yield check_levels(wait_perc, 'wait', params.get('iowait'), "%")
187 # Compute values used in virtualized environments (Xen, etc.)
188 # Only do this for counters that have counted at least one tick
189 # since the system boot. This avoids silly output in systems
190 # where these counters are not being used
191 if values.steal:
192 yield check_levels(steal_perc, "steal", params.get('steal'), "%")
194 if values.guest:
195 yield check_levels(guest_perc, 'guest', None, "%")
197 summary_cores = []
198 if cores:
199 for core in cores:
200 prev_total = get_item_state("cpu.util.%s.total" % core.name, 0)
201 util_total = core.util_total
202 total_diff = util_total - prev_total
203 set_item_state("cpu.util.%s.total" % core.name, util_total)
204 total_perc = (100.0 * total_diff / sum_jiffies) * len(cores)
205 summary_cores.append((core.name, total_perc))
207 for check_result in check_cpu_util(
208 util_total_perc, params, this_time, summary_cores, perf_max=None):
209 yield check_result
212 def util_perfdata(core, total_perc, core_index, this_time, params):
213 if "core_util_graph" in params:
214 yield 0, "", [(cpu_util_core_name(core, core_index), total_perc)]
216 if "core_util_time" in params:
217 threshold, warn, crit = params["core_util_time"]
218 yield cpu_util_time(this_time, core, total_perc, threshold, warn, crit)
220 state, infotext, _ = check_levels(
221 total_perc,
222 "Core %s" % core,
223 params.get('levels_single'),
224 human_readable_func=get_percent_human_readable)
225 if state:
226 yield state, infotext, []
229 # .--helper--------------------------------------------------------------.
230 # | _ _ |
231 # | | |__ ___| |_ __ ___ _ __ |
232 # | | '_ \ / _ \ | '_ \ / _ \ '__| |
233 # | | | | | __/ | |_) | __/ | |
234 # | |_| |_|\___|_| .__/ \___|_| |
235 # | |_| |
236 # '----------------------------------------------------------------------'
239 def cpu_util_time(this_time, core, perc, threshold, warn_core, crit_core):
240 core_state_name = "cpu.util.core.high.%s" % core
241 if perc > threshold:
242 timestamp = get_item_state(core_state_name, 0)
243 high_load_duration = (this_time - timestamp)
244 state, infotext, _ = check_levels(
245 high_load_duration,
246 "%s is under high load for" % core, (warn_core, crit_core),
247 human_readable_func=get_age_human_readable)
248 if timestamp == 0:
249 set_item_state(core_state_name, this_time)
250 elif state:
251 return state, infotext
252 return 0, ""
254 clear_item_state(core_state_name)
255 return 0, ""