1 #include <nagios/broker.h>
2 #include <nagios/comments.h>
4 #include "string_utils.h"
/*
 * STATUS_QUERY(type) - printf-style format string for the status UPDATE
 * statement shared by the host and service handlers.
 *
 * `type` must be a string literal: it is spliced in by adjacent-literal
 * concatenation, once as the table name after "UPDATE " and once as the
 * suffix of the "obsess_over_" column name.
 *
 * The expansion ends after the SET list; callers append their own
 * " WHERE ..." literal directly after the macro invocation.
 *
 * The conversions have to stay in step, one for one and in order, with
 * the argument list produced by STATUS_ARGS(). The three trailing %s
 * fields (output, long_output, perf_data) are emitted without quotes.
 * NOTE(review): presumably the values are expected to arrive already
 * quoted via sql_quote() - confirm.
 * NOTE(review): the service caller passes node_id ahead of STATUS_ARGS(),
 * but no leading conversion for it is visible in this excerpt - check
 * that the first SET column lines up with the arguments.
 */
6 #define STATUS_QUERY(type) \
7 "UPDATE " type " SET " \
9 "flap_detection_enabled = %d, " \
10 "check_freshness = %d, " \
11 "process_performance_data = %d, " \
12 "active_checks_enabled = %d, passive_checks_enabled = %d, " \
13 "event_handler_enabled = %d, " \
14 "obsess_over_" type " = %d, problem_has_been_acknowledged = %d, " \
15 "acknowledgement_type = %d, check_type = %d, " \
16 "current_state = %d, last_state = %d, " /* 17 - 18 */ \
17 "last_hard_state = %d, state_type = %d, " \
18 "current_attempt = %d, current_event_id = %lu, " \
19 "last_event_id = %lu, current_problem_id = %lu, " \
20 "last_problem_id = %lu, " \
21 "latency = %f, execution_time = %lf, " /* 26 - 27 */ \
22 "notifications_enabled = %d, " \
23 "last_notification = %lu, " \
24 "next_check = %lu, should_be_scheduled = %d, last_check = %lu, " \
25 "last_state_change = %lu, last_hard_state_change = %lu, " \
26 "has_been_checked = %d, " \
27 "current_notification_number = %d, current_notification_id = %lu, " \
28 "check_flapping_recovery_notifi = %d, " \
29 "scheduled_downtime_depth = %d, pending_flex_downtime = %d, " \
30 "is_flapping = %d, flapping_comment_id = %lu, " /* 41 - 42 */ \
31 "percent_state_change = %f, " \
32 "output = %s, long_output = %s, perf_data = %s"
/*
 * STATUS_ARGS(output, long_output, perf_data) - argument list that feeds
 * the conversions of STATUS_QUERY(), in the same order.
 *
 * Not hygienic: the expansion reads p->state.<field> directly, so a
 * pointer named `p` with a `state` member must be in scope wherever the
 * macro is used.
 *
 * The three macro parameters are each wrapped in safe_str() and fill the
 * trailing %s conversions of the query.
 *
 * The expansion is a bare comma-separated list, so it is only meaningful
 * inside a function-call argument list.
 */
34 #define STATUS_ARGS(output, long_output, perf_data) \
35 p->state.flap_detection_enabled, \
36 p->state.check_freshness, \
37 p->state.process_performance_data, \
38 p->state.checks_enabled, p->state.accept_passive_checks, \
39 p->state.event_handler_enabled, \
40 p->state.obsess, p->state.problem_has_been_acknowledged, \
41 p->state.acknowledgement_type, p->state.check_type, \
42 p->state.current_state, p->state.last_state, \
43 p->state.last_hard_state, p->state.state_type, \
44 p->state.current_attempt, p->state.current_event_id, \
45 p->state.last_event_id, p->state.current_problem_id, \
46 p->state.last_problem_id, \
47 p->state.latency, p->state.execution_time, \
48 p->state.notifications_enabled, \
49 p->state.last_notification, \
50 p->state.next_check, p->state.should_be_scheduled, p->state.last_check, \
51 p->state.last_state_change, p->state.last_hard_state_change, \
52 p->state.has_been_checked, \
53 p->state.current_notification_number, p->state.current_notification_id, \
54 p->state.check_flapping_recovery_notification, \
55 p->state.scheduled_downtime_depth, p->state.pending_flex_downtime, \
56 p->state.is_flapping, p->state.flapping_comment_id, \
57 p->state.percent_state_change, \
58 safe_str(output), safe_str(long_output), safe_str(perf_data)
/*
 * handle_host_status() - store one host status / host check event.
 *
 * node: originating node; node_id becomes 0 when node is &ipc and
 *       node->id + 1 otherwise.
 * cb:   callback type. NEBCALLBACK_HOST_CHECK_DATA is the only value
 *       tested here; it gates the report-log test (db_log_reports plus
 *       NEBATTR_CHECK_ALERT in p->nebattr) and the perfdata test
 *       (host_perf_table set and p->state.perf_data non-empty).
 * p:    host status payload; name, nebattr and state are read.
 *
 * Statements visible in this function:
 *  - with db_track_current set, an UPDATE of the "host" table built from
 *    STATUS_QUERY() / STATUS_ARGS(), selected by host_name;
 *  - an INSERT into sql_table_name() with event type
 *    NEBTYPE_HOSTCHECK_PROCESSED. Its long_output column receives
 *    sql_safe_unescaped_long_output, i.e. the copy that went through
 *    unescape_newlines() before sql_quote(), whereas the UPDATE uses the
 *    directly quoted long_output;
 *  - an INSERT of (timestamp, host_name, perfdata) into host_perf_table.
 *
 * Every string is passed through sql_quote() before being formatted into
 * a query, and the quoted buffers are released with safe_free().
 *
 * NOTE(review): the conditions guarding the two INSERTs and the return
 * statements are not visible in this excerpt; presumably rpt_log and
 * perf_log - confirm.
 * NOTE(review): the malloc() failure branch only shows the lerr() call;
 * check that it does not leave the function while host_name and output
 * are still allocated.
 */
61 static int handle_host_status(merlin_node
*node
, int cb
, const merlin_host_status
*p
)
64 char *output
= NULL
, *long_output
= NULL
, *sql_safe_unescaped_long_output
= NULL
, *perf_data
= NULL
;
65 int result
= 0, node_id
, rpt_log
= 0, perf_log
= 0;
67 if (cb
== NEBCALLBACK_HOST_CHECK_DATA
) {
68 if (db_log_reports
&& (p
->nebattr
& NEBATTR_CHECK_ALERT
))
70 if (host_perf_table
&& p
->state
.perf_data
&& *p
->state
.perf_data
) {
75 if (!db_track_current
&& !rpt_log
&& !perf_log
)
78 node_id
= node
== &ipc
? 0 : node
->id
+ 1;
80 sql_quote(p
->name
, &host_name
);
81 if (db_track_current
|| rpt_log
) {
82 sql_quote(p
->state
.plugin_output
, &output
);
83 if (rpt_log
&& p
->state
.long_plugin_output
) {
84 char *unescaped_long_output
= NULL
;
85 size_t long_len
= strlen(p
->state
.long_plugin_output
) + 1;
86 if ((unescaped_long_output
= malloc(long_len
)) == NULL
) {
87 lerr("failed to allocate memory for unescaped long output");
90 unescape_newlines(unescaped_long_output
, p
->state
.long_plugin_output
, long_len
);
91 sql_quote(unescaped_long_output
, &sql_safe_unescaped_long_output
);
92 free(unescaped_long_output
);
93 unescaped_long_output
= NULL
;
95 sql_quote(p
->state
.long_plugin_output
, &long_output
);
97 if (db_track_current
|| perf_log
)
98 sql_quote(p
->state
.perf_data
, &perf_data
);
100 if (db_track_current
) {
101 result
= sql_query(STATUS_QUERY("host") " WHERE host_name = %s",
103 STATUS_ARGS(output
, long_output
, perf_data
),
109 ("INSERT INTO %s(timestamp, event_type, host_name, state, "
110 "hard, retry, output, long_output, downtime_depth) "
111 "VALUES(%lu, %d, %s, %d, %d, %d, %s, %s, %d)",
112 sql_table_name(), p
->state
.last_check
,
113 NEBTYPE_HOSTCHECK_PROCESSED
, host_name
,
114 p
->state
.current_state
,
115 p
->state
.state_type
== HARD_STATE
,
116 p
->state
.current_attempt
, output
,
117 sql_safe_unescaped_long_output
,
118 p
->state
.scheduled_downtime_depth
);
122 * Stash host performance data separately, in case
123 * people people are using Merlin with Nagiosgrapher or
124 * similar performance data graphing solutions.
128 ("INSERT INTO %s(timestamp, host_name, perfdata) "
129 "VALUES(%lu, %s, %s)",
130 host_perf_table
, p
->state
.last_check
, host_name
, perf_data
);
135 safe_free(long_output
);
136 safe_free(sql_safe_unescaped_long_output
);
137 safe_free(perf_data
);
/*
 * handle_service_status() - store one service status / service check
 * event. Service counterpart of handle_host_status().
 *
 * node: originating node; node_id becomes 0 when node is &ipc and
 *       node->id + 1 otherwise.
 * cb:   callback type. NEBCALLBACK_SERVICE_CHECK_DATA is the only value
 *       tested here; it gates the report-log test (db_log_reports plus
 *       NEBATTR_CHECK_ALERT in p->nebattr) and the perfdata test
 *       (service_perf_table set and p->state.perf_data non-empty).
 * p:    service status payload; host_name, service_description, nebattr
 *       and state are read.
 *
 * Statements visible in this function:
 *  - with db_track_current set, an UPDATE of the "service" table built
 *    from STATUS_QUERY() / STATUS_ARGS(), selected by host_name and
 *    service_description; node_id is passed ahead of STATUS_ARGS();
 *  - an INSERT into sql_table_name() with event type
 *    NEBTYPE_SERVICECHECK_PROCESSED, whose long_output column receives
 *    the newline-unescaped, quoted copy;
 *  - an INSERT of (timestamp, host_name, service_description, perfdata)
 *    into service_perf_table.
 *
 * Every string is passed through sql_quote() before being formatted into
 * a query; service_description is released with free(), the optional
 * buffers with safe_free().
 *
 * NOTE(review): the report INSERT wraps the hard and retry values in
 * single quotes ('%d'), while the host variant formats them as bare %d;
 * confirm whether the difference is intended.
 * NOTE(review): the conditions guarding the two INSERTs and the return
 * statements are not visible in this excerpt. The malloc() failure
 * branch only shows the lerr() call; check that it does not leave the
 * function with host_name, service_description and output allocated.
 */
141 static int handle_service_status(merlin_node
*node
, int cb
, const merlin_service_status
*p
)
143 char *host_name
, *service_description
;
144 char *output
= NULL
, *long_output
= NULL
, *perf_data
= NULL
;
145 char *sql_safe_unescaped_long_output
= NULL
;
146 int result
= 0, node_id
, rpt_log
= 0, perf_log
= 0;
148 if (cb
== NEBCALLBACK_SERVICE_CHECK_DATA
) {
149 if (db_log_reports
&& (p
->nebattr
& NEBATTR_CHECK_ALERT
))
152 if (service_perf_table
&& p
->state
.perf_data
&& *p
->state
.perf_data
)
156 if (!db_track_current
&& !rpt_log
&& !perf_log
)
159 node_id
= node
== &ipc
? 0 : node
->id
+ 1;
161 sql_quote(p
->host_name
, &host_name
);
162 sql_quote(p
->service_description
, &service_description
);
163 if (db_track_current
|| rpt_log
) {
164 char *unescaped_long_output
= NULL
;
165 sql_quote(p
->state
.plugin_output
, &output
);
166 if(rpt_log
&& p
->state
.long_plugin_output
) {
167 size_t long_len
= strlen(p
->state
.long_plugin_output
) + 1;
168 if ((unescaped_long_output
= malloc(long_len
)) == NULL
) {
169 lerr("failed to allocate memory for unescaped long output");
172 unescape_newlines(unescaped_long_output
, p
->state
.long_plugin_output
, long_len
);
173 sql_quote(unescaped_long_output
, &sql_safe_unescaped_long_output
);
174 free(unescaped_long_output
);
175 unescaped_long_output
= NULL
;
178 sql_quote(p
->state
.long_plugin_output
, &long_output
);
181 if (db_track_current
|| perf_log
)
182 sql_quote(p
->state
.perf_data
, &perf_data
);
184 if (db_track_current
) {
186 (STATUS_QUERY("service")
187 " WHERE host_name = %s AND service_description = %s",
188 node_id
, STATUS_ARGS(output
, long_output
, perf_data
),
189 host_name
, service_description
);
194 ("INSERT INTO %s(timestamp, event_type, host_name, "
195 "service_description, state, hard, retry, output, long_output, downtime_depth) "
196 "VALUES(%lu, %d, %s, %s, %d, '%d', '%d', %s, %s, %d)",
197 sql_table_name(), p
->state
.last_check
,
198 NEBTYPE_SERVICECHECK_PROCESSED
, host_name
,
199 service_description
, p
->state
.current_state
,
200 p
->state
.state_type
== HARD_STATE
,
201 p
->state
.current_attempt
, output
,
202 sql_safe_unescaped_long_output
,
203 p
->state
.scheduled_downtime_depth
);
207 * Stash service performance data separately, in case
208 * people people are using Merlin with Nagiosgrapher or
209 * similar performance data graphing solutions.
213 ("INSERT INTO %s(timestamp, host_name, "
214 "service_description, perfdata) "
215 "VALUES(%lu, %s, %s, %s)",
216 service_perf_table
, p
->state
.last_check
,
217 host_name
, service_description
, perf_data
);
221 free(service_description
);
223 safe_free(long_output
);
224 safe_free(sql_safe_unescaped_long_output
);
225 safe_free(perf_data
);
/*
 * rpt_downtime() - log a downtime start/stop event as a report row.
 *
 * data: untyped pointer that is cast to nebstruct_downtime_data.
 *
 * The visible case labels are NEBTYPE_DOWNTIME_START and
 * NEBTYPE_DOWNTIME_STOP. For those the host name is quoted with
 * sql_quote() and one row is inserted:
 *  - when ds->service_description is set, a row carrying both host_name
 *    and the quoted service_description (freed again right after the
 *    query);
 *  - otherwise a host-only row.
 *
 * The downtime_depth column is written as the result of
 * ds->type == NEBTYPE_DOWNTIME_START, i.e. 1 for a start event and 0 for
 * any other type reaching this code.
 *
 * NOTE(review): the argument supplying the table name for the leading
 * %s, the handling of other event types, the release of host_name and
 * the return value are not visible in this excerpt.
 */
229 static int rpt_downtime(void *data
)
231 nebstruct_downtime_data
*ds
= (nebstruct_downtime_data
*)data
;
239 case NEBTYPE_DOWNTIME_START
:
240 case NEBTYPE_DOWNTIME_STOP
:
246 sql_quote(ds
->host_name
, &host_name
);
247 if (ds
->service_description
) {
248 char *service_description
;
250 sql_quote(ds
->service_description
, &service_description
);
251 depth
= ds
->type
== NEBTYPE_DOWNTIME_START
;
252 result
= sql_query("INSERT INTO %s"
253 "(timestamp, event_type, host_name,"
254 "service_description, downtime_depth) "
255 "VALUES(%lu, %d, %s, %s, %d)",
257 ds
->timestamp
.tv_sec
, ds
->type
, host_name
,
258 service_description
, depth
);
259 free(service_description
);
261 depth
= ds
->type
== NEBTYPE_DOWNTIME_START
;
262 result
= sql_query("INSERT INTO %s"
263 "(timestamp, event_type, host_name, downtime_depth)"
264 "VALUES(%lu, %d, %s, %d)",
266 ds
->timestamp
.tv_sec
, ds
->type
, host_name
, depth
);
/*
 * rpt_process_data() - log a process start/shutdown event as a report
 * row of (timestamp, event_type) in sql_table_name().
 *
 * data: untyped pointer that is cast to nebstruct_process_data.
 *
 * The event type is normalised before it is stored, and the
 * normalisation is written back through the pointer, so the caller's
 * structure is modified:
 *  - NEBTYPE_PROCESS_EVENTLOOPSTART is rewritten to NEBTYPE_PROCESS_START;
 *  - NEBTYPE_PROCESS_RESTART is rewritten to NEBTYPE_PROCESS_SHUTDOWN.
 *
 * Returns the result of sql_query() for the INSERT.
 *
 * NOTE(review): the break/fall-through layout between the case labels
 * and the path taken for other event types are not visible in this
 * excerpt.
 */
273 static int rpt_process_data(void *data
)
275 nebstruct_process_data
*ds
= (nebstruct_process_data
*)data
;
281 case NEBTYPE_PROCESS_EVENTLOOPSTART
:
282 ds
->type
= NEBTYPE_PROCESS_START
;
284 case NEBTYPE_PROCESS_START
:
285 case NEBTYPE_PROCESS_SHUTDOWN
:
287 case NEBTYPE_PROCESS_RESTART
:
288 ds
->type
= NEBTYPE_PROCESS_SHUTDOWN
;
294 return sql_query("INSERT INTO %s(timestamp, event_type) "
296 sql_table_name(), ds
->timestamp
.tv_sec
, ds
->type
);
/*
 * handle_program_status() - refresh a program_status row from a program
 * status event.
 *
 * node: supplies id (node_id = node->id + 1 on the visible assignment),
 *       peer_id and type.
 * p:    program status payload; its fields fill the SET columns in the
 *       order they appear in the format string.
 *
 * The UPDATE forces is_running to 1, takes last_alive from time(NULL)
 * and selects the row by instance_id. Peer, poller and master counters
 * as well as the check counters come from `info`.
 *
 * The two global event handler strings are passed through sql_quote(),
 * wrapped in safe_str() for formatting and released with free()
 * afterwards. config_hash is rendered with tohex(info->config_hash, 20)
 * and is placed inside single quotes by the format string itself.
 *
 * NOTE(review): the statement following the !db_track_current test, the
 * place where `info` is assigned, the instance_id argument and the
 * return value are not visible in this excerpt.
 * NOTE(review): time(NULL) and the timestamp fields of p feed %lu
 * conversions without a cast; confirm the argument types match.
 */
299 static int handle_program_status(merlin_node
*node
, const nebstruct_program_status_data
*p
)
301 char *global_host_event_handler
;
302 char *global_service_event_handler
;
304 merlin_nodeinfo
*info
;
306 if (!db_track_current
)
309 sql_quote(p
->global_host_event_handler
, &global_host_event_handler
);
310 sql_quote(p
->global_service_event_handler
, &global_service_event_handler
);
315 node_id
= node
->id
+ 1;
321 ("UPDATE program_status SET is_running = 1, "
322 "last_alive = %lu, program_start = %lu, pid = %d, daemon_mode = %d, "
323 "last_log_rotation = %lu, "
324 "notifications_enabled = %d, "
325 "active_service_checks_enabled = %d, passive_service_checks_enabled = %d, "
326 "active_host_checks_enabled = %d, passive_host_checks_enabled = %d, "
327 "event_handlers_enabled = %d, flap_detection_enabled = %d, "
328 "process_performance_data = %d, "
329 "obsess_over_hosts = %d, obsess_over_services = %d, "
330 "modified_host_attributes = %lu, modified_service_attributes = %lu, "
331 "global_host_event_handler = %s, global_service_event_handler = %s, "
332 "peer_id = %u, self_assigned_peer_id = %u, "
333 "active_peers = %u, configured_peers = %u, "
334 "active_pollers = %u, configured_pollers = %u, "
335 "active_masters = %u, configured_masters = %u, "
336 "host_checks_handled = %u, service_checks_handled = %u, "
337 "node_type = %d, config_hash = '%s' "
338 "WHERE instance_id = %d",
339 time(NULL
), p
->program_start
, p
->pid
, p
->daemon_mode
,
340 p
->last_log_rotation
,
341 p
->notifications_enabled
,
342 p
->active_service_checks_enabled
, p
->passive_service_checks_enabled
,
343 p
->active_host_checks_enabled
, p
->passive_host_checks_enabled
,
344 p
->event_handlers_enabled
, p
->flap_detection_enabled
,
345 p
->process_performance_data
,
346 p
->obsess_over_hosts
, p
->obsess_over_services
,
347 p
->modified_host_attributes
, p
->modified_service_attributes
,
348 safe_str(global_host_event_handler
), safe_str(global_service_event_handler
),
349 node
->peer_id
, info
->peer_id
,
350 info
->active_peers
, info
->configured_peers
,
351 info
->active_pollers
, info
->configured_pollers
,
352 info
->active_masters
, info
->configured_masters
,
353 info
->host_checks_handled
, info
->service_checks_handled
,
354 node
->type
, tohex(info
->config_hash
, 20),
357 free(global_host_event_handler
);
358 free(global_service_event_handler
);
/*
 * handle_flapping() - record a flapping start/stop event.
 *
 * p: flapping payload; host_name, service_description, type, comment_id,
 *    timestamp and percent_change are read.
 *
 * Both names are passed through sql_quote(). comment_id starts at 0 and
 * is only taken from p->comment_id when the event is not
 * NEBTYPE_FLAPPING_STOP, so a stop event writes a flapping_comment_id of
 * 0 (the comment itself is removed by a separate broker event).
 *
 * The quoted service_description decides which branch runs:
 *  - non-NULL: service branch. With db_log_reports an INSERT into
 *    sql_table_name() carrying host_name and service_description; with
 *    db_track_current an UPDATE of the "service" row. The quoted
 *    service_description is freed at the end of the branch.
 *  - otherwise: host branch with the equivalent INSERT and an UPDATE of
 *    the "host" row.
 * In both UPDATEs is_flapping is written as the result of
 * p->type == NEBTYPE_FLAPPING_START.
 *
 * NOTE(review): the statement following the combined
 * !db_track_current / !db_log_reports test, the conditions around the
 * lerr() calls, the release of host_name and the return value are not
 * visible in this excerpt.
 */
362 static int handle_flapping(const nebstruct_flapping_data
*p
)
365 char *host_name
, *service_description
= NULL
;
366 unsigned long comment_id
= 0;
368 if (!db_track_current
&& !db_log_reports
)
371 sql_quote(p
->host_name
, &host_name
);
372 sql_quote(p
->service_description
, &service_description
);
374 /* comments are deleted by a separate broker event */
375 if (p
->type
!= NEBTYPE_FLAPPING_STOP
) {
376 comment_id
= p
->comment_id
;
379 if (service_description
) {
380 if (db_log_reports
) {
382 ("INSERT INTO %s(timestamp, event_type, host_name, service_description) VALUES(%lu, %d, %s, %s)",
383 sql_table_name(), p
->timestamp
.tv_sec
, p
->type
, host_name
,
384 service_description
);
387 lerr("failed to insert flapping data (host: %s, service: %s, type: %d) into %s",
388 host_name
, service_description
, p
->type
, sql_table_name());
391 if (db_track_current
) {
393 ("UPDATE service SET is_flapping = %d, "
394 "flapping_comment_id = %lu, percent_state_change = %f "
395 "WHERE host_name = %s AND service_description = %s",
396 p
->type
== NEBTYPE_FLAPPING_START
,
397 comment_id
, p
->percent_change
,
398 host_name
, service_description
);
400 free(service_description
);
402 if (db_log_reports
) {
404 ("INSERT INTO %s(timestamp, event_type, host_name) VALUES(%lu, %d, %s)",
405 sql_table_name(), p
->timestamp
.tv_sec
, p
->type
, host_name
);
408 lerr("failed to insert flapping data (host: %s, type: %d) into %s",
409 host_name
, p
->type
, sql_table_name());
412 if (db_track_current
) {
414 ("UPDATE host SET is_flapping = %d, "
415 "flapping_comment_id = %lu, percent_state_change = %f "
416 "WHERE host_name = %s",
417 p
->type
== NEBTYPE_FLAPPING_START
,
418 comment_id
, p
->percent_change
, host_name
);
/*
 * handle_downtime() - keep the scheduled_downtime table in step with
 * downtime events.
 *
 * node: originating node.
 * p:    downtime payload; type, downtime_id, downtime_type, the name and
 *       comment strings and the time fields are read.
 *
 * Removal: NEBTYPE_DOWNTIME_DELETE and NEBTYPE_DOWNTIME_STOP are both
 * treated as a delete request. Two DELETE forms are visible:
 *  - by downtime_id;
 *  - by matching downtime_type, end_time, fixed, host_name,
 *    service_description, author_name and comment_data. The comparison
 *    operator for service_description is chosen at run time: "=" when
 *    the quoted value is non-NULL, " IS " otherwise, with safe_str()
 *    supplying the right-hand side. The four quoted strings are released
 *    with safe_free() afterwards.
 *
 * Switch on the event type, as far as visible:
 *  - NEBTYPE_DOWNTIME_START / NEBTYPE_DOWNTIME_STOP: nothing to store,
 *    the host and/or service status event carries the update;
 *  - NEBTYPE_DOWNTIME_LOAD: DELETE by downtime_id;
 *  - NEBTYPE_DOWNTIME_ADD: quote the strings and INSERT a
 *    scheduled_downtime row;
 *  - anything else: linfo("Unknown downtime type ...").
 *
 * NOTE(review): the statement following the !db_track_current test, the
 * condition that selects between the two DELETE forms, whether LOAD
 * falls through into ADD, and the return value are not visible in this
 * excerpt.
 */
427 static int handle_downtime(merlin_node
*node
, const nebstruct_downtime_data
*p
)
430 char *host_name
= NULL
, *service_description
= NULL
;
431 char *comment_data
= NULL
, *author_name
= NULL
;
433 if (!db_track_current
)
437 * If we stop downtime that's already started, we'll get a
438 * downtime stop event, but no downtime delete event (weird,
440 * Since we can't retroactively upgrade all Nagios instances
441 * in the world, we have to make sure STOP also means DELETE
443 if (p
->type
== NEBTYPE_DOWNTIME_DELETE
||
444 p
->type
== NEBTYPE_DOWNTIME_STOP
)
447 * for local delete events, we can use the downtime_id,
448 * which is properly indexed and quick to search for.
449 * If not, we'll have to use other heuristics to find
450 * the proper entry to delete.
451 * Note that this will remove all identical downtime
452 * entries, but as with comments, that just can't be
456 result
= sql_query("DELETE FROM scheduled_downtime "
457 "WHERE downtime_id = %lu", p
->downtime_id
);
459 sql_quote(p
->host_name
, &host_name
);
460 sql_quote(p
->service_description
, &service_description
);
461 sql_quote(p
->comment_data
, &comment_data
);
462 sql_quote(p
->author_name
, &author_name
);
463 result
= sql_query("DELETE FROM scheduled_downtime "
464 "WHERE downtime_type = %d AND "
465 "end_time = %lu AND fixed = %d AND "
466 "host_name = %s AND service_description %s %s "
467 "AND author_name = %s AND comment_data = %s",
469 (unsigned long)p
->end_time
, p
->fixed
,
470 host_name
, service_description
? "=" : " IS ",
471 safe_str(service_description
),
472 author_name
, comment_data
);
473 safe_free(host_name
);
474 safe_free(service_description
);
475 safe_free(comment_data
);
476 safe_free(author_name
);
486 case NEBTYPE_DOWNTIME_START
:
487 case NEBTYPE_DOWNTIME_STOP
:
488 /* this gets updated by the host and/or service status event */
490 case NEBTYPE_DOWNTIME_LOAD
:
492 ("DELETE FROM scheduled_downtime WHERE downtime_id = %lu",
495 case NEBTYPE_DOWNTIME_ADD
:
496 sql_quote(p
->host_name
, &host_name
);
497 sql_quote(p
->service_description
, &service_description
);
498 sql_quote(p
->author_name
, &author_name
);
499 sql_quote(p
->comment_data
, &comment_data
);
501 ("INSERT INTO scheduled_downtime "
502 "(downtime_type, host_name, service_description, entry_time, "
503 "author_name, comment_data, start_time, end_time, fixed, "
504 "duration, triggered_by, downtime_id) "
505 "VALUES(%d, %s, %s, %lu, "
506 " %s, %s, %lu, %lu, %d, "
508 p
->downtime_type
, host_name
, safe_str(service_description
),
509 p
->entry_time
, author_name
, comment_data
, p
->start_time
,
510 p
->end_time
, p
->fixed
, p
->duration
, p
->triggered_by
,
513 safe_free(service_description
);
518 linfo("Unknown downtime type %d", p
->type
);
/*
 * handle_comment() - keep the comment_tbl table in step with comment
 * events.
 *
 * node: originating node; compared against &ipc before the INSERT.
 * p:    comment payload; type, comment_id, comment_type, entry_time, the
 *       name and text strings and the persistence/expiry fields are read.
 *
 * Paths visible in this function:
 *  - a DELETE by comment_id, described by the existing comment as the
 *    simple case for events that come from the local node;
 *  - after quoting host_name, author_name, comment_data and
 *    service_description with sql_quote(): for NEBTYPE_COMMENT_DELETE a
 *    DELETE that matches on entry_time, host_name, service_description,
 *    author_name and comment_data. The operator for service_description
 *    is "=" when the quoted value is non-NULL and "IS" otherwise, with
 *    safe_str() supplying the right-hand side. As the existing comment
 *    says, this removes every identical comment on the same object;
 *  - otherwise, when node is &ipc, an INSERT of the comment row.
 *
 * NOTE(review): the statement following the !db_track_current test, the
 * condition around the DELETE by comment_id, what follows the first
 * NEBTYPE_COMMENT_DELETE test, the release of host_name, author_name and
 * comment_data, and the return value are not visible in this excerpt.
 */
525 static int handle_comment(merlin_node
*node
, const nebstruct_comment_data
*p
)
528 char *host_name
, *author_name
, *comment_data
, *service_description
;
530 if (!db_track_current
)
534 * The simple case of deleting comments is when the event
535 * comes from our local node. In that case we needn't bother
536 * with matching the comment by variable. Since we bounce
537 * COMMENT_DELETE events from remote nodes against our module
538 * before we actually delete them, this code should be the
539 * one exercised every time we delete a comment
542 sql_query("DELETE FROM comment_tbl WHERE comment_id = %lu",
544 if (p
->type
== NEBTYPE_COMMENT_DELETE
)
548 sql_quote(p
->host_name
, &host_name
);
549 sql_quote(p
->author_name
, &author_name
);
550 sql_quote(p
->comment_data
, &comment_data
);
551 sql_quote(p
->service_description
, &service_description
);
554 * Deleting comments is trickier than normal. Since each
555 * Nagios instance uses its own comment_id we're forced to
556 * use other means of uniquely identifying the comment.
557 * author_name, host and service_description, comment_data
558 * and entry_time does the trick. This means we'll delete
559 * all identical comments for the same object when we're
560 * asked to delete one such comment, but that really can't
563 if (p
->type
== NEBTYPE_COMMENT_DELETE
) {
565 ("DELETE FROM comment_tbl WHERE entry_time = %lu AND "
566 "host_name = %s AND service_description %s %s AND "
567 "author_name = %s AND comment_data = %s",
568 p
->entry_time
, host_name
, service_description
? "=" : "IS",
569 safe_str(service_description
),
570 author_name
, comment_data
);
571 } else if (node
== &ipc
) {
573 ("INSERT INTO comment_tbl(comment_type, host_name, "
574 "service_description, entry_time, author_name, comment_data, "
575 "persistent, source, entry_type, expires, expire_time, "
577 "VALUES(%d, %s, %s, %lu, %s, %s, %d, %d, %d, %d, %lu, %lu)",
578 p
->comment_type
, host_name
,
579 safe_str(service_description
), p
->entry_time
,
580 author_name
, comment_data
, p
->persistent
, p
->source
,
581 p
->entry_type
, p
->expires
, p
->expire_time
, p
->comment_id
);
587 safe_free(service_description
);
/*
 * handle_contact_notification_method() - log one contact notification
 * as a row of the notification table.
 *
 * p: notification payload; notification_type, start_time, end_time,
 *    reason_type, state, escalated and seven strings are read.
 *
 * The seven strings (contact_name, host_name, service_description,
 * output, ack_author, ack_data, command_name) are passed through
 * sql_quote() before they are formatted into the INSERT.
 * service_description, output, ack_author and ack_data are additionally
 * wrapped in safe_str(); contact_name, host_name and command_name are
 * passed as they come back from sql_quote().
 *
 * NOTE(review): the statement following the !db_log_notifications test,
 * the tail of the VALUES(...) format string, the release of the buffers
 * other than service_description and ack_author, and the return value
 * are not visible in this excerpt.
 */
592 static int handle_contact_notification_method(const nebstruct_contact_notification_method_data
*p
)
595 char *contact_name
, *host_name
, *service_description
;
596 char *output
, *ack_author
, *ack_data
, *command_name
;
598 if (!db_log_notifications
)
601 sql_quote(p
->contact_name
, &contact_name
);
602 sql_quote(p
->host_name
, &host_name
);
603 sql_quote(p
->service_description
, &service_description
);
604 sql_quote(p
->output
, &output
);
605 sql_quote(p
->ack_author
, &ack_author
);
606 sql_quote(p
->ack_data
, &ack_data
);
607 sql_quote(p
->command_name
, &command_name
);
610 ("INSERT INTO notification "
611 "(notification_type, start_time, end_time, "
612 "contact_name, host_name, service_description, "
613 "command_name, reason_type, state, output,"
614 "ack_author, ack_data, escalated) "
615 "VALUES(%d, %lu, %lu, "
619 p
->notification_type
, p
->start_time
.tv_sec
, p
->end_time
.tv_sec
,
620 contact_name
, host_name
, safe_str(service_description
),
621 command_name
, p
->reason_type
, p
->state
, safe_str(output
),
622 safe_str(ack_author
), safe_str(ack_data
), p
->escalated
);
626 safe_free(service_description
);
628 safe_free(ack_author
);
635 int mrm_db_update(merlin_node
*node
, merlin_event
*pkt
)
639 if (!sql_is_connected(1))
643 lerr("pkt is NULL in mrm_db_update()");
647 lerr("pkt->body is NULL in mrm_db_update()");
651 if (merlin_decode_event(node
, pkt
)) {
655 switch (pkt
->hdr
.type
) {
656 case NEBCALLBACK_PROGRAM_STATUS_DATA
:
657 errors
= handle_program_status(node
, (void *)pkt
->body
);
659 case NEBCALLBACK_PROCESS_DATA
:
660 errors
= rpt_process_data(pkt
->body
);
662 case NEBCALLBACK_COMMENT_DATA
:
663 errors
= handle_comment(node
, (void *)pkt
->body
);
665 case NEBCALLBACK_DOWNTIME_DATA
:
666 errors
= handle_downtime(node
, (void *)pkt
->body
);
667 errors
|= rpt_downtime((void *)pkt
->body
);
669 case NEBCALLBACK_FLAPPING_DATA
:
670 errors
= handle_flapping((void *)pkt
->body
);
672 case NEBCALLBACK_CONTACT_NOTIFICATION_METHOD_DATA
:
673 errors
= handle_contact_notification_method((void *)pkt
->body
);
675 case NEBCALLBACK_HOST_CHECK_DATA
:
676 case NEBCALLBACK_HOST_STATUS_DATA
:
677 errors
= handle_host_status(node
, (int)pkt
->hdr
.type
, (void *)pkt
->body
);
679 case NEBCALLBACK_SERVICE_CHECK_DATA
:
680 case NEBCALLBACK_SERVICE_STATUS_DATA
:
681 errors
= handle_service_status(node
, (int)pkt
->hdr
.type
, (void *)pkt
->body
);
684 /* some callbacks are unhandled by design */
685 case NEBCALLBACK_NOTIFICATION_DATA
:
686 case NEBCALLBACK_CONTACT_NOTIFICATION_DATA
:
687 case NEBCALLBACK_EXTERNAL_COMMAND_DATA
:
691 lerr("Unknown callback type %d. Weird, to say the least...",