import: Revamp parse_import_notification()
[nagios-reports-module.git] / import.c
blob0593a58c8264d9e4f4c8f1ee24a8acd08f5c1177
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include <nagios/broker.h>
6 #include <nagios/nebcallbacks.h>
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
13 #include "cfgfile.h"
/* Event code of log lines that are recognised but deliberately skipped. */
15 #define IGNORE_LINE 0
/*
 * Offsets added to NEBTYPE_* values in event_codes so that the host and
 * service flavours of the same event get distinct switch labels.
 */
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
/* Number of slots allocated for the strv field vector in main(). */
20 #define MAX_NVECS 16
/* Size handed to hash_init() for both downtime tables. */
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
24 #define SERVICE_OK 0
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
/*
 * Progress bookkeeping: bytes consumed so far (parse_line), summed size of
 * all log files (main) and number of lines accounted for (show_progress).
 */
32 static uint imported, totsize, totlines;
/* Lines parsed since the last progress update; whether to print progress. */
33 static int lines_since_progress, do_progress;
/* Wall-clock time at which main() started importing. */
34 static struct timeval import_start;
/*
 * Log timestamps of the most recent daemon start and stop events, and the
 * cut-off for --incremental imports (lines older than this are skipped).
 */
35 static time_t daemon_start, daemon_stop, incremental;
/* Non-zero between a start event and the following stop event. */
36 static int daemon_is_running;
/* Largest value dt_depth has reached (see insert_downtime_event). */
37 static uint max_dt_depth;
39 static time_t next_dt_purge; /* when next to purge expired downtime */
40 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
42 static time_t ltime; /* the timestamp from the current log-line */
/* Counters of downtime start and stop events passed to insert_downtime_event. */
44 static int dt_start, dt_stop;
45 #define dt_depth (dt_start - dt_stop)
/* Active downtime entries, keyed by host and by host+service respectively. */
46 static hash_table *host_downtime;
47 static hash_table *service_downtime;
/* Id given to started downtimes; bumped in insert_downtime(). */
48 static int downtime_id;
/*
 * Set to the log time of a daemon start event and cleared by parse_line()
 * once the log time has moved more than one second past it.
 */
49 static time_t probably_ignore_downtime;
/*
 * One downtime, either as scheduled by an external command (kept in the
 * dentry buckets) or as currently active (kept in the downtime hash tables).
 */
51 struct downtime_entry {
/* id assigned by add_downtime() */
52 int id;
/* command code copied from the string_code that scheduled it */
53 int code;
54 char *host;
/* NULL for host downtime */
55 char *service;
/* scheduled window, parsed from the command's start and end fields */
56 time_t start;
57 time_t stop;
/* parsed from the command's "fixed" field */
58 int fixed;
59 time_t duration;
/* log time at which the downtime actually started (add_downtime) */
60 time_t started;
/* NOTE(review): never written or read in the visible code */
61 time_t ended;
/* set by remove_downtime(); purge_expired_dt() skips such entries */
62 int purged;
63 int trigger;
/* bucket index and chain link used by stash_downtime_command() */
64 int slot;
65 struct downtime_entry *next;
/* Number of buckets in dentry; commands are bucketed by start % NUM_DENTRIES. */
68 #define NUM_DENTRIES 1024
/* Bucket array of scheduled downtime commands, allocated in main(). */
69 static struct downtime_entry **dentry;
/* Log time of the most recent downtime start handled by insert_downtime(). */
70 static time_t last_downtime_start;
/*
 * Table of log-line event names. add_ignored() entries are skipped by the
 * parser; add_code() entries carry the code parse_line() switches on.
 * The first add_code() argument is presumably the number of ';'-separated
 * fields (sc->nvecs) the line must have -- TODO confirm against the macro.
 * The table ends with an all-zero sentinel.
 */
72 static struct string_code event_codes[] = {
73 add_ignored("Error"),
74 add_ignored("Warning"),
75 add_ignored("LOG ROTATION"),
76 add_ignored("HOST FLAPPING ALERT"),
77 add_ignored("SERVICE FLAPPING ALERT"),
78 add_ignored("SERVICE EVENT HANDLER"),
79 add_ignored("HOST EVENT HANDLER"),
80 add_ignored("LOG VERSION"),
82 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_HOST),
83 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE),
84 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
85 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
86 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
87 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
88 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
90 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
91 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
93 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
94 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
95 { 0, NULL, 0, 0 },
/*
 * Table of external commands the importer reacts to: downtime deletion,
 * downtime scheduling and problem acknowledgement. The first add_cdef()
 * argument is presumably the field count used when splitting the command's
 * arguments -- TODO confirm against the macro. Ends with an all-zero sentinel.
 */
98 static struct string_code command_codes[] = {
99 add_cdef(1, DEL_HOST_DOWNTIME),
100 add_cdef(1, DEL_SVC_DOWNTIME),
101 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
106 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
107 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
109 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
112 * These really have one more field than listed here. We omit one
113 * to make author and comment concatenated with a semi-colon by default.
115 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
116 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
117 { 0, NULL, 0, 0 },
/* Print the first n entries of the string vector v to stdout, one per line. */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a short human-readable string ("512 bytes",
 * "2.00 KiB", "1.00 MiB", ...).
 *
 * Returns a pointer into one of two alternating static buffers, so the
 * results of two consecutive calls can be used in the same printf().
 * Not reentrant.
 */
static const char *tobytes(uint n)
{
	static const char suffix[] = "KMGT";
	static char tbuf[2][30];
	static int t = 0;
	int shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%u bytes", n);
		return tbuf[t];
	}

	/*
	 * Step up one unit whenever the value is at least 1024 of the
	 * current unit, so exact powers of 1024 read "1.00 MiB" rather than
	 * "1024.00 KiB". The shift is capped so neither the bit shift nor
	 * the suffix lookup can run out of range.
	 */
	while (shift < 3 && (n >> (shift * 10)) >= 1024)
		shift++;

	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
		 (float)n / (float)(1u << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
152 static void show_progress(void)
154 time_t eta, elapsed;
155 float pct_done;
157 totlines += lines_since_progress;
158 lines_since_progress = 0;
160 if (!do_progress)
161 return;
163 elapsed = time(NULL) - import_start.tv_sec;
164 if (!elapsed)
165 elapsed = 1;
167 pct_done = ((float)imported / (float)totsize) * 100;
168 eta = (elapsed / pct_done) * (100.0 - pct_done);
170 printf("\rImporting data: %.2f%% (%s) done ",
171 pct_done, tobytes(imported));
172 if (elapsed > 10) {
173 printf("ETA: ");
174 if (eta > 60)
175 printf("%lum%lus", eta / 60, eta % 60);
176 else
177 printf("%lus", eta);
179 printf(" ");
182 static void end_progress(void)
184 struct timeval tv;
185 int mins;
186 float secs;
188 gettimeofday(&tv, NULL);
191 * If any of the logfiles doesn't have a newline
192 * at end of file, imported will be slightly off.
193 * We set it hard here so as to make sure that
194 * the final progress output stops at exactly 100%
196 imported = totsize;
198 show_progress();
199 putchar('\n');
200 secs = (tv.tv_sec - import_start.tv_sec) * 1000000;
201 secs += tv.tv_usec - import_start.tv_usec;
202 mins = (tv.tv_sec - import_start.tv_sec) / 60;
203 secs /= 1000000;
204 secs -= (mins * 60);
205 printf("%s in %u lines imported in ", tobytes(totsize), totlines);
206 if (mins)
207 printf("%dm ", mins);
208 printf("%.3fs\n", secs);
211 static int use_sql = 1;
212 static int insert_downtime_event(int type, char *host, char *service, int id)
214 nebstruct_downtime_data ds;
215 int result;
217 if (!is_interesting_service(host, service))
218 return 0;
220 dt_start += type == NEBTYPE_DOWNTIME_START;
221 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
222 if (dt_depth > max_dt_depth)
223 max_dt_depth = dt_depth;
225 if (!use_sql)
226 return 0;
228 memset(&ds, 0, sizeof(ds));
230 ds.type = type;
231 ds.timestamp.tv_sec = ltime;
232 ds.host_name = host;
233 ds.service_description = service;
234 ds.downtime_id = id;
236 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
237 if (result < 0)
238 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
239 type, host, service, id);
241 return result;
/*
 * Fields filled in by parse_import_notification(): the notification type
 * (service or host), the reason code and the parsed state.
 */
244 typedef struct import_notification {
245 int type, reason, state;
246 } import_notification;
248 static int parse_import_notification(char *str, import_notification *n)
250 char *state_str = str;
252 n->reason = parse_notification_reason(str);
253 if (n->reason != NOTIFICATION_NORMAL) {
254 char *space, *paren;
256 printf("Parsing notification type and state from '%s'\n", str);
258 space = strchr(str, ' ');
259 if (!space)
260 return -1;
261 paren = strchr(space, ')');
262 if (!paren)
263 return -1;
264 *paren = '\0';
266 state_str = space + 2;
267 printf("state_str: '%s'; str: '%s'\n", state_str, str);
270 n->type = SERVICE_NOTIFICATION;
271 n->state = parse_service_state_gently(state_str);
272 if (n->state < 0) {
273 n->type = HOST_NOTIFICATION;
274 n->state = parse_host_state_gently(state_str);
277 return 0;
280 static int insert_notification(struct string_code *sc)
282 int base_idx = 0;
283 const char *desc = NULL;
284 struct import_notification n;
286 parse_import_notification(strv[base_idx + 2], &n);
287 return sql_query
288 ("INSERT INTO merlin.notification("
289 "notification_type, start_time, end_time, contact_name, "
290 "host_name, service_description, "
291 "command_name, output, "
292 "state, reason_type) "
293 "VALUES("
294 "%d, %lu, %lu, '%s', "
295 "'%s', '%s', "
296 "'%s', '%s', "
297 "%d, %d)",
298 n.type, ltime, ltime, sql_escape(strv[0]),
299 sql_escape(strv[1]), desc ? sql_escape(desc) : "''",
300 sql_escape(strv[base_idx + 3]), sql_escape(strv[base_idx + 4]),
301 n.state, n.reason);
304 static int insert_service_check(struct string_code *sc)
306 nebstruct_service_check_data ds;
308 if (!is_interesting_service(strv[0], strv[1]))
309 return 0;
311 memset(&ds, 0, sizeof(ds));
313 ds.timestamp.tv_sec = ltime;
314 ds.type = sc->code;
315 ds.host_name = strv[0];
316 ds.service_description = strv[1];
317 if (sc->nvecs == 4) {
318 /* passive service check result */
319 if (*strv[2] >= '0' && *strv[2] <= '9')
320 ds.state = atoi(strv[2]);
321 else
322 ds.state = parse_service_state(strv[2]);
323 ds.state_type = HARD_STATE;
324 ds.current_attempt = 1;
325 ds.output = strv[3];
326 } else {
327 ds.state = parse_service_state(strv[2]);
328 ds.state_type = soft_hard(strv[3]);
329 ds.current_attempt = atoi(strv[4]);
330 ds.output = strv[5];
333 if (!use_sql)
334 return 0;
336 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
339 static int insert_host_check(struct string_code *sc)
341 nebstruct_host_check_data ds;
343 if (!is_interesting_host(strv[0]))
344 return 0;
346 memset(&ds, 0, sizeof(ds));
348 ds.timestamp.tv_sec = ltime;
349 ds.type = sc->code;
350 ds.host_name = strv[0];
351 if (sc->nvecs == 3) {
352 if (*strv[1] >= '0' && *strv[1] <= '9')
353 ds.state = atoi(strv[1]);
354 else
355 ds.state = parse_host_state(strv[1]);
356 /* passive host check result */
357 ds.output = strv[2];
358 ds.current_attempt = 1;
359 ds.state_type = HARD_STATE;
360 } else {
361 ds.state = parse_host_state(strv[1]);
362 ds.state_type = soft_hard(strv[2]);
363 ds.current_attempt = atoi(strv[3]);
364 ds.output = strv[4];
367 if (!use_sql)
368 return 0;
370 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
373 static int insert_process_event(int type)
375 nebstruct_process_data ds;
377 if (!use_sql)
378 return 0;
380 memset(&ds, 0, sizeof(ds));
381 ds.timestamp.tv_sec = ltime;
382 ds.type = type;
383 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/* Placeholder: acknowledgements are recognised but not stored; returns 0. */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc;

	return 0;
}
391 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
393 if (!debug_level)
394 return;
396 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
397 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
398 printf("%s", dt->host);
399 if (dt->service)
400 printf(";%s", dt->service);
401 putchar('\n');
404 static struct downtime_entry *last_dte;
405 static struct downtime_entry *del_dte;
407 static void remove_downtime(struct downtime_entry *dt);
408 static int del_matching_dt(void *data)
410 struct downtime_entry *dt = data;
412 if (del_dte->id == dt->id) {
413 dt_print("ALSO", 0, dt);
414 remove_downtime(dt);
417 return 0;
420 static void stash_downtime_command(struct downtime_entry *dt)
422 dt->slot = dt->start % NUM_DENTRIES;
423 dt->next = dentry[dt->slot];
424 dentry[dt->slot] = dt;
427 static void remove_downtime(struct downtime_entry *dt)
429 struct downtime_entry *old;
431 if (!is_interesting_service(dt->host, dt->service))
432 return;
434 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
436 if (!dt->service)
437 old = hash_remove(host_downtime, dt->host);
438 else
439 old = hash_remove2(service_downtime, dt->host, dt->service);
441 dt_print("RM_DT", ltime, dt);
442 dt->purged = 1;
/*
 * Walk the chain starting at dt and return the first scheduled downtime
 * command that can explain a downtime starting now (ltime) for the given
 * host and, optionally, service. Returns NULL when nothing in the chain
 * matches. May adjust dt->start and dt->stop of a candidate as a side effect.
 */
445 static struct downtime_entry *
446 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
448 for (; dt; dt = dt->next) {
449 time_t diff;
/* the current log time must fall inside the scheduled window */
451 if (ltime > dt->stop || ltime < dt->start) {
452 continue;
/* how strictly host/service must match depends on the command type */
455 switch (dt->code) {
456 case SCHEDULE_SVC_DOWNTIME:
457 if (service && strcmp(service, dt->service))
458 continue;
460 /* fallthrough */
461 case SCHEDULE_HOST_DOWNTIME:
462 case SCHEDULE_HOST_SVC_DOWNTIME:
463 if (strcmp(host, dt->host)) {
464 continue;
467 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
468 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
469 /* these two have host set in dt, but
470 * it will not match all the possible hosts */
472 /* fallthrough */
473 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
474 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
475 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
476 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
477 break;
478 default:
479 crash("dt->code not set properly\n");
483 * Once we get here all the various other criteria have
484 * been matched, so we need to check if the daemon was
485 * running when this downtime was supposed to have
486 * started, and otherwise use the daemon start time
487 * as the value to diff against
489 if (daemon_stop < dt->start && daemon_start > dt->start) {
490 debug("Adjusting dt->start (%lu) to (%lu)\n",
491 dt->start, daemon_start);
492 dt->start = daemon_start;
493 if (dt->trigger && dt->duration)
494 dt->stop = dt->start + dt->duration;
/*
 * accept when the downtime starts within 3 seconds of its scheduled
 * start, or when it is triggered or flexible (not fixed)
 */
497 diff = ltime - dt->start;
498 if (diff < 3 || dt->trigger || !dt->fixed)
499 return dt;
502 return NULL;
505 static struct downtime_entry *
506 find_downtime_command(char *host, char *service)
508 int i;
509 struct downtime_entry *shortcut = NULL;
511 if (last_dte && last_dte->start == ltime) {
512 shortcut = last_dte;
513 // return last_dte;
515 for (i = 0; i < NUM_DENTRIES; i++) {
516 struct downtime_entry *dt;
517 dt = dt_matches_command(dentry[i], host, service);
518 if (dt) {
519 if (shortcut && dt != shortcut)
520 if (debug_level)
521 printf("FIND shortcut no good\n");
522 last_dte = dt;
523 return dt;
527 debug("FIND not\n");
528 return NULL;
531 static int print_downtime(void *data)
533 struct downtime_entry *dt = (struct downtime_entry *)data;
535 dt_print("UNCLOSED", ltime, dt);
537 return 0;
540 static inline void set_next_dt_purge(time_t base, time_t add)
542 if (!next_dt_purge || next_dt_purge > base + add)
543 next_dt_purge = base + add;
545 if (next_dt_purge <= ltime)
546 next_dt_purge = ltime + 1;
549 static inline void add_downtime(char *host, char *service, int id)
551 struct downtime_entry *dt, *cmd, *old;
553 if (!is_interesting_service(host, service))
554 return;
556 dt = malloc(sizeof(*dt));
557 cmd = find_downtime_command(host, service);
558 if (!cmd) {
559 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
560 memset(dt, 0, sizeof(*dt));
561 dt->duration = 7200; /* the default downtime duration in nagios */
562 dt->start = ltime;
563 dt->stop = dt->start + dt->duration;
565 else
566 memcpy(dt, cmd, sizeof(*dt));
568 dt->host = strdup(host);
569 dt->id = id;
570 dt->started = ltime;
572 set_next_dt_purge(ltime, dt->duration);
574 if (!service) {
575 dt->service = NULL;
576 old = hash_update(host_downtime, dt->host, dt);
578 else {
579 dt->service = strdup(service);
580 old = hash_update2(service_downtime, dt->host, dt->service, dt);
583 if (old && old != dt) {
584 free(old->host);
585 if (old->service)
586 free(old->service);
587 free(old);
590 dt_print("IN_DT", ltime, dt);
591 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
594 static time_t last_host_dt_del, last_svc_dt_del;
595 static int register_downtime_command(struct string_code *sc)
597 struct downtime_entry *dt;
598 char *start_time, *end_time, *duration = NULL;
599 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
600 time_t foo;
602 switch (sc->code) {
603 case DEL_HOST_DOWNTIME:
604 last_host_dt_del = ltime;
605 return 0;
606 case DEL_SVC_DOWNTIME:
607 last_svc_dt_del = ltime;
608 return 0;
610 case SCHEDULE_HOST_DOWNTIME:
611 if (strtotimet(strv[5], &foo))
612 duration = strv[4];
613 /* fallthrough */
614 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
615 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
616 case SCHEDULE_HOST_SVC_DOWNTIME:
617 host = strv[0];
618 /* fallthrough */
619 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
620 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
621 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
622 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
623 start_time = strv[1];
624 end_time = strv[2];
625 fixed = strv[3];
626 if (strtotimet(strv[5], &foo))
627 triggered_by = strv[4];
628 if (!duration)
629 duration = strv[5];
631 break;
633 case SCHEDULE_SVC_DOWNTIME:
634 host = strv[0];
635 service = strv[1];
636 start_time = strv[2];
637 end_time = strv[3];
638 fixed = strv[4];
639 if (strtotimet(strv[6], &foo)) {
640 triggered_by = strv[5];
641 duration = strv[6];
643 else {
644 duration = strv[5];
646 break;
648 default:
649 crash("Unknown downtime type: %d", sc->code);
652 if (!(dt = calloc(sizeof(*dt), 1)))
653 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
655 dt->code = sc->code;
656 if (host)
657 dt->host = strdup(host);
658 if (service)
659 dt->service = strdup(service);
661 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
662 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
664 print_strvec(strv, sc->nvecs);
665 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
666 command_codes[sc->code - 1].str, start_time, end_time, duration);
670 * sometimes downtime commands can be logged according to
671 * log version 1, while the log still claims to be version 2.
672 * Apparently, this happens when using a daemon supporting
673 * version 2 logging but a downtime command is added that
674 * follows the version 1 standard.
675 * As such, we simply ignore the result of the "duration"
676 * field conversion and just accept that it might not work
678 (void)strtotimet(duration, &dt->duration);
679 dt->fixed = *fixed - '0';
682 * ignore downtime scheduled to take place in the future.
683 * It will be picked up by the module anyways
685 if (dt->start > time(NULL)) {
686 free(dt);
687 return 0;
690 if (dt->duration > time(NULL)) {
691 warn("Bizarrely large duration (%lu)", dt->duration);
693 if (dt->start < ltime) {
694 if (dt->duration && dt->duration > ltime - dt->start)
695 dt->duration -= ltime - dt->start;
697 dt->start = ltime;
699 if (dt->stop < ltime || dt->stop < dt->start) {
700 /* retroactively scheduled downtime, or just plain wrong */
701 dt->stop = dt->start;
702 dt->duration = 0;
705 if (dt->fixed && dt->duration != dt->stop - dt->start) {
706 // warn("duration doesn't match stop - start: (%lu : %lu)",
707 // dt->duration, dt->stop - dt->start);
709 dt->duration = dt->stop - dt->start;
711 else if (dt->duration > 86400 * 14) {
712 warn("Oddly long duration: %lu", dt->duration);
715 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
716 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
718 stash_downtime_command(dt);
719 return 0;
722 static int insert_downtime(struct string_code *sc)
724 int type;
725 struct downtime_entry *dt = NULL;
726 int id = 0;
727 time_t dt_del_cmd;
728 char *host, *service = NULL;
730 host = strv[0];
731 if (sc->nvecs == 4) {
732 service = strv[1];
733 dt = hash_find2(service_downtime, host, service);
735 else
736 dt = hash_find(host_downtime, host);
739 * to stop a downtime we can either get STOPPED or
740 * CANCELLED. So far, I've only ever seen STARTED
741 * for when it actually starts though, and since
742 * the Nagios daemon is reponsible for launching
743 * it, it's unlikely there are more variants of
744 * that string
746 type = NEBTYPE_DOWNTIME_STOP;
747 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
748 type = NEBTYPE_DOWNTIME_START;
750 switch (type) {
751 case NEBTYPE_DOWNTIME_START:
752 if (dt) {
753 if (!probably_ignore_downtime)
754 dt_print("ALRDY", ltime, dt);
755 return 0;
758 if (probably_ignore_downtime)
759 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
760 probably_ignore_downtime, ltime, host, service);
762 if (ltime - last_downtime_start > 1)
763 downtime_id++;
765 id = downtime_id;
766 add_downtime(host, service, id);
767 last_downtime_start = ltime;
768 break;
770 case NEBTYPE_DOWNTIME_STOP:
771 if (!dt) {
773 * this can happen when overlapping downtime entries
774 * occur, and the start event for the second (or nth)
775 * downtime starts before the first downtime has had
776 * a stop event. It basically means we've almost
777 * certainly done something wrong.
779 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
780 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
781 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
782 return 0;
785 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
787 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
788 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
789 ltime - dt->started, dt->host, dt->service, dt->duration);
791 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
792 dt_print("Long", ltime, dt);
794 remove_downtime(dt);
796 * Now delete whatever matching downtimes we can find.
797 * this must be here, or we'll recurse like crazy into
798 * remove_downtime(), possibly exhausting the stack
799 * frame buffer
801 del_dte = dt;
802 if (!dt->service)
803 hash_walk_data(host_downtime, del_matching_dt);
804 else
805 hash_walk_data(service_downtime, del_matching_dt);
806 break;
808 default:
809 return -1;
812 return 0;
815 static int dt_purged;
816 static int purge_expired_dt(void *data)
818 struct downtime_entry *dt = data;
820 if (dt->purged) {
821 return 0;
824 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
825 dt_purged++;
826 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
827 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
828 remove_downtime(dt);
830 else {
831 dt_print("PURGED_NOT_TIME", ltime, dt);
834 set_next_dt_purge(dt->started, dt->duration);
836 return 0;
839 static int purged_downtimes;
840 static void purge_expired_downtime(void)
842 int tot_purged = 0;
844 next_dt_purge = 0;
845 dt_purged = 0;
846 hash_walk_data(host_downtime, purge_expired_dt);
847 if (dt_purged)
848 debug("PURGE %d host downtimes purged", dt_purged);
849 tot_purged += dt_purged;
850 dt_purged = 0;
851 hash_walk_data(service_downtime, purge_expired_dt);
852 if (dt_purged)
853 debug("PURGE %d service downtimes purged", dt_purged);
854 tot_purged += dt_purged;
855 if (tot_purged)
856 debug("PURGE total %d entries purged", tot_purged);
858 if (next_dt_purge)
859 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
860 next_dt_purge, next_dt_purge - ltime);
862 purged_downtimes += tot_purged;
865 static inline void handle_start_event(void)
867 if (!daemon_is_running)
868 insert_process_event(NEBTYPE_PROCESS_START);
870 probably_ignore_downtime = daemon_start = ltime;
871 daemon_is_running = 1;
874 static inline void handle_stop_event(void)
876 if (daemon_is_running) {
877 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
878 daemon_is_running = 0;
880 daemon_stop = ltime;
/*
 * Parse one log line of the form "[timestamp] EVENT NAME: field;field;..."
 * and dispatch it to the matching insert_* or handle_* function.
 *
 * The line is modified in place (the colon after the event name is
 * replaced by a NUL). Returns 0 for lines that were handled or
 * deliberately skipped and -1 for lines that are malformed or unknown;
 * for dispatched events the handler's return value is passed through.
 */
883 static int parse_line(char *line, uint len)
885 char *ptr, *colon;
886 int nvecs = 0;
887 struct string_code *sc;
/* highest timestamp seen so far; used to keep ltime from going backwards */
888 static time_t last_ltime = 0;
890 imported += len + 1; /* make up for 1 lost byte per newline */
892 /* ignore empty lines */
893 if (!len)
894 return 0;
896 if (++lines_since_progress >= PROGRESS_INTERVAL)
897 show_progress();
899 /* skip obviously bogus lines */
900 if (len < 12 || *line != '[') {
901 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
902 return -1;
905 ltime = strtoul(line + 1, &ptr, 10);
906 if (line + 1 == ptr) {
907 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
908 return -1;
/* clamp timestamps that go backwards to the latest one seen */
911 if (ltime < last_ltime) {
912 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
913 // ltime, last_ltime, last_ltime - ltime);
914 ltime = last_ltime;
916 else
917 last_ltime = ltime;
920 * Incremental will be 0 if not set, or 1 if set but
921 * the database is currently empty.
922 * Note that this will not always do the correct thing,
923 * as downtime entries that might have been scheduled for
924 * purging may never show up as "stopped" in the database
925 * with this scheme. As such, incremental imports absolutely
926 * require that nothing is in scheduled downtime when the
927 * import is running (well, started really, but it amounts
928 * to the same thing).
930 if (ltime < incremental)
931 return 0;
933 if (next_dt_purge && ltime >= next_dt_purge)
934 purge_expired_downtime();
/* the "ignore downtime" window closes one second after a daemon start */
936 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
937 probably_ignore_downtime = 0;
/* skip the closing bracket and spaces after the timestamp */
939 while (*ptr == ']' || *ptr == ' ')
940 ptr++;
942 if (!is_interesting(ptr))
943 return 0;
945 if (!(colon = strchr(ptr, ':'))) {
946 /* stupid heuristic, but might be good for something,
947 * somewhere, sometime. if nothing else, it should suppress
948 * annoying output */
949 if (is_start_event(ptr)) {
950 handle_start_event();
951 return 0;
953 if (is_stop_event(ptr)) {
954 handle_stop_event();
955 return 0;
959 * An unhandled event. We should probably crash here
961 handle_unknown_event(line);
962 return -1;
965 /* an event happened without us having gotten a start-event */
966 if (!daemon_is_running) {
967 insert_process_event(NEBTYPE_PROCESS_START);
968 daemon_start = ltime;
969 daemon_is_running = 1;
972 if (!(sc = get_event_type(ptr, colon - ptr))) {
973 handle_unknown_event(line);
974 return -1;
977 if (sc->code == IGNORE_LINE)
978 return 0;
/* terminate the event name and move on to the first field */
980 *colon = 0;
981 ptr = colon + 1;
982 while (*ptr == ' ')
983 ptr++;
985 if (sc->nvecs) {
986 int i;
988 nvecs = vectorize_string(ptr, sc->nvecs);
990 if (nvecs != sc->nvecs) {
991 /* broken line */
992 warn("Line %d in %s seems to not have all the fields it should",
993 line_no, cur_file->path);
994 return -1;
997 for (i = 0; i < sc->nvecs; i++) {
998 if (!strv[i]) {
999 /* this should never happen */
1000 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1001 line_no, cur_file->path);
1002 return -1;
1007 switch (sc->code) {
1008 char *semi_colon;
/*
 * External commands: the text up to the first ';' names the command and
 * sc is re-pointed at its command_codes entry.
 */
1010 case NEBTYPE_EXTERNALCOMMAND_END:
1011 semi_colon = strchr(ptr, ';');
1012 if (!semi_colon)
1013 return 0;
1014 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
1015 return 0;
1017 if (sc->code == RESTART_PROGRAM) {
1018 handle_stop_event();
1019 return 0;
/*
 * NOTE(review): a field-count mismatch is only warned about here and
 * the command is still processed -- confirm that is intended.
 */
1022 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
1023 if (nvecs != sc->nvecs) {
1024 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
1026 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
1027 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
1029 register_downtime_command(sc);
1030 } else {
1031 insert_acknowledgement(sc);
1033 break;
1035 case NEBTYPE_HOSTCHECK_PROCESSED:
1036 return insert_host_check(sc);
1038 case NEBTYPE_SERVICECHECK_PROCESSED:
1039 return insert_service_check(sc);
1041 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
1042 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
1043 return insert_downtime(sc);
1045 case NEBTYPE_NOTIFICATION_END + CONCERNS_HOST:
1046 case NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE:
1047 return insert_notification(sc);
1049 case IGNORE_LINE:
1050 return 0;
1053 return 0;
1056 static int parse_one_line(char *str, uint len)
1058 if (parse_line(str, len) && use_sql && sql_errno())
1059 crash("sql error: %s", sql_error());
1061 return 0;
1064 static int hash_one_line(char *line, uint len)
1066 return add_interesting_object(line);
1069 static int hash_interesting(const char *path)
1071 struct stat st;
1073 if (stat(path, &st) < 0)
1074 crash("failed to stat %s: %s", path, strerror(errno));
1076 lparse_path(path, st.st_size, hash_one_line);
1078 return 0;
/* Program name, used in the usage message. */
1081 extern const char *__progname;
/*
 * Entry point of the log importer.
 *
 * Parses command-line options, collects the Nagios log files to import
 * (from the command line and/or nagios.cfg), prepares the database when
 * SQL output is enabled, feeds every log file through parse_one_line()
 * and finally prints statistics and re-enables the table indexes.
 */
1082 int main(int argc, char **argv)
1084 int i, truncate_db = 0;
1085 const char *nagios_cfg = NULL;
1086 char *db_name = "monitor_reports";
1087 char *db_user = "monitor";
1088 char *db_pass = "monitor";
1089 char *db_table = "report_data";
/* progress output defaults to on only when stdout is a terminal */
1091 do_progress = isatty(fileno(stdout));
1093 strv = calloc(sizeof(char *), MAX_NVECS);
1094 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1095 if (!strv || !dentry)
1096 crash("Failed to alloc initial structs");
/*
 * Option parsing. An option value is taken either from "--opt=value"
 * or from the following argument, in which case that argument is
 * skipped as well.
 */
1099 for (num_nfile = 0,i = 1; i < argc; i++) {
1100 char *opt, *arg = argv[i];
1101 int arg_len, eq_opt = 0;
1103 if ((opt = strchr(arg, '='))) {
1104 *opt++ = '\0';
1105 eq_opt = 1;
1107 else if (i < argc - 1) {
1108 opt = argv[i + 1];
1111 if (!prefixcmp(arg, "--incremental")) {
1112 incremental = 1;
1113 continue;
1115 if (!prefixcmp(arg, "--no-sql")) {
1116 use_sql = 0;
1117 continue;
1119 if (!prefixcmp(arg, "--no-progress")) {
1120 do_progress = 0;
1121 continue;
1123 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1124 do_progress = 0;
1125 debug_level++;
1126 continue;
1128 if (!prefixcmp(arg, "--truncate-db")) {
1129 truncate_db = 1;
1130 continue;
1132 if (!prefixcmp(arg, "--nagios-cfg")) {
1133 if (!opt || !*opt) {
1134 crash("%s requires the path to nagios.cfg as argument", arg);
1136 nagios_cfg = opt;
1137 if (opt && !eq_opt)
1138 i++;
1139 continue;
1141 if (!prefixcmp(arg, "--db-name")) {
1142 if (!opt || !*opt)
1143 crash("%s requires a database name as an argument", arg);
1144 db_name = opt;
1145 if (opt && !eq_opt)
1146 i++;
1147 continue;
1149 if (!prefixcmp(arg, "--db-user")) {
1150 if (!opt || !*opt)
1151 crash("%s requires a database username as argument", arg);
1152 db_user = opt;
1153 if (opt && !eq_opt)
1154 i++;
1155 continue;
/* NOTE(review): the message below says "username" for --db-pass */
1157 if (!prefixcmp(arg, "--db-pass")) {
1158 if (!opt || !*opt)
1159 crash("%s requires a database username as argument", arg);
1160 db_pass = opt;
1161 if (opt && !eq_opt)
1162 i++;
1163 continue;
1165 if (!prefixcmp(arg, "--db-table")) {
1166 if (!opt || !*opt)
1167 crash("%s requires a database table name as argument", arg);
1168 db_table = opt;
1169 if (opt && !eq_opt)
1170 i++;
1171 continue;
1173 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1174 if (!opt || !*opt)
1175 crash("%s requires a filename as argument", arg);
1176 hash_interesting(opt);
1177 if (opt && !eq_opt)
1178 i++;
1179 continue;
1182 /* non-argument, so treat as a config- or log-file */
1183 arg_len = strlen(arg);
1184 if (arg_len >= 10 && !strcmp(&arg[arg_len - 10], "nagios.cfg")) {
1185 nagios_cfg = arg;
1186 } else {
1187 add_naglog_path(arg);
1191 /* fallback for op5 systems */
1192 if (!nagios_cfg && !num_nfile) {
1193 nagios_cfg = "/opt/monitor/etc/nagios.cfg";
/*
 * Pull the log file and log archive locations out of nagios.cfg.
 * NOTE(review): the result of cfg_parse_file() is dereferenced without
 * a NULL check -- confirm it cannot fail silently.
 */
1195 if (nagios_cfg) {
1196 struct cfg_comp *conf;
1197 conf = cfg_parse_file(nagios_cfg);
1198 for (i = 0; i < conf->vars; i++) {
1199 struct cfg_var *v = conf->vlist[i];
1200 if (!strcmp(v->key, "log_file")) {
1201 add_naglog_path(v->value);
1203 if (!strcmp(v->key, "log_archive_path")) {
1204 add_naglog_path(v->value);
1209 if (use_sql) {
1210 sql_config("db_database", db_name);
1211 sql_config("db_user", db_user);
1212 sql_config("db_pass", db_pass);
1213 sql_config("db_table", db_table);
1215 if (sql_init() < 0)
1216 crash("sql_init() failed");
1217 if (truncate_db)
1218 sql_query("TRUNCATE %s", sql_table_name());
/*
 * For incremental imports, replace the flag value with the newest
 * timestamp already stored, so parse_line() skips older lines.
 */
1220 if (incremental) {
1221 MYSQL_RES *result;
1222 MYSQL_ROW row;
1223 sql_query("SELECT timestamp FROM %s.%s ORDER BY timestamp DESC LIMIT 1",
1224 db_name, db_table);
1226 if (!(result = sql_get_result()))
1227 crash("Failed to get last timestamp: %s\n", sql_error());
1229 /* someone might use --incremental with an empty
1230 * database. We shouldn't crash in that case */
1231 if ((row = sql_fetch_row(result)))
1232 incremental = strtoul(row[0], NULL, 0);
1234 sql_free_result(result);
1237 * We lock the table we'll be working with and disable
1238 * indexes on it. Otherwise doing the actual inserts
1239 * will take just about forever, as MySQL has to update
1240 * and flush the index cache between each operation.
1242 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1243 crash("Failed to disable keys: %s", sql_error());
1244 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1245 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
1248 log_grok_var("logfile", "/dev/null");
1249 log_grok_var("log_levels", "warn");
1251 if (!num_nfile)
1252 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1253 __progname);
1255 if (log_init() < 0)
1256 crash("log_init() failed");
/* sort the collected log files with nfile_cmp before importing them */
1258 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1260 host_downtime = hash_init(HASH_TABLE_SIZE);
1261 service_downtime = hash_init(HASH_TABLE_SIZE);
1263 if (hook_init() < 0)
1264 crash("Failed to initialize hooks");
1266 /* go through them once to count the total size for progress output */
1267 for (i = 0; i < num_nfile; i++) {
1268 totsize += nfile[i].size;
1271 gettimeofday(&import_start, NULL);
1272 printf("Importing %s of data from %d files\n",
1273 tobytes(totsize), num_nfile);
/* the import proper: one pass over every file, line by line */
1275 for (i = 0; i < num_nfile; i++) {
1276 struct naglog_file *nf = &nfile[i];
1277 cur_file = nf;
1278 show_progress();
1279 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1280 line_no = 0;
1281 lparse_path(nf->path, nf->size, parse_one_line);
1282 imported++; /* make up for one lost byte per file */
1285 end_progress();
/* downtime and hash table statistics, printed in debug mode */
1287 if (debug_level) {
1288 if (dt_depth) {
1289 printf("Unclosed host downtimes:\n");
1290 puts("------------------------");
1291 hash_walk_data(host_downtime, print_downtime);
1292 printf("Unclosed service downtimes:\n");
1293 puts("---------------------------");
1294 hash_walk_data(service_downtime, print_downtime);
1296 printf("dt_depth: %d\n", dt_depth);
1298 printf("purged downtimes: %d\n", purged_downtimes);
1299 printf("max simultaneous host downtime hashes: %u\n",
1300 hash_get_max_entries(host_downtime));
1301 printf("max simultaneous service downtime hashes: %u\n",
1302 hash_get_max_entries(service_downtime));
1303 printf("max downtime depth: %u\n", max_dt_depth);
/*
 * Unlock the table and rebuild its indexes; the highest id is used to
 * estimate how long that will take.
 * NOTE(review): row is dereferenced without a NULL check although
 * sql_fetch_row() can evidently return NULL for an empty table (see
 * the incremental branch above). This block also uses SQL_RESULT /
 * SQL_ROW where the earlier one uses MYSQL_RES / MYSQL_ROW.
 */
1306 if (use_sql) {
1307 SQL_RESULT *res;
1308 SQL_ROW row;
1309 time_t start;
1310 unsigned long entries;
1312 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
1313 if (!(res = sql_get_result()))
1314 entries = 0;
1315 else {
1316 row = sql_fetch_row(res);
1317 entries = strtoul(row[0], NULL, 0);
1318 sql_free_result(res);
/* ignore Ctrl-C from here on, while the indexes are rebuilt */
1321 signal(SIGINT, SIG_IGN);
1322 sql_query("UNLOCK TABLES");
1323 start = time(NULL);
1324 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1325 (entries / 50000) + 1);
1326 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1327 printf("%lu database entries indexed in %lu seconds\n",
1328 entries, time(NULL) - start);
1329 sql_close();
1332 if (warnings && debug_level)
1333 fprintf(stderr, "Total warnings: %d\n", warnings);
1335 if (debug_level || dt_start != dt_stop)
1336 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n",
1337 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop);
1338 if (hash_check_table(host_downtime))
1339 fprintf(stderr, "Hash table inconsistencies for host_downtime\n");
1340 if (hash_check_table(service_downtime))
1341 fprintf(stderr, "Hash table inconsistencies for service_downtime\n");
1343 print_unhandled_events();
1345 return 0;