showlog: Remove debugging output
[nagios-reports-module.git] / import.c
blob81c0de7db039b4adedcdb655179c8d0006e12c96
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include "nagios/broker.h"
6 #include "nagios/nebcallbacks.h"
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
13 #include "cfgfile.h"
15 #define IGNORE_LINE 0
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
20 #define MAX_NVECS 16
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
24 #define SERVICE_OK 0
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
32 static int only_notifications;
33 static uint64_t imported, totsize, totlines;
34 static int lines_since_progress, do_progress;
35 static struct timeval import_start;
36 static time_t daemon_start, daemon_stop, incremental;
37 static int daemon_is_running;
38 static uint max_dt_depth, skipped_files;
40 static time_t next_dt_purge; /* when next to purge expired downtime */
41 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
43 static time_t ltime; /* the timestamp from the current log-line */
45 static int dt_start, dt_stop, dt_skip;
46 #define dt_depth (dt_start - dt_stop)
47 static hash_table *host_downtime;
48 static hash_table *service_downtime;
49 static int downtime_id;
50 static time_t probably_ignore_downtime;
/*
 * One downtime record. The same struct is used both for downtime
 * commands stashed in the dentry[] table and for active downtimes
 * kept in the host_downtime/service_downtime hash tables.
 */
52 struct downtime_entry {
53 int id; /* downtime id; set from downtime_id in add_downtime() */
54 int code; /* external command code that scheduled this downtime */
55 char *host; /* strdup()'d host name, may be NULL for group commands */
56 char *service; /* strdup()'d service description, NULL for host downtime */
57 time_t start; /* scheduled start, parsed from the command */
58 time_t stop; /* scheduled end, parsed from the command */
59 int fixed; /* "fixed" field of the command (digit converted to int) */
60 time_t duration; /* duration in seconds */
61 time_t started; /* log timestamp at which the downtime was added */
62 time_t ended; /* not referenced in the visible part of this file */
63 int purged; /* set to 1 by remove_downtime() */
64 int trigger; /* non-zero when the command named a triggering downtime */
65 int slot; /* index into dentry[] (start % NUM_DENTRIES) */
66 struct downtime_entry *next; /* next entry in the same dentry[] slot */
69 #define NUM_DENTRIES 1024
70 static struct downtime_entry **dentry;
71 static time_t last_downtime_start;
/*
 * Log-line event types this importer knows about. Entries created with
 * add_ignored() are recognised but skipped. For add_code() entries the
 * first argument appears to be the number of ';'-separated fields the
 * line is split into (it is compared against sc->nvecs when parsing)
 * and the last argument is the code parse_line() switches on.
 * The table is terminated by an all-zero entry.
 */
73 static struct string_code event_codes[] = {
74 add_ignored("Error"),
75 add_ignored("Warning"),
76 add_ignored("LOG ROTATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_HOST),
84 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE),
85 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
86 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
87 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
88 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
90 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
91 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
93 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
94 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
95 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
96 { 0, NULL, 0, 0 },
/*
 * External commands the importer acts on: downtime deletion,
 * downtime scheduling and problem acknowledgement. The first
 * add_cdef() argument appears to be the number of fields the command
 * arguments are split into (presumably the same meaning as in
 * event_codes; confirm against add_cdef()'s definition).
 * The table is terminated by an all-zero entry.
 */
99 static struct string_code command_codes[] = {
100 add_cdef(1, DEL_HOST_DOWNTIME),
101 add_cdef(1, DEL_SVC_DOWNTIME),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
106 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
107 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
109 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
110 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
113 * These really have one more field than listed here. We omit one
114 * to make author and comment concatenated with a semi-colon by default.
116 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
117 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
118 { 0, NULL, 0, 0 },
/* Debug helper: print the first n strings of vector v, one per line. */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a human readable string ("512 bytes",
 * "1.50 KiB", "3.20 GiB", ...).
 *
 * The result lives in one of two static buffers that are used
 * alternately, so the strings from two consecutive calls can be used
 * in the same expression. A third call overwrites the first result.
 *
 * Values of 1024 PiB and above are still expressed in PiB, since that
 * is the largest suffix available.
 */
static const char *tobytes(uint64_t n)
{
	/* an array (not a pointer) so that sizeof gives the string size */
	static const char suffix[] = "KMGTP";
	static char tbuf[2][30];
	static int t = 0;
	unsigned int shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%llu bytes",
		         (unsigned long long)n);
		return tbuf[t];
	}

	/* pick the largest unit that leaves at least 1.0 in front of it */
	while ((n >> (shift * 10)) >= 1024 && shift < sizeof(suffix) - 1)
		shift++;

	/* 64-bit shift: a plain int overflows from 1 TiB and up */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
	         (double)n / (double)(1ULL << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
153 static void show_progress(void)
155 time_t eta, elapsed;
156 float pct_done;
158 totlines += lines_since_progress;
159 lines_since_progress = 0;
161 if (!do_progress)
162 return;
164 elapsed = time(NULL) - import_start.tv_sec;
165 if (!elapsed)
166 elapsed = 1;
168 pct_done = ((float)imported / (float)totsize) * 100;
169 eta = (elapsed / pct_done) * (100.0 - pct_done);
171 printf("\rImporting data: %.2f%% (%s) done ",
172 pct_done, tobytes(imported));
173 if (elapsed > 10) {
174 printf("ETA: ");
175 if (eta > 60)
176 printf("%lum%lus", eta / 60, eta % 60);
177 else
178 printf("%lus", eta);
180 printf(" ");
183 static void end_progress(void)
185 struct timeval tv;
186 int mins;
187 float secs;
189 gettimeofday(&tv, NULL);
192 * If any of the logfiles doesn't have a newline
193 * at end of file, imported will be slightly off.
194 * We set it hard here so as to make sure that
195 * the final progress output stops at exactly 100%
197 imported = totsize;
199 show_progress();
200 putchar('\n');
201 secs = (tv.tv_sec - import_start.tv_sec) * 1000000;
202 secs += tv.tv_usec - import_start.tv_usec;
203 mins = (tv.tv_sec - import_start.tv_sec) / 60;
204 secs /= 1000000;
205 secs -= (mins * 60);
206 printf("%s, %llu lines imported in ", tobytes(totsize), totlines);
207 if (mins)
208 printf("%dm ", mins);
209 printf("%.3fs. %u files skipped\n", secs, skipped_files);
212 static int use_sql = 1;
213 static int insert_downtime_event(int type, char *host, char *service, int id)
215 nebstruct_downtime_data ds;
216 int result;
218 if (!is_interesting_service(host, service))
219 return 0;
221 dt_start += type == NEBTYPE_DOWNTIME_START;
222 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
223 if (dt_depth > max_dt_depth)
224 max_dt_depth = dt_depth;
226 if (!use_sql || only_notifications)
227 return 0;
229 memset(&ds, 0, sizeof(ds));
231 ds.type = type;
232 ds.timestamp.tv_sec = ltime;
233 ds.host_name = host;
234 ds.service_description = service;
235 ds.downtime_id = id;
237 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
238 if (result < 0)
239 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
240 type, host, service, id);
242 return result;
245 typedef struct import_notification {
246 int type, reason, state;
247 } import_notification;
249 static int parse_import_notification(char *str, import_notification *n)
251 char *state_str = str;
253 n->reason = parse_notification_reason(str);
254 if (n->reason != NOTIFICATION_NORMAL) {
255 char *space, *paren;
257 space = strchr(str, ' ');
258 if (!space)
259 return -1;
260 paren = strchr(space, ')');
261 if (!paren)
262 return -1;
263 *paren = '\0';
265 state_str = space + 2;
268 n->type = SERVICE_NOTIFICATION;
269 n->state = parse_service_state_gently(state_str);
270 if (n->state < 0) {
271 n->type = HOST_NOTIFICATION;
272 n->state = parse_host_state_gently(state_str);
275 return 0;
278 static int insert_notification(struct string_code *sc)
280 int base_idx;
281 const char *desc;
282 struct import_notification n;
284 if (!only_notifications)
285 return 0;
287 if (sc->code - NEBTYPE_NOTIFICATION_END == CONCERNS_SERVICE) {
288 base_idx = 1;
289 desc = strv[2];
290 } else {
291 base_idx = 0;
292 desc = 0;
294 if (parse_import_notification(strv[base_idx + 2], &n) < 0) {
295 handle_unknown_event(strv[base_idx + 2]);
296 return 0;
299 if (!use_sql)
300 return 0;
302 return sql_query
303 ("INSERT INTO %s.%s("
304 "notification_type, start_time, end_time, contact_name, "
305 "host_name, service_description, "
306 "command_name, output, "
307 "state, reason_type) "
308 "VALUES("
309 "%d, %lu, %lu, '%s', "
310 "'%s', '%s', "
311 "'%s', '%s', "
312 "%d, %d)",
313 sql_db_name(), sql_table_name(),
314 n.type, ltime, ltime, sql_escape(strv[0]),
315 sql_escape(strv[1]), desc ? sql_escape(desc) : "",
316 sql_escape(strv[base_idx + 3]), sql_escape(strv[base_idx + 4]),
317 n.state, n.reason);
320 static int insert_service_check(struct string_code *sc)
322 nebstruct_service_check_data ds;
324 if (!is_interesting_service(strv[0], strv[1]))
325 return 0;
327 memset(&ds, 0, sizeof(ds));
329 ds.timestamp.tv_sec = ltime;
330 ds.type = sc->code;
331 ds.host_name = strv[0];
332 ds.service_description = strv[1];
333 if (sc->nvecs == 4) {
334 /* passive service check result */
335 if (*strv[2] >= '0' && *strv[2] <= '9')
336 ds.state = atoi(strv[2]);
337 else
338 ds.state = parse_service_state(strv[2]);
339 ds.state_type = HARD_STATE;
340 ds.current_attempt = 1;
341 ds.output = strv[3];
342 } else {
343 ds.state = parse_service_state(strv[2]);
344 ds.state_type = soft_hard(strv[3]);
345 ds.current_attempt = atoi(strv[4]);
346 ds.output = strv[5];
349 if (!use_sql || only_notifications)
350 return 0;
352 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
355 static int insert_host_check(struct string_code *sc)
357 nebstruct_host_check_data ds;
359 if (!is_interesting_host(strv[0]))
360 return 0;
362 memset(&ds, 0, sizeof(ds));
364 ds.timestamp.tv_sec = ltime;
365 ds.type = sc->code;
366 ds.host_name = strv[0];
367 if (sc->nvecs == 3) {
368 if (*strv[1] >= '0' && *strv[1] <= '9')
369 ds.state = atoi(strv[1]);
370 else
371 ds.state = parse_host_state(strv[1]);
372 /* passive host check result */
373 ds.output = strv[2];
374 ds.current_attempt = 1;
375 ds.state_type = HARD_STATE;
376 } else {
377 ds.state = parse_host_state(strv[1]);
378 ds.state_type = soft_hard(strv[2]);
379 ds.current_attempt = atoi(strv[3]);
380 ds.output = strv[4];
383 if (!use_sql || only_notifications)
384 return 0;
386 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
389 static int insert_process_event(int type)
391 nebstruct_process_data ds;
393 if (!use_sql || only_notifications)
394 return 0;
396 memset(&ds, 0, sizeof(ds));
397 ds.timestamp.tv_sec = ltime;
398 ds.type = type;
399 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/* Placeholder: acknowledgements are recognised but not imported. */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc;

	return 0;
}
407 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
409 if (!debug_level)
410 return;
412 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
413 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
414 printf("%s", dt->host);
415 if (dt->service)
416 printf(";%s", dt->service);
417 putchar('\n');
420 static struct downtime_entry *last_dte;
421 static struct downtime_entry *del_dte;
423 static void remove_downtime(struct downtime_entry *dt);
424 static int del_matching_dt(void *data)
426 struct downtime_entry *dt = data;
428 if (del_dte->id == dt->id) {
429 dt_print("ALSO", 0, dt);
430 remove_downtime(dt);
431 return HASH_WALK_REMOVE;
434 return 0;
437 static void stash_downtime_command(struct downtime_entry *dt)
439 dt->slot = dt->start % NUM_DENTRIES;
440 dt->next = dentry[dt->slot];
441 dentry[dt->slot] = dt;
444 static void remove_downtime(struct downtime_entry *dt)
446 if (!is_interesting_service(dt->host, dt->service))
447 return;
449 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
451 dt_print("RM_DT", ltime, dt);
452 dt->purged = 1;
455 static struct downtime_entry *
456 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
458 for (; dt; dt = dt->next) {
459 time_t diff;
461 if (ltime > dt->stop || ltime < dt->start) {
462 continue;
465 switch (dt->code) {
466 case SCHEDULE_SVC_DOWNTIME:
467 if (service && strcmp(service, dt->service))
468 continue;
470 /* fallthrough */
471 case SCHEDULE_HOST_DOWNTIME:
472 case SCHEDULE_HOST_SVC_DOWNTIME:
473 if (strcmp(host, dt->host)) {
474 continue;
477 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
478 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
479 /* these two have host set in dt, but
480 * it will not match all the possible hosts */
482 /* fallthrough */
483 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
484 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
485 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
486 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
487 break;
488 default:
489 crash("dt->code not set properly\n");
493 * Once we get here all the various other criteria have
494 * been matched, so we need to check if the daemon was
495 * running when this downtime was supposed to have
496 * started, and otherwise use the daemon start time
497 * as the value to diff against
499 if (daemon_stop < dt->start && daemon_start > dt->start) {
500 debug("Adjusting dt->start (%lu) to (%lu)\n",
501 dt->start, daemon_start);
502 dt->start = daemon_start;
503 if (dt->trigger && dt->duration)
504 dt->stop = dt->start + dt->duration;
507 diff = ltime - dt->start;
508 if (diff < 3 || dt->trigger || !dt->fixed)
509 return dt;
512 return NULL;
515 static struct downtime_entry *
516 find_downtime_command(char *host, char *service)
518 int i;
519 struct downtime_entry *shortcut = NULL;
521 if (last_dte && last_dte->start == ltime) {
522 shortcut = last_dte;
523 // return last_dte;
525 for (i = 0; i < NUM_DENTRIES; i++) {
526 struct downtime_entry *dt;
527 dt = dt_matches_command(dentry[i], host, service);
528 if (dt) {
529 if (shortcut && dt != shortcut)
530 if (debug_level)
531 printf("FIND shortcut no good\n");
532 last_dte = dt;
533 return dt;
537 debug("FIND not\n");
538 return NULL;
541 static int print_downtime(void *data)
543 struct downtime_entry *dt = (struct downtime_entry *)data;
545 dt_print("UNCLOSED", ltime, dt);
547 return 0;
550 static inline void set_next_dt_purge(time_t base, time_t add)
552 if (!next_dt_purge || next_dt_purge > base + add)
553 next_dt_purge = base + add;
555 if (next_dt_purge <= ltime)
556 next_dt_purge = ltime + 1;
559 static inline void add_downtime(char *host, char *service, int id)
561 struct downtime_entry *dt, *cmd, *old;
563 if (!is_interesting_service(host, service))
564 return;
566 dt = malloc(sizeof(*dt));
567 cmd = find_downtime_command(host, service);
568 if (!cmd) {
569 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
570 memset(dt, 0, sizeof(*dt));
571 dt->duration = 7200; /* the default downtime duration in nagios */
572 dt->start = ltime;
573 dt->stop = dt->start + dt->duration;
575 else
576 memcpy(dt, cmd, sizeof(*dt));
578 dt->host = strdup(host);
579 dt->id = id;
580 dt->started = ltime;
582 set_next_dt_purge(ltime, dt->duration);
584 if (!service) {
585 dt->service = NULL;
586 old = hash_update(host_downtime, dt->host, dt);
588 else {
589 dt->service = strdup(service);
590 old = hash_update2(service_downtime, dt->host, dt->service, dt);
593 if (old && old != dt) {
594 free(old->host);
595 if (old->service)
596 free(old->service);
597 free(old);
600 dt_print("IN_DT", ltime, dt);
601 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
604 static time_t last_host_dt_del, last_svc_dt_del;
605 static int register_downtime_command(struct string_code *sc)
607 struct downtime_entry *dt;
608 char *start_time, *end_time, *duration = NULL;
609 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
610 time_t foo;
612 switch (sc->code) {
613 case DEL_HOST_DOWNTIME:
614 last_host_dt_del = ltime;
615 return 0;
616 case DEL_SVC_DOWNTIME:
617 last_svc_dt_del = ltime;
618 return 0;
620 case SCHEDULE_HOST_DOWNTIME:
621 if (strtotimet(strv[5], &foo))
622 duration = strv[4];
623 /* fallthrough */
624 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
625 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
626 case SCHEDULE_HOST_SVC_DOWNTIME:
627 host = strv[0];
628 /* fallthrough */
629 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
630 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
631 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
632 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
633 start_time = strv[1];
634 end_time = strv[2];
635 fixed = strv[3];
636 if (strtotimet(strv[5], &foo))
637 triggered_by = strv[4];
638 if (!duration)
639 duration = strv[5];
641 break;
643 case SCHEDULE_SVC_DOWNTIME:
644 host = strv[0];
645 service = strv[1];
646 start_time = strv[2];
647 end_time = strv[3];
648 fixed = strv[4];
649 if (strtotimet(strv[6], &foo)) {
650 triggered_by = strv[5];
651 duration = strv[6];
653 else {
654 duration = strv[5];
656 break;
658 default:
659 crash("Unknown downtime type: %d", sc->code);
662 if (!(dt = calloc(sizeof(*dt), 1)))
663 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
665 dt->code = sc->code;
666 if (host)
667 dt->host = strdup(host);
668 if (service)
669 dt->service = strdup(service);
671 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
672 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
674 print_strvec(strv, sc->nvecs);
675 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
676 command_codes[sc->code - 1].str, start_time, end_time, duration);
680 * sometimes downtime commands can be logged according to
681 * log version 1, while the log still claims to be version 2.
682 * Apparently, this happens when using a daemon supporting
683 * version 2 logging but a downtime command is added that
684 * follows the version 1 standard.
685 * As such, we simply ignore the result of the "duration"
686 * field conversion and just accept that it might not work
688 (void)strtotimet(duration, &dt->duration);
689 dt->fixed = *fixed - '0';
692 * ignore downtime scheduled to take place in the future.
693 * It will be picked up by the module anyways
695 if (dt->start > time(NULL)) {
696 free(dt);
697 return 0;
700 if (dt->duration > time(NULL)) {
701 warn("Bizarrely large duration (%lu)", dt->duration);
703 if (dt->start < ltime) {
704 if (dt->duration && dt->duration > ltime - dt->start)
705 dt->duration -= ltime - dt->start;
707 dt->start = ltime;
709 if (dt->stop < ltime || dt->stop < dt->start) {
710 /* retroactively scheduled downtime, or just plain wrong */
711 dt->stop = dt->start;
712 dt->duration = 0;
715 if (dt->fixed && dt->duration != dt->stop - dt->start) {
716 // warn("duration doesn't match stop - start: (%lu : %lu)",
717 // dt->duration, dt->stop - dt->start);
719 dt->duration = dt->stop - dt->start;
721 else if (dt->duration > 86400 * 14) {
722 warn("Oddly long duration: %lu", dt->duration);
725 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
726 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
728 stash_downtime_command(dt);
729 return 0;
732 static int insert_downtime(struct string_code *sc)
734 int type;
735 struct downtime_entry *dt = NULL;
736 int id = 0;
737 time_t dt_del_cmd;
738 char *host, *service = NULL;
740 host = strv[0];
741 if (sc->nvecs == 4) {
742 service = strv[1];
743 dt = hash_find2(service_downtime, host, service);
745 else
746 dt = hash_find(host_downtime, host);
749 * to stop a downtime we can either get STOPPED or
750 * CANCELLED. So far, I've only ever seen STARTED
751 * for when it actually starts though, and since
752 * the Nagios daemon is reponsible for launching
753 * it, it's unlikely there are more variants of
754 * that string
756 type = NEBTYPE_DOWNTIME_STOP;
757 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
758 type = NEBTYPE_DOWNTIME_START;
760 switch (type) {
761 case NEBTYPE_DOWNTIME_START:
762 if (dt) {
763 if (!probably_ignore_downtime)
764 dt_print("ALRDY", ltime, dt);
765 return 0;
768 if (probably_ignore_downtime)
769 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
770 probably_ignore_downtime, ltime, host, service);
772 if (ltime - last_downtime_start > 1)
773 downtime_id++;
775 id = downtime_id;
776 add_downtime(host, service, id);
777 last_downtime_start = ltime;
778 break;
780 case NEBTYPE_DOWNTIME_STOP:
781 if (!dt) {
783 * this can happen when overlapping downtime entries
784 * occur, and the start event for the second (or nth)
785 * downtime starts before the first downtime has had
786 * a stop event. It basically means we've almost
787 * certainly done something wrong.
789 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
790 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
791 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
792 dt_skip++;
793 return 0;
796 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
798 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
799 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
800 ltime - dt->started, dt->host, dt->service, dt->duration);
802 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
803 dt_print("Long", ltime, dt);
805 remove_downtime(dt);
807 * Now delete whatever matching downtimes we can find.
808 * this must be here, or we'll recurse like crazy into
809 * remove_downtime(), possibly exhausting the stack
810 * frame buffer
812 del_dte = dt;
813 if (!dt->service)
814 hash_walk_data(host_downtime, del_matching_dt);
815 else
816 hash_walk_data(service_downtime, del_matching_dt);
817 break;
819 default:
820 return -1;
823 return 0;
826 static int dt_purged;
827 static int purge_expired_dt(void *data)
829 struct downtime_entry *dt = data;
831 if (dt->purged) {
832 dt_skip++;
833 return 0;
836 set_next_dt_purge(dt->started, dt->duration);
838 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
839 dt_purged++;
840 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
841 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
842 remove_downtime(dt);
843 return HASH_WALK_REMOVE;
845 else {
846 dt_print("PURGED_NOT_TIME", ltime, dt);
849 return 0;
852 static int purged_downtimes;
853 static void purge_expired_downtime(void)
855 int tot_purged = 0;
857 next_dt_purge = 0;
858 dt_purged = 0;
859 hash_walk_data(host_downtime, purge_expired_dt);
860 if (dt_purged)
861 debug("PURGE %d host downtimes purged", dt_purged);
862 tot_purged += dt_purged;
863 dt_purged = 0;
864 hash_walk_data(service_downtime, purge_expired_dt);
865 if (dt_purged)
866 debug("PURGE %d service downtimes purged", dt_purged);
867 tot_purged += dt_purged;
868 if (tot_purged)
869 debug("PURGE total %d entries purged", tot_purged);
871 if (next_dt_purge)
872 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
873 next_dt_purge, next_dt_purge - ltime);
875 purged_downtimes += tot_purged;
878 static inline void handle_start_event(void)
880 if (!daemon_is_running)
881 insert_process_event(NEBTYPE_PROCESS_START);
883 probably_ignore_downtime = daemon_start = ltime;
884 daemon_is_running = 1;
887 static inline void handle_stop_event(void)
889 if (daemon_is_running) {
890 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
891 daemon_is_running = 0;
893 daemon_stop = ltime;
896 static int parse_line(char *line, uint len)
898 char *ptr, *colon;
899 int nvecs = 0;
900 struct string_code *sc;
901 static time_t last_ltime = 0;
903 imported += len + 1; /* make up for 1 lost byte per newline */
905 /* ignore empty lines */
906 if (!len)
907 return 0;
909 if (++lines_since_progress >= PROGRESS_INTERVAL)
910 show_progress();
912 /* skip obviously bogus lines */
913 if (len < 12 || *line != '[') {
914 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
915 return -1;
918 ltime = strtoul(line + 1, &ptr, 10);
919 if (line + 1 == ptr) {
920 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
921 return -1;
924 if (ltime < last_ltime) {
925 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
926 // ltime, last_ltime, last_ltime - ltime);
927 ltime = last_ltime;
929 else
930 last_ltime = ltime;
933 * Incremental will be 0 if not set, or 1 if set but
934 * the database is currently empty.
935 * Note that this will not always do the correct thing,
936 * as downtime entries that might have been scheduled for
937 * purging may never show up as "stopped" in the database
938 * with this scheme. As such, incremental imports absolutely
939 * require that nothing is in scheduled downtime when the
940 * import is running (well, started really, but it amounts
941 * to the same thing).
943 if (ltime < incremental)
944 return 0;
946 if (next_dt_purge && ltime >= next_dt_purge)
947 purge_expired_downtime();
949 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
950 probably_ignore_downtime = 0;
952 while (*ptr == ']' || *ptr == ' ')
953 ptr++;
955 if (!is_interesting(ptr))
956 return 0;
958 if (!(colon = strchr(ptr, ':'))) {
959 /* stupid heuristic, but might be good for something,
960 * somewhere, sometime. if nothing else, it should suppress
961 * annoying output */
962 if (is_start_event(ptr)) {
963 handle_start_event();
964 return 0;
966 if (is_stop_event(ptr)) {
967 handle_stop_event();
968 return 0;
972 * An unhandled event. We should probably crash here
974 handle_unknown_event(line);
975 return -1;
978 /* an event happened without us having gotten a start-event */
979 if (!daemon_is_running) {
980 insert_process_event(NEBTYPE_PROCESS_START);
981 daemon_start = ltime;
982 daemon_is_running = 1;
985 if (!(sc = get_event_type(ptr, colon - ptr))) {
986 handle_unknown_event(line);
987 return -1;
990 if (sc->code == IGNORE_LINE)
991 return 0;
993 *colon = 0;
994 ptr = colon + 1;
995 while (*ptr == ' ')
996 ptr++;
998 if (sc->nvecs) {
999 int i;
1001 nvecs = vectorize_string(ptr, sc->nvecs);
1003 if (nvecs != sc->nvecs) {
1004 /* broken line */
1005 warn("Line %d in %s seems to not have all the fields it should",
1006 line_no, cur_file->path);
1007 return -1;
1010 for (i = 0; i < sc->nvecs; i++) {
1011 if (!strv[i]) {
1012 /* this should never happen */
1013 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1014 line_no, cur_file->path);
1015 return -1;
1020 switch (sc->code) {
1021 char *semi_colon;
1023 case NEBTYPE_EXTERNALCOMMAND_END:
1024 semi_colon = strchr(ptr, ';');
1025 if (!semi_colon)
1026 return 0;
1027 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
1028 return 0;
1030 if (sc->code == RESTART_PROGRAM) {
1031 handle_stop_event();
1032 return 0;
1035 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
1036 if (nvecs != sc->nvecs) {
1037 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
1039 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
1040 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
1042 register_downtime_command(sc);
1043 } else {
1044 insert_acknowledgement(sc);
1046 break;
1048 case NEBTYPE_HOSTCHECK_PROCESSED:
1049 return insert_host_check(sc);
1051 case NEBTYPE_SERVICECHECK_PROCESSED:
1052 return insert_service_check(sc);
1054 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
1055 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
1056 return insert_downtime(sc);
1058 case NEBTYPE_NOTIFICATION_END + CONCERNS_HOST:
1059 case NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE:
1060 return insert_notification(sc);
1062 case IGNORE_LINE:
1063 return 0;
1066 return 0;
1069 static int parse_one_line(char *str, uint len)
1071 if (parse_line(str, len) && use_sql && sql_errno())
1072 crash("sql error: %s", sql_error());
1074 return 0;
1077 static int hash_one_line(char *line, uint len)
1079 return add_interesting_object(line);
1082 static int hash_interesting(const char *path)
1084 struct stat st;
1086 if (stat(path, &st) < 0)
1087 crash("failed to stat %s: %s", path, strerror(errno));
1089 lparse_path(path, st.st_size, hash_one_line);
1091 return 0;
1094 extern const char *__progname;
1095 __attribute__((__format__(__printf__, 1, 2)))
1096 static void usage(const char *fmt, ...)
1098 if (fmt && *fmt) {
1099 va_list ap;
1101 va_start(ap, fmt);
1102 vfprintf(stdout, fmt, ap);
1103 va_end(ap);
1106 printf("Usage %s [options] [logfiles]\n\n", __progname);
1107 printf(" [logfiles] refers to all the nagios logfiles you want to import\n");
1108 printf(" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n");
1109 printf("\nOptions:\n");
1110 printf(" --help this cruft\n");
1111 printf(" --no-progress don't display progress output\n");
1112 printf(" --no-sql don't access the database\n");
1113 printf(" --db-name database name\n");
1114 printf(" --db-table database table name\n");
1115 printf(" --db-user database user\n");
1116 printf(" --db-pass database password\n");
1117 printf(" --incremental perform an incremental import\n");
1118 printf(" --truncate-db truncate database before importing\n");
1119 printf(" --only-notifications only import notifications\n");
1120 printf(" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n");
1121 printf("\n\n");
1123 if (fmt && *fmt)
1124 exit(1);
1126 exit(0);
1129 int main(int argc, char **argv)
1131 int i, truncate_db = 0;
1132 const char *nagios_cfg = NULL;
1133 char *db_name, *db_user, *db_pass, *db_table;
1135 db_name = db_user = db_pass = db_table = NULL;
1137 do_progress = isatty(fileno(stdout));
1139 strv = calloc(sizeof(char *), MAX_NVECS);
1140 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1141 if (!strv || !dentry)
1142 crash("Failed to alloc initial structs");
1145 for (num_nfile = 0,i = 1; i < argc; i++) {
1146 char *opt, *arg = argv[i];
1147 int arg_len, eq_opt = 0;
1149 if ((opt = strchr(arg, '='))) {
1150 *opt++ = '\0';
1151 eq_opt = 1;
1153 else if (i < argc - 1) {
1154 opt = argv[i + 1];
1157 if (!prefixcmp(arg, "-h") || !prefixcmp(arg, "--help")) {
1158 usage(NULL);
1160 if (!prefixcmp(arg, "--incremental")) {
1161 incremental = 1;
1164 * nifty for debugging --incremental skipping log-files
1165 * The value will be overwritten unless --no-sql is also
1166 * in effect
1168 if (eq_opt) {
1169 incremental = strtoul(opt, NULL, 0);
1170 if (!incremental)
1171 usage("--incremental= requires a parameter");
1173 continue;
1175 if (!prefixcmp(arg, "--no-sql")) {
1176 use_sql = 0;
1177 continue;
1179 if (!prefixcmp(arg, "--only-notifications")) {
1180 only_notifications = 1;
1181 db_name = db_name ? db_name : "merlin";
1182 db_user = db_user ? db_user : "merlin";
1183 db_pass = db_pass ? db_pass : "merlin";
1184 db_table = db_table ? db_table : "notification";
1185 continue;
1187 if (!prefixcmp(arg, "--no-progress")) {
1188 do_progress = 0;
1189 continue;
1191 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1192 do_progress = 0;
1193 debug_level++;
1194 continue;
1196 if (!prefixcmp(arg, "--truncate-db")) {
1197 truncate_db = 1;
1198 continue;
1200 if (!prefixcmp(arg, "--nagios-cfg")) {
1201 if (!opt || !*opt) {
1202 crash("%s requires the path to nagios.cfg as argument", arg);
1204 nagios_cfg = opt;
1205 if (opt && !eq_opt)
1206 i++;
1207 continue;
1209 if (!prefixcmp(arg, "--db-name")) {
1210 if (!opt || !*opt)
1211 crash("%s requires a database name as an argument", arg);
1212 db_name = opt;
1213 if (opt && !eq_opt)
1214 i++;
1215 continue;
1217 if (!prefixcmp(arg, "--db-user")) {
1218 if (!opt || !*opt)
1219 crash("%s requires a database username as argument", arg);
1220 db_user = opt;
1221 if (opt && !eq_opt)
1222 i++;
1223 continue;
1225 if (!prefixcmp(arg, "--db-pass")) {
1226 if (!opt || !*opt)
1227 crash("%s requires a database username as argument", arg);
1228 db_pass = opt;
1229 if (opt && !eq_opt)
1230 i++;
1231 continue;
1233 if (!prefixcmp(arg, "--db-table")) {
1234 if (!opt || !*opt)
1235 crash("%s requires a database table name as argument", arg);
1236 db_table = opt;
1237 if (opt && !eq_opt)
1238 i++;
1239 continue;
1241 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1242 if (!opt || !*opt)
1243 crash("%s requires a filename as argument", arg);
1244 hash_interesting(opt);
1245 if (opt && !eq_opt)
1246 i++;
1247 continue;
1250 /* non-argument, so treat as a config- or log-file */
1251 arg_len = strlen(arg);
1252 if (arg_len >= 10 && !strcmp(&arg[arg_len - 10], "nagios.cfg")) {
1253 nagios_cfg = arg;
1254 } else {
1255 add_naglog_path(arg);
1259 /* fallback for op5 systems */
1260 if (!nagios_cfg && !num_nfile) {
1261 nagios_cfg = "/opt/monitor/etc/nagios.cfg";
1263 if (nagios_cfg) {
1264 struct cfg_comp *conf;
1265 conf = cfg_parse_file(nagios_cfg);
1266 for (i = 0; i < conf->vars; i++) {
1267 struct cfg_var *v = conf->vlist[i];
1268 if (!strcmp(v->key, "log_file")) {
1269 add_naglog_path(v->value);
1271 if (!strcmp(v->key, "log_archive_path")) {
1272 add_naglog_path(v->value);
1277 if (use_sql) {
1278 db_name = db_name ? db_name : "monitor_reports";
1279 db_user = db_user ? db_user : "monitor";
1280 db_pass = db_pass ? db_pass : "monitor";
1281 db_table = db_table ? db_table : "report_data";
1282 sql_config("db_database", db_name);
1283 sql_config("db_user", db_user);
1284 sql_config("db_pass", db_pass);
1285 sql_config("db_table", db_table);
1287 if (sql_init() < 0)
1288 crash("sql_init() failed");
1289 if (truncate_db)
1290 sql_query("TRUNCATE %s", sql_table_name());
1292 if (incremental) {
1293 MYSQL_RES *result;
1294 MYSQL_ROW row;
1295 sql_query("SELECT %s FROM %s.%s ORDER BY %s DESC LIMIT 1",
1296 only_notifications ? "end_time" : "timestamp",
1297 db_name, db_table,
1298 only_notifications ? "end_time" : "timestamp");
1300 if (!(result = sql_get_result()))
1301 crash("Failed to get last timestamp: %s\n", sql_error());
1303 /* someone might use --incremental with an empty
1304 * database. We shouldn't crash in that case */
1305 if ((row = sql_fetch_row(result)))
1306 incremental = strtoul(row[0], NULL, 0);
1308 sql_free_result(result);
1311 * We lock the table we'll be working with and disable
1312 * indexes on it. Otherwise doing the actual inserts
1313 * will take just about forever, as MySQL has to update
1314 * and flush the index cache between each operation.
1316 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
1317 crash("Failed to disable keys: %s", sql_error());
1318 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
1319 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
1322 log_grok_var("logfile", "/dev/null");
1323 log_grok_var("log_levels", "warn");
1325 if (!num_nfile)
1326 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1327 __progname);
1329 if (log_init() < 0)
1330 crash("log_init() failed");
1332 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1334 host_downtime = hash_init(HASH_TABLE_SIZE);
1335 service_downtime = hash_init(HASH_TABLE_SIZE);
1337 if (hook_init() < 0)
1338 crash("Failed to initialize hooks");
1340 /* go through them once to count the total size for progress output */
1341 for (i = 0; i < num_nfile; i++) {
1342 totsize += nfile[i].size;
1345 gettimeofday(&import_start, NULL);
1346 printf("Importing %s of data from %d files\n",
1347 tobytes(totsize), num_nfile);
1349 for (i = 0; i < num_nfile; i++) {
1350 struct naglog_file *nf = &nfile[i];
1351 cur_file = nf;
1352 show_progress();
1355 * skip parsing files if they're not interesting, such
1356 * as during incremental imports.
1357 * 'incremental' will be 0 if we're doing a full import,
1358 * 1 if we're doing an incremental but the database is
1359 * empty and will contain the timestamp of the latest
1360 * entry in the database if we're doing an incremental
1361 * import to a populated table.
1362 * Note that we can never skip the last file in the list,
1363 * although the lparse routine should sift through it
1364 * pretty quickly in case it has nothing interesting.
1366 if (i + 1 < num_nfile && incremental > nfile[i + 1].first) {
1367 skipped_files++;
1368 imported += nf->size;
1369 continue;
1371 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1372 line_no = 0;
1373 lparse_path(nf->path, nf->size, parse_one_line);
1374 imported++; /* make up for one lost byte per file */
1377 ltime = time(NULL);
1378 purge_expired_downtime();
1379 end_progress();
1381 if (debug_level) {
1382 if (dt_depth) {
1383 printf("Unclosed host downtimes:\n");
1384 puts("------------------------");
1385 hash_walk_data(host_downtime, print_downtime);
1386 printf("Unclosed service downtimes:\n");
1387 puts("---------------------------");
1388 hash_walk_data(service_downtime, print_downtime);
1390 printf("dt_depth: %d\n", dt_depth);
1392 printf("purged downtimes: %d\n", purged_downtimes);
1393 printf("max simultaneous host downtime hashes: %u\n",
1394 hash_entries_max(host_downtime));
1395 printf("max simultaneous service downtime hashes: %u\n",
1396 hash_entries_max(service_downtime));
1397 printf("max downtime depth: %u\n", max_dt_depth);
1400 if (use_sql) {
1401 SQL_RESULT *res;
1402 SQL_ROW row;
1403 time_t start;
1404 unsigned long entries;
1406 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
1407 if (!(res = sql_get_result()))
1408 entries = 0;
1409 else {
1410 row = sql_fetch_row(res);
1411 entries = strtoul(row[0], NULL, 0);
1412 sql_free_result(res);
1415 signal(SIGINT, SIG_IGN);
1416 sql_query("UNLOCK TABLES");
1417 start = time(NULL);
1418 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
1419 (entries / 50000) + 1);
1420 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
1421 printf("%lu database entries indexed in %lu seconds\n",
1422 entries, time(NULL) - start);
1423 sql_close();
1426 if (warnings && debug_level)
1427 fprintf(stderr, "Total warnings: %d\n", warnings);
1429 if (debug_level || dt_start > dt_stop) {
1430 uint count;
1431 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1432 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop, dt_depth, dt_skip);
1433 hash_debug_table(host_downtime, 0);
1434 hash_debug_table(service_downtime, 0);
1435 if ((count = hash_entries(host_downtime))) {
1436 fprintf(stderr, "host_downtime as %u entries remaining\n", count);
1438 if ((count = hash_entries(service_downtime))) {
1439 fprintf(stderr, "service_downtime has %u entries remaining\n", count);
1443 print_unhandled_events();
1445 return 0;