import: Only read last time from sql if --incremental lacks timestamp
[nagios-reports-module.git] / import.c
blobd53b6b695c55bd6f609057bf50c4ba34ba4d84c0
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include "nagios/broker.h"
6 #include "nagios/nebcallbacks.h"
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
13 #include "cfgfile.h"
15 #define IGNORE_LINE 0
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
20 #define MAX_NVECS 16
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
24 #define SERVICE_OK 0
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 25000 /* lines to parse between progress updates */
/* Non-zero when only notification events are to be imported. */
32 static int only_notifications;
/* Byte counters (imported/total/skipped) and the number of lines parsed so far. */
33 static unsigned long long imported, totsize, totlines, skipped;
/* Lines parsed since the last progress update; progress/list-files switches. */
34 static int lines_since_progress, do_progress, list_files;
/* Reference time used to compute elapsed time in the progress output. */
35 static struct timeval import_start;
/* Log timestamps of the latest daemon start/stop; lines older than 'incremental' are skipped. */
36 static time_t daemon_start, daemon_stop, incremental;
37 static int daemon_is_running;
/* Highest dt_depth seen so far; number of files reported as skipped. */
38 static uint max_dt_depth, skipped_files;
40 static time_t next_dt_purge; /* when next to purge expired downtime */
41 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
43 static time_t ltime; /* the timestamp from the current log-line */
/* Counts of downtime start/stop events handled, and of downtime events skipped. */
45 static int dt_start, dt_stop, dt_skip;
46 #define dt_depth (dt_start - dt_stop)
/* Currently known downtime entries, looked up by host and by host+service. */
47 static hash_table *host_downtime;
48 static hash_table *service_downtime;
/* Id handed to new downtime entries; bumped when starts are more than 1s apart. */
49 static int downtime_id;
/* Set to the log time of a daemon start event; cleared once more than 1s has passed. */
50 static time_t probably_ignore_downtime;
/*
 * One downtime, either as scheduled by an external command (kept in the
 * 'dentry' buckets) or as actually started (kept in the downtime hash tables).
 */
52 struct downtime_entry {
53 int id;
/* command code of the external command that scheduled this downtime */
54 int code;
55 char *host;
/* NULL for host downtimes */
56 char *service;
/* scheduled window, parsed from the external command */
57 time_t start;
58 time_t stop;
59 int fixed;
60 time_t duration;
/* log time at which the downtime was seen to start */
61 time_t started;
62 time_t ended;
/* set once remove_downtime() has handled this entry */
63 int purged;
64 int trigger;
/* index into 'dentry' (start % NUM_DENTRIES) and chain link within that bucket */
65 int slot;
66 struct downtime_entry *next;
69 #define NUM_DENTRIES 1024
/* Buckets of scheduled downtime commands, NUM_DENTRIES entries allocated in main(). */
70 static struct downtime_entry **dentry;
/* Log time of the most recent downtime start event. */
71 static time_t last_downtime_start;
/*
 * Log event names and the codes parse_line() dispatches on.
 * NOTE(review): add_ignored()/add_code() are defined elsewhere; the first
 * add_code() argument is presumably the expected field count (sc->nvecs)
 * -- confirm against the macro definition.
 * The all-zero entry terminates the table.
 */
73 static struct string_code event_codes[] = {
74 add_ignored("Error"),
75 add_ignored("Warning"),
76 add_ignored("LOG ROTATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_HOST),
84 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE),
85 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
86 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
87 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
88 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
90 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
91 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
93 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
94 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
95 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
96 { 0, NULL, 0, 0 },
/*
 * External commands the importer cares about: downtime scheduling and
 * deletion, plus problem acknowledgements.
 * NOTE(review): add_cdef() is defined elsewhere; its first argument is
 * presumably the number of fields to split the command into -- confirm.
 * The all-zero entry terminates the table.
 */
99 static struct string_code command_codes[] = {
100 add_cdef(1, DEL_HOST_DOWNTIME),
101 add_cdef(1, DEL_SVC_DOWNTIME),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
106 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
107 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
109 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
110 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
113 * These really have one more field than listed here. We omit one
114 * to make author and comment concatenated with a semi-colon by default.
116 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
117 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
118 { 0, NULL, 0, 0 },
/* Debug helper: print the first n entries of the string vector v to stdout. */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a short human-readable string, e.g. "512 bytes",
 * "1.50 KiB" or "5.00 GiB".
 *
 * Returns a pointer to one of two alternating static buffers, so the
 * results of two consecutive calls may be used at the same time (for
 * instance in a single printf()). Not reentrant.
 */
static const char *tobytes(unsigned long long n)
{
	/* an array, not a pointer, so sizeof() yields the number of suffixes */
	static const char suffix[] = "KMGTP";
	static char tbuf[2][30];
	static int t = 0;
	unsigned int shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%llu bytes", n);
		return tbuf[t];
	}

	/* pick the largest unit that keeps the value >= 1, capped at PiB */
	while (shift < sizeof(suffix) - 1 && (n >> (shift * 10)) >= 1024)
		shift++;

	/* 1ULL: the divisor exceeds the range of int from TiB and up */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
	         (double)n / (double)(1ULL << (shift * 10)), suffix[shift - 1]);

	return tbuf[t];
}
/*
 * Format the time elapsed between *start and *stop as e.g. "59.500s",
 * "2m 5.000s", "1h 1m 1.500s" or "1d 1h 1m 1.250s".
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call. A negative interval is reported as zero.
 */
static const char *tv_delta(struct timeval *start, struct timeval *stop)
{
	static char buf[30];
	double secs;
	unsigned int days, hours, mins;

	/*
	 * Fold the microseconds in before splitting into units; adding
	 * them afterwards can leave a negative seconds part (e.g.
	 * "1m -0.500s") whenever stop->tv_usec < start->tv_usec.
	 */
	secs = (double)(stop->tv_sec - start->tv_sec)
		+ (double)(stop->tv_usec - start->tv_usec) / 1000000.0;
	if (secs < 0)
		secs = 0;

	days = (unsigned int)(secs / 86400);
	secs -= days * 86400.0;
	hours = (unsigned int)(secs / 3600);
	secs -= hours * 3600.0;
	mins = (unsigned int)(secs / 60);
	secs -= mins * 60.0;

	if (!mins && !hours && !days) {
		snprintf(buf, sizeof(buf), "%.3lfs", secs);
	} else if (!hours && !days) {
		snprintf(buf, sizeof(buf), "%um %.3lfs", mins, secs);
	} else if (!days) {
		snprintf(buf, sizeof(buf), "%uh %um %.3lfs", hours, mins, secs);
	} else {
		snprintf(buf, sizeof(buf), "%ud %uh %um %.3lfs", days, hours, mins, secs);
	}

	return buf;
}
183 static void show_progress(void)
185 time_t eta, elapsed;
186 float pct_done, real_pct_done;
188 totlines += lines_since_progress;
189 lines_since_progress = 0;
191 if (!do_progress)
192 return;
194 elapsed = time(NULL) - import_start.tv_sec;
195 if (!elapsed)
196 elapsed = 1;
198 real_pct_done = (float)imported / (float)(totsize - skipped) * 100;
199 pct_done = ((float)(imported + skipped) / (float)totsize) * 100;
200 eta = (elapsed / real_pct_done) * (100.0 - real_pct_done);
202 printf("Importing data: %.2f%% (%s) done ",
203 pct_done, tobytes(imported + skipped));
204 if (elapsed > 10) {
205 printf("ETA: ");
206 if (eta > 60)
207 printf("%lum%lus", eta / 60, eta % 60);
208 else
209 printf("%lus", eta);
211 printf(" \r");
212 fflush(stdout);
215 static void end_progress(void)
217 struct timeval tv;
219 if (list_files)
220 return;
222 gettimeofday(&tv, NULL);
225 * If any of the logfiles doesn't have a newline
226 * at end of file, imported will be slightly off.
227 * We set it hard here so as to make sure that
228 * the final progress output stops at exactly 100%
230 imported = totsize - skipped;
232 show_progress();
233 putchar('\n');
234 printf("%s, %llu lines imported in %s.",
235 tobytes(totsize), totlines, tv_delta(&import_start, &tv));
236 if (skipped)
237 printf(" %s in %u files skipped.", tobytes(skipped), skipped_files);
238 putchar('\n');
241 static int use_sql = 1, indexes_disabled;
242 static void disable_indexes(void)
244 if (indexes_disabled)
245 return;
248 * if we're more than 95% done before inserting anything,
249 * such as might be the case when running an incremental
250 * import, we might as well not bother with disabling
251 * the indexes, since enabling them again can take quite
252 * a long time
254 if (((float)(skipped + imported) / (float)totsize) * 100 >= 95.0)
255 return;
258 * We lock the table we'll be working with and disable
259 * indexes on it. Otherwise doing the actual inserts
260 * will take just about forever, as MySQL has to update
261 * and flush the index cache between each operation.
263 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
264 crash("Failed to disable keys: %s", sql_error());
265 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
266 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
268 indexes_disabled = 1;
271 static void enable_indexes(void)
273 SQL_RESULT *res;
274 SQL_ROW row;
275 unsigned long entries;
276 time_t start;
278 /* if we haven't disabled the indexes we can quit early */
279 if (!indexes_disabled)
280 return;
282 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
283 if (!(res = sql_get_result()))
284 entries = 0;
285 else {
286 row = sql_fetch_row(res);
287 entries = strtoul(row[0], NULL, 0);
288 sql_free_result(res);
291 signal(SIGINT, SIG_IGN);
292 sql_query("UNLOCK TABLES");
293 start = time(NULL);
294 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
295 (entries / 50000) + 1);
296 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
297 printf("%lu database entries indexed in %lu seconds\n",
298 entries, time(NULL) - start);
301 static int insert_downtime_event(int type, char *host, char *service, int id)
303 nebstruct_downtime_data ds;
304 int result;
306 if (!is_interesting_service(host, service))
307 return 0;
309 dt_start += type == NEBTYPE_DOWNTIME_START;
310 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
311 if (dt_depth > max_dt_depth)
312 max_dt_depth = dt_depth;
314 if (!use_sql || only_notifications)
315 return 0;
317 memset(&ds, 0, sizeof(ds));
319 ds.type = type;
320 ds.timestamp.tv_sec = ltime;
321 ds.host_name = host;
322 ds.service_description = service;
323 ds.downtime_id = id;
325 disable_indexes();
326 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
327 if (result < 0)
328 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
329 type, host, service, id);
331 return result;
/*
 * Result of parsing the state/reason field of a notification log line:
 * notification type (host or service), reason code and state.
 */
334 typedef struct import_notification {
335 int type, reason, state;
336 } import_notification;
338 static int parse_import_notification(char *str, import_notification *n)
340 char *state_str = str;
342 n->reason = parse_notification_reason(str);
343 if (n->reason != NOTIFICATION_NORMAL) {
344 char *space, *paren;
346 space = strchr(str, ' ');
347 if (!space)
348 return -1;
349 paren = strchr(space, ')');
350 if (!paren)
351 return -1;
352 *paren = '\0';
354 state_str = space + 2;
357 n->type = SERVICE_NOTIFICATION;
358 n->state = parse_service_state_gently(state_str);
359 if (n->state < 0) {
360 n->type = HOST_NOTIFICATION;
361 n->state = parse_host_state_gently(state_str);
364 return 0;
367 static int insert_notification(struct string_code *sc)
369 int base_idx;
370 const char *desc;
371 struct import_notification n;
373 if (!only_notifications)
374 return 0;
376 if (sc->code - NEBTYPE_NOTIFICATION_END == CONCERNS_SERVICE) {
377 base_idx = 1;
378 desc = strv[2];
379 } else {
380 base_idx = 0;
381 desc = 0;
383 if (parse_import_notification(strv[base_idx + 2], &n) < 0) {
384 handle_unknown_event(strv[base_idx + 2]);
385 return 0;
388 if (!use_sql)
389 return 0;
391 disable_indexes();
392 return sql_query
393 ("INSERT INTO %s.%s("
394 "notification_type, start_time, end_time, contact_name, "
395 "host_name, service_description, "
396 "command_name, output, "
397 "state, reason_type) "
398 "VALUES("
399 "%d, %lu, %lu, '%s', "
400 "'%s', '%s', "
401 "'%s', '%s', "
402 "%d, %d)",
403 sql_db_name(), sql_table_name(),
404 n.type, ltime, ltime, sql_escape(strv[0]),
405 sql_escape(strv[1]), desc ? sql_escape(desc) : "",
406 sql_escape(strv[base_idx + 3]), sql_escape(strv[base_idx + 4]),
407 n.state, n.reason);
410 static int insert_service_check(struct string_code *sc)
412 nebstruct_service_check_data ds;
414 if (!is_interesting_service(strv[0], strv[1]))
415 return 0;
417 memset(&ds, 0, sizeof(ds));
419 ds.timestamp.tv_sec = ltime;
420 ds.type = sc->code;
421 ds.host_name = strv[0];
422 ds.service_description = strv[1];
423 if (sc->nvecs == 4) {
424 /* passive service check result */
425 if (*strv[2] >= '0' && *strv[2] <= '9')
426 ds.state = atoi(strv[2]);
427 else
428 ds.state = parse_service_state(strv[2]);
429 ds.state_type = HARD_STATE;
430 ds.current_attempt = 1;
431 ds.output = strv[3];
432 } else {
433 ds.state = parse_service_state(strv[2]);
434 ds.state_type = soft_hard(strv[3]);
435 ds.current_attempt = atoi(strv[4]);
436 ds.output = strv[5];
439 if (!use_sql || only_notifications)
440 return 0;
442 disable_indexes();
443 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
446 static int insert_host_check(struct string_code *sc)
448 nebstruct_host_check_data ds;
450 if (!is_interesting_host(strv[0]))
451 return 0;
453 memset(&ds, 0, sizeof(ds));
455 ds.timestamp.tv_sec = ltime;
456 ds.type = sc->code;
457 ds.host_name = strv[0];
458 if (sc->nvecs == 3) {
459 if (*strv[1] >= '0' && *strv[1] <= '9')
460 ds.state = atoi(strv[1]);
461 else
462 ds.state = parse_host_state(strv[1]);
463 /* passive host check result */
464 ds.output = strv[2];
465 ds.current_attempt = 1;
466 ds.state_type = HARD_STATE;
467 } else {
468 ds.state = parse_host_state(strv[1]);
469 ds.state_type = soft_hard(strv[2]);
470 ds.current_attempt = atoi(strv[3]);
471 ds.output = strv[4];
474 if (!use_sql || only_notifications)
475 return 0;
477 disable_indexes();
478 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
481 static int insert_process_event(int type)
483 nebstruct_process_data ds;
485 if (!use_sql || only_notifications)
486 return 0;
488 memset(&ds, 0, sizeof(ds));
489 ds.timestamp.tv_sec = ltime;
490 ds.type = type;
491 disable_indexes();
492 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/* Acknowledgements are recognized but not imported; always returns 0. */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc;

	return 0;
}
500 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
502 if (!debug_level)
503 return;
505 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
506 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
507 printf("%s", dt->host);
508 if (dt->service)
509 printf(";%s", dt->service);
510 putchar('\n');
513 static struct downtime_entry *last_dte;
514 static struct downtime_entry *del_dte;
516 static void remove_downtime(struct downtime_entry *dt);
517 static int del_matching_dt(void *data)
519 struct downtime_entry *dt = data;
521 if (del_dte->id == dt->id) {
522 dt_print("ALSO", 0, dt);
523 remove_downtime(dt);
524 return HASH_WALK_REMOVE;
527 return 0;
530 static void stash_downtime_command(struct downtime_entry *dt)
532 dt->slot = dt->start % NUM_DENTRIES;
533 dt->next = dentry[dt->slot];
534 dentry[dt->slot] = dt;
537 static void remove_downtime(struct downtime_entry *dt)
539 if (!is_interesting_service(dt->host, dt->service))
540 return;
542 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
544 dt_print("RM_DT", ltime, dt);
545 dt->purged = 1;
/*
 * Walk the chain of stashed downtime commands starting at dt and return
 * the first one that can explain a downtime starting now (at ltime) for
 * host/service, or NULL if none does.
 *
 * An entry is considered only while ltime lies within its [start, stop]
 * window. Depending on the command code, the service and/or host name
 * must match; group and propagating commands match any host.
 * crash()es on an entry with an unknown command code.
 */
548 static struct downtime_entry *
549 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
551 for (; dt; dt = dt->next) {
552 time_t diff;
554 if (ltime > dt->stop || ltime < dt->start) {
555 continue;
558 switch (dt->code) {
559 case SCHEDULE_SVC_DOWNTIME:
560 if (service && strcmp(service, dt->service))
561 continue;
563 /* fallthrough */
564 case SCHEDULE_HOST_DOWNTIME:
565 case SCHEDULE_HOST_SVC_DOWNTIME:
566 if (strcmp(host, dt->host)) {
567 continue;
570 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
571 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
572 /* these two have host set in dt, but
573 * it will not match all the possible hosts */
575 /* fallthrough */
576 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
577 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
578 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
579 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
580 break;
581 default:
582 crash("dt->code not set properly\n");
586 * Once we get here all the various other criteria have
587 * been matched, so we need to check if the daemon was
588 * running when this downtime was supposed to have
589 * started, and otherwise use the daemon start time
590 * as the value to diff against
592 if (daemon_stop < dt->start && daemon_start > dt->start) {
593 debug("Adjusting dt->start (%lu) to (%lu)\n",
594 dt->start, daemon_start);
595 dt->start = daemon_start;
596 if (dt->trigger && dt->duration)
597 dt->stop = dt->start + dt->duration;
/* accept if we're within 3 seconds of the scheduled start, or if the
 * downtime is triggered or flexible (not fixed) */
600 diff = ltime - dt->start;
601 if (diff < 3 || dt->trigger || !dt->fixed)
602 return dt;
605 return NULL;
608 static struct downtime_entry *
609 find_downtime_command(char *host, char *service)
611 int i;
612 struct downtime_entry *shortcut = NULL;
614 if (last_dte && last_dte->start == ltime) {
615 shortcut = last_dte;
616 // return last_dte;
618 for (i = 0; i < NUM_DENTRIES; i++) {
619 struct downtime_entry *dt;
620 dt = dt_matches_command(dentry[i], host, service);
621 if (dt) {
622 if (shortcut && dt != shortcut)
623 if (debug_level)
624 printf("FIND shortcut no good\n");
625 last_dte = dt;
626 return dt;
630 debug("FIND not\n");
631 return NULL;
634 static int print_downtime(void *data)
636 struct downtime_entry *dt = (struct downtime_entry *)data;
638 dt_print("UNCLOSED", ltime, dt);
640 return 0;
643 static inline void set_next_dt_purge(time_t base, time_t add)
645 if (!next_dt_purge || next_dt_purge > base + add)
646 next_dt_purge = base + add;
648 if (next_dt_purge <= ltime)
649 next_dt_purge = ltime + 1;
652 static inline void add_downtime(char *host, char *service, int id)
654 struct downtime_entry *dt, *cmd, *old;
656 if (!is_interesting_service(host, service))
657 return;
659 dt = malloc(sizeof(*dt));
660 cmd = find_downtime_command(host, service);
661 if (!cmd) {
662 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
663 memset(dt, 0, sizeof(*dt));
664 dt->duration = 7200; /* the default downtime duration in nagios */
665 dt->start = ltime;
666 dt->stop = dt->start + dt->duration;
668 else
669 memcpy(dt, cmd, sizeof(*dt));
671 dt->host = strdup(host);
672 dt->id = id;
673 dt->started = ltime;
675 set_next_dt_purge(ltime, dt->duration);
677 if (!service) {
678 dt->service = NULL;
679 old = hash_update(host_downtime, dt->host, dt);
681 else {
682 dt->service = strdup(service);
683 old = hash_update2(service_downtime, dt->host, dt->service, dt);
686 if (old && old != dt) {
687 free(old->host);
688 if (old->service)
689 free(old->service);
690 free(old);
693 dt_print("IN_DT", ltime, dt);
694 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
697 static time_t last_host_dt_del, last_svc_dt_del;
698 static int register_downtime_command(struct string_code *sc)
700 struct downtime_entry *dt;
701 char *start_time, *end_time, *duration = NULL;
702 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
703 time_t foo;
705 switch (sc->code) {
706 case DEL_HOST_DOWNTIME:
707 last_host_dt_del = ltime;
708 return 0;
709 case DEL_SVC_DOWNTIME:
710 last_svc_dt_del = ltime;
711 return 0;
713 case SCHEDULE_HOST_DOWNTIME:
714 if (strtotimet(strv[5], &foo))
715 duration = strv[4];
716 /* fallthrough */
717 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
718 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
719 case SCHEDULE_HOST_SVC_DOWNTIME:
720 host = strv[0];
721 /* fallthrough */
722 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
723 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
724 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
725 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
726 start_time = strv[1];
727 end_time = strv[2];
728 fixed = strv[3];
729 if (strtotimet(strv[5], &foo))
730 triggered_by = strv[4];
731 if (!duration)
732 duration = strv[5];
734 break;
736 case SCHEDULE_SVC_DOWNTIME:
737 host = strv[0];
738 service = strv[1];
739 start_time = strv[2];
740 end_time = strv[3];
741 fixed = strv[4];
742 if (strtotimet(strv[6], &foo)) {
743 triggered_by = strv[5];
744 duration = strv[6];
746 else {
747 duration = strv[5];
749 break;
751 default:
752 crash("Unknown downtime type: %d", sc->code);
755 if (!(dt = calloc(sizeof(*dt), 1)))
756 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
758 dt->code = sc->code;
759 if (host)
760 dt->host = strdup(host);
761 if (service)
762 dt->service = strdup(service);
764 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
765 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
767 print_strvec(strv, sc->nvecs);
768 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
769 command_codes[sc->code - 1].str, start_time, end_time, duration);
773 * sometimes downtime commands can be logged according to
774 * log version 1, while the log still claims to be version 2.
775 * Apparently, this happens when using a daemon supporting
776 * version 2 logging but a downtime command is added that
777 * follows the version 1 standard.
778 * As such, we simply ignore the result of the "duration"
779 * field conversion and just accept that it might not work
781 (void)strtotimet(duration, &dt->duration);
782 dt->fixed = *fixed - '0';
785 * ignore downtime scheduled to take place in the future.
786 * It will be picked up by the module anyways
788 if (dt->start > time(NULL)) {
789 free(dt);
790 return 0;
793 if (dt->duration > time(NULL)) {
794 warn("Bizarrely large duration (%lu)", dt->duration);
796 if (dt->start < ltime) {
797 if (dt->duration && dt->duration > ltime - dt->start)
798 dt->duration -= ltime - dt->start;
800 dt->start = ltime;
802 if (dt->stop < ltime || dt->stop < dt->start) {
803 /* retroactively scheduled downtime, or just plain wrong */
804 dt->stop = dt->start;
805 dt->duration = 0;
808 if (dt->fixed && dt->duration != dt->stop - dt->start) {
809 // warn("duration doesn't match stop - start: (%lu : %lu)",
810 // dt->duration, dt->stop - dt->start);
812 dt->duration = dt->stop - dt->start;
814 else if (dt->duration > 86400 * 14) {
815 warn("Oddly long duration: %lu", dt->duration);
818 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
819 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
821 stash_downtime_command(dt);
822 return 0;
/*
 * Handle a HOST/SERVICE DOWNTIME ALERT line whose fields are in the
 * global strv. Four fields means a service downtime; otherwise it is a
 * host downtime.
 *
 * A "STARTED" alert registers a new downtime unless one is already known
 * for the object; anything else is treated as a stop, which removes the
 * known downtime and every other entry sharing its id.
 *
 * Returns 0, or -1 for an unexpected event type.
 */
825 static int insert_downtime(struct string_code *sc)
827 int type;
828 struct downtime_entry *dt = NULL;
829 int id = 0;
830 time_t dt_del_cmd;
831 char *host, *service = NULL;
833 host = strv[0];
834 if (sc->nvecs == 4) {
835 service = strv[1];
836 dt = hash_find2(service_downtime, host, service);
838 else
839 dt = hash_find(host_downtime, host);
842 * to stop a downtime we can either get STOPPED or
843 * CANCELLED. So far, I've only ever seen STARTED
844 * for when it actually starts though, and since
845 * the Nagios daemon is reponsible for launching
846 * it, it's unlikely there are more variants of
847 * that string
849 type = NEBTYPE_DOWNTIME_STOP;
850 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
851 type = NEBTYPE_DOWNTIME_START;
853 switch (type) {
854 case NEBTYPE_DOWNTIME_START:
855 if (dt) {
856 if (!probably_ignore_downtime)
857 dt_print("ALRDY", ltime, dt);
858 return 0;
861 if (probably_ignore_downtime)
862 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
863 probably_ignore_downtime, ltime, host, service);
/* downtimes starting within a second of each other share one id */
865 if (ltime - last_downtime_start > 1)
866 downtime_id++;
868 id = downtime_id;
869 add_downtime(host, service, id);
870 last_downtime_start = ltime;
871 break;
873 case NEBTYPE_DOWNTIME_STOP:
874 if (!dt) {
876 * this can happen when overlapping downtime entries
877 * occur, and the start event for the second (or nth)
878 * downtime starts before the first downtime has had
879 * a stop event. It basically means we've almost
880 * certainly done something wrong.
882 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
883 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
884 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
885 dt_skip++;
886 return 0;
889 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
/* debug-only sanity checks: ended early without a recent delete command, or ran long */
891 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
892 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
893 ltime - dt->started, dt->host, dt->service, dt->duration);
895 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
896 dt_print("Long", ltime, dt);
/* NOTE(review): dt itself also matches del_dte in the walk below, so
 * remove_downtime() looks like it runs twice for it -- confirm against
 * hash_walk_data() semantics */
898 remove_downtime(dt);
900 * Now delete whatever matching downtimes we can find.
901 * this must be here, or we'll recurse like crazy into
902 * remove_downtime(), possibly exhausting the stack
903 * frame buffer
905 del_dte = dt;
906 if (!dt->service)
907 hash_walk_data(host_downtime, del_matching_dt);
908 else
909 hash_walk_data(service_downtime, del_matching_dt);
910 break;
912 default:
913 return -1;
916 return 0;
919 static int dt_purged;
920 static int purge_expired_dt(void *data)
922 struct downtime_entry *dt = data;
924 if (dt->purged) {
925 dt_skip++;
926 return 0;
929 set_next_dt_purge(dt->started, dt->duration);
931 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
932 dt_purged++;
933 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
934 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
935 remove_downtime(dt);
936 return HASH_WALK_REMOVE;
938 else {
939 dt_print("PURGED_NOT_TIME", ltime, dt);
942 return 0;
945 static int purged_downtimes;
946 static void purge_expired_downtime(void)
948 int tot_purged = 0;
950 next_dt_purge = 0;
951 dt_purged = 0;
952 hash_walk_data(host_downtime, purge_expired_dt);
953 if (dt_purged)
954 debug("PURGE %d host downtimes purged", dt_purged);
955 tot_purged += dt_purged;
956 dt_purged = 0;
957 hash_walk_data(service_downtime, purge_expired_dt);
958 if (dt_purged)
959 debug("PURGE %d service downtimes purged", dt_purged);
960 tot_purged += dt_purged;
961 if (tot_purged)
962 debug("PURGE total %d entries purged", tot_purged);
964 if (next_dt_purge)
965 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
966 next_dt_purge, next_dt_purge - ltime);
968 purged_downtimes += tot_purged;
971 static inline void handle_start_event(void)
973 if (!daemon_is_running)
974 insert_process_event(NEBTYPE_PROCESS_START);
976 probably_ignore_downtime = daemon_start = ltime;
977 daemon_is_running = 1;
980 static inline void handle_stop_event(void)
982 if (daemon_is_running) {
983 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
984 daemon_is_running = 0;
986 daemon_stop = ltime;
/*
 * Parse one log line of 'len' bytes and dispatch it to the matching
 * insert_*() / register_*() handler.
 *
 * The line is modified in place while being split into fields. The
 * line's timestamp is stored in the global ltime, clamped so that it
 * never goes backwards.
 *
 * Returns 0 for lines that were handled or deliberately ignored, -1 for
 * malformed or unknown lines, or whatever the insert handler returned.
 * crash()es if the timestamp can't be parsed.
 */
989 static int parse_line(char *line, uint len)
991 char *ptr, *colon;
992 int nvecs = 0;
993 struct string_code *sc;
994 static time_t last_ltime = 0;
996 imported += len + 1; /* make up for 1 lost byte per newline */
998 /* ignore empty lines */
999 if (!len)
1000 return 0;
1002 if (++lines_since_progress >= PROGRESS_INTERVAL)
1003 show_progress();
1005 /* skip obviously bogus lines */
1006 if (len < 12 || *line != '[') {
1007 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
1008 return -1;
1011 ltime = strtoul(line + 1, &ptr, 10);
1012 if (line + 1 == ptr) {
1013 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
1014 return -1;
/* timestamps must never go backwards; reuse the latest one seen */
1017 if (ltime < last_ltime) {
1018 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
1019 // ltime, last_ltime, last_ltime - ltime);
1020 ltime = last_ltime;
1022 else
1023 last_ltime = ltime;
1026 * Incremental will be 0 if not set, or 1 if set but
1027 * the database is currently empty.
1028 * Note that this will not always do the correct thing,
1029 * as downtime entries that might have been scheduled for
1030 * purging may never show up as "stopped" in the database
1031 * with this scheme. As such, incremental imports absolutely
1032 * require that nothing is in scheduled downtime when the
1033 * import is running (well, started really, but it amounts
1034 * to the same thing).
1036 if (ltime < incremental)
1037 return 0;
1039 if (next_dt_purge && ltime >= next_dt_purge)
1040 purge_expired_downtime();
1042 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
1043 probably_ignore_downtime = 0;
/* skip past the closing bracket of the timestamp */
1045 while (*ptr == ']' || *ptr == ' ')
1046 ptr++;
1048 if (!is_interesting(ptr))
1049 return 0;
1051 if (!(colon = strchr(ptr, ':'))) {
1052 /* stupid heuristic, but might be good for something,
1053 * somewhere, sometime. if nothing else, it should suppress
1054 * annoying output */
1055 if (is_start_event(ptr)) {
1056 handle_start_event();
1057 return 0;
1059 if (is_stop_event(ptr)) {
1060 handle_stop_event();
1061 return 0;
1065 * An unhandled event. We should probably crash here
1067 handle_unknown_event(line);
1068 return -1;
1071 /* an event happened without us having gotten a start-event */
1072 if (!daemon_is_running) {
1073 insert_process_event(NEBTYPE_PROCESS_START);
1074 daemon_start = ltime;
1075 daemon_is_running = 1;
1078 if (!(sc = get_event_type(ptr, colon - ptr))) {
1079 handle_unknown_event(line);
1080 return -1;
1083 if (sc->code == IGNORE_LINE)
1084 return 0;
/* cut the event name off and move on to its fields */
1086 *colon = 0;
1087 ptr = colon + 1;
1088 while (*ptr == ' ')
1089 ptr++;
1091 if (sc->nvecs) {
1092 int i;
1094 nvecs = vectorize_string(ptr, sc->nvecs);
1096 if (nvecs != sc->nvecs) {
1097 /* broken line */
1098 warn("Line %d in %s seems to not have all the fields it should",
1099 line_no, cur_file->path);
1100 return -1;
1103 for (i = 0; i < sc->nvecs; i++) {
1104 if (!strv[i]) {
1105 /* this should never happen */
1106 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1107 line_no, cur_file->path);
1108 return -1;
1113 switch (sc->code) {
1114 char *semi_colon;
1116 case NEBTYPE_EXTERNALCOMMAND_END:
1117 semi_colon = strchr(ptr, ';');
1118 if (!semi_colon)
1119 return 0;
1120 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
1121 return 0;
1123 if (sc->code == RESTART_PROGRAM) {
1124 handle_stop_event();
1125 return 0;
/* a field count mismatch is only warned about; the command is still processed */
1128 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
1129 if (nvecs != sc->nvecs) {
1130 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
1132 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
1133 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
1135 register_downtime_command(sc);
1136 } else {
1137 insert_acknowledgement(sc);
1139 break;
1141 case NEBTYPE_HOSTCHECK_PROCESSED:
1142 return insert_host_check(sc);
1144 case NEBTYPE_SERVICECHECK_PROCESSED:
1145 return insert_service_check(sc);
1147 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
1148 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
1149 return insert_downtime(sc);
1151 case NEBTYPE_NOTIFICATION_END + CONCERNS_HOST:
1152 case NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE:
1153 return insert_notification(sc);
1155 case IGNORE_LINE:
1156 return 0;
1159 return 0;
1162 static int parse_one_line(char *str, uint len)
1164 if (parse_line(str, len) && use_sql && sql_errno())
1165 crash("sql error: %s", sql_error());
1167 return 0;
1170 static int hash_one_line(char *line, uint len)
1172 return add_interesting_object(line);
1175 static int hash_interesting(const char *path)
1177 struct stat st;
1179 if (stat(path, &st) < 0)
1180 crash("failed to stat %s: %s", path, strerror(errno));
1182 lparse_path(path, st.st_size, hash_one_line);
1184 return 0;
extern const char *__progname;

/*
 * Print an optional printf-style message followed by the usage text,
 * then exit: with status 1 if a non-empty message was given, else 0.
 */
__attribute__((__format__(__printf__, 1, 2)))
static void usage(const char *fmt, ...)
{
	static const char *const help_text[] = {
		" [logfiles] refers to all the nagios logfiles you want to import\n",
		" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n",
		"\nOptions:\n",
		" --help this cruft\n",
		" --no-progress don't display progress output\n",
		" --no-sql don't access the database\n",
		" --db-name database name\n",
		" --db-table database table name\n",
		" --db-user database user\n",
		" --db-pass database password\n",
		" --incremental[=<when>] do an incremental import (since $when)\n",
		" --truncate-db truncate database before importing\n",
		" --only-notifications only import notifications\n",
		" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n",
		" --list-files list files to import\n",
		"\n\n",
	};
	const int have_msg = fmt && *fmt;
	unsigned int k;

	if (have_msg) {
		va_list ap;

		va_start(ap, fmt);
		vfprintf(stdout, fmt, ap);
		va_end(ap);
	}

	printf("Usage %s [options] [logfiles]\n\n", __progname);
	for (k = 0; k < sizeof(help_text) / sizeof(help_text[0]); k++)
		fputs(help_text[k], stdout);

	exit(have_msg ? 1 : 0);
}
1223 int main(int argc, char **argv)
1225 int i, truncate_db = 0;
1226 const char *nagios_cfg = NULL;
1227 char *db_name, *db_user, *db_pass, *db_table;
1229 db_name = db_user = db_pass = db_table = NULL;
1231 do_progress = isatty(fileno(stdout));
1233 strv = calloc(sizeof(char *), MAX_NVECS);
1234 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1235 if (!strv || !dentry)
1236 crash("Failed to alloc initial structs");
1239 for (num_nfile = 0,i = 1; i < argc; i++) {
1240 char *opt, *arg = argv[i];
1241 int arg_len, eq_opt = 0;
1243 if ((opt = strchr(arg, '='))) {
1244 *opt++ = '\0';
1245 eq_opt = 1;
1247 else if (i < argc - 1) {
1248 opt = argv[i + 1];
1251 if (!prefixcmp(arg, "-h") || !prefixcmp(arg, "--help")) {
1252 usage(NULL);
1254 if (!prefixcmp(arg, "--incremental")) {
1255 incremental = 1;
1258 * nifty for debugging --incremental skipping log-files
1259 * The value will be overwritten unless --no-sql is also
1260 * in effect
1262 if (eq_opt) {
1263 incremental = strtoul(opt, NULL, 0);
1264 if (!incremental)
1265 usage("--incremental= requires a parameter");
1267 continue;
1269 if (!prefixcmp(arg, "--no-sql")) {
1270 use_sql = 0;
1271 continue;
1273 if (!prefixcmp(arg, "--only-notifications")) {
1274 only_notifications = 1;
1275 db_name = db_name ? db_name : "merlin";
1276 db_user = db_user ? db_user : "merlin";
1277 db_pass = db_pass ? db_pass : "merlin";
1278 db_table = db_table ? db_table : "notification";
1279 continue;
1281 if (!prefixcmp(arg, "--no-progress")) {
1282 do_progress = 0;
1283 continue;
1285 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1286 do_progress = 0;
1287 debug_level++;
1288 continue;
1290 if (!prefixcmp(arg, "--truncate-db")) {
1291 truncate_db = 1;
1292 continue;
1294 if (!prefixcmp(arg, "--list-files")) {
1295 list_files = 1;
1296 do_progress = 0;
1297 continue;
1299 if (!prefixcmp(arg, "--nagios-cfg")) {
1300 if (!opt || !*opt) {
1301 crash("%s requires the path to nagios.cfg as argument", arg);
1303 nagios_cfg = opt;
1304 if (opt && !eq_opt)
1305 i++;
1306 continue;
1308 if (!prefixcmp(arg, "--db-name")) {
1309 if (!opt || !*opt)
1310 crash("%s requires a database name as an argument", arg);
1311 db_name = opt;
1312 if (opt && !eq_opt)
1313 i++;
1314 continue;
1316 if (!prefixcmp(arg, "--db-user")) {
1317 if (!opt || !*opt)
1318 crash("%s requires a database username as argument", arg);
1319 db_user = opt;
1320 if (opt && !eq_opt)
1321 i++;
1322 continue;
1324 if (!prefixcmp(arg, "--db-pass")) {
1325 if (!opt || !*opt)
1326 crash("%s requires a database username as argument", arg);
1327 db_pass = opt;
1328 if (opt && !eq_opt)
1329 i++;
1330 continue;
1332 if (!prefixcmp(arg, "--db-table")) {
1333 if (!opt || !*opt)
1334 crash("%s requires a database table name as argument", arg);
1335 db_table = opt;
1336 if (opt && !eq_opt)
1337 i++;
1338 continue;
1340 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1341 if (!opt || !*opt)
1342 crash("%s requires a filename as argument", arg);
1343 hash_interesting(opt);
1344 if (opt && !eq_opt)
1345 i++;
1346 continue;
1349 /* non-argument, so treat as a config- or log-file */
1350 arg_len = strlen(arg);
1351 if (arg_len >= 10 && !strcmp(&arg[arg_len - 10], "nagios.cfg")) {
1352 nagios_cfg = arg;
1353 } else {
1354 add_naglog_path(arg);
1358 /* fallback for op5 systems */
1359 if (!nagios_cfg && !num_nfile) {
1360 nagios_cfg = "/opt/monitor/etc/nagios.cfg";
1362 if (nagios_cfg) {
1363 struct cfg_comp *conf;
1364 conf = cfg_parse_file(nagios_cfg);
1365 for (i = 0; i < conf->vars; i++) {
1366 struct cfg_var *v = conf->vlist[i];
1367 if (!strcmp(v->key, "log_file")) {
1368 add_naglog_path(v->value);
1370 if (!strcmp(v->key, "log_archive_path")) {
1371 add_naglog_path(v->value);
1376 if (use_sql) {
1377 db_name = db_name ? db_name : "monitor_reports";
1378 db_user = db_user ? db_user : "monitor";
1379 db_pass = db_pass ? db_pass : "monitor";
1380 db_table = db_table ? db_table : "report_data";
1381 sql_config("db_database", db_name);
1382 sql_config("db_user", db_user);
1383 sql_config("db_pass", db_pass);
1384 sql_config("db_table", db_table);
1386 if (sql_init() < 0)
1387 crash("sql_init() failed");
1388 if (truncate_db)
1389 sql_query("TRUNCATE %s", sql_table_name());
1391 if (incremental == 1) {
1392 MYSQL_RES *result;
1393 MYSQL_ROW row;
1394 sql_query("SELECT %s FROM %s.%s ORDER BY %s DESC LIMIT 1",
1395 only_notifications ? "end_time" : "timestamp",
1396 db_name, db_table,
1397 only_notifications ? "end_time" : "timestamp");
1399 if (!(result = sql_get_result()))
1400 crash("Failed to get last timestamp: %s\n", sql_error());
1402 /* someone might use --incremental with an empty
1403 * database. We shouldn't crash in that case */
1404 if ((row = sql_fetch_row(result)))
1405 incremental = strtoul(row[0], NULL, 0);
1407 sql_free_result(result);
1411 log_grok_var("logfile", "/dev/null");
1412 log_grok_var("log_levels", "warn");
1414 if (!num_nfile)
1415 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1416 __progname);
1418 if (log_init() < 0)
1419 crash("log_init() failed");
1421 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1423 host_downtime = hash_init(HASH_TABLE_SIZE);
1424 service_downtime = hash_init(HASH_TABLE_SIZE);
1426 if (hook_init() < 0)
1427 crash("Failed to initialize hooks");
1429 /* go through them once to count the total size for progress output */
1430 for (i = 0; i < num_nfile; i++) {
1431 totsize += nfile[i].size;
1434 if (!list_files) {
1435 gettimeofday(&import_start, NULL);
1436 printf("Importing %s of data from %d files\n",
1437 tobytes(totsize), num_nfile);
1440 for (i = 0; i < num_nfile; i++) {
1441 struct naglog_file *nf = &nfile[i];
1442 cur_file = nf;
1443 show_progress();
1446 * skip parsing files if they're not interesting, such
1447 * as during incremental imports.
1448 * 'incremental' will be 0 if we're doing a full import,
1449 * 1 if we're doing an incremental but the database is
1450 * empty and will contain the timestamp of the latest
1451 * entry in the database if we're doing an incremental
1452 * import to a populated table.
1453 * Note that we can never skip the last file in the list,
1454 * although the lparse routine should sift through it
1455 * pretty quickly in case it has nothing interesting.
1457 if (i + 1 < num_nfile && incremental > nfile[i + 1].first) {
1458 skipped_files++;
1459 skipped += nf->size;
1460 continue;
1462 if (list_files) {
1463 printf("%s\n", nf->path);
1464 continue;
1466 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1467 line_no = 0;
1468 lparse_path(nf->path, nf->size, parse_one_line);
1469 imported++; /* make up for one lost byte per file */
1472 ltime = time(NULL);
1473 purge_expired_downtime();
1474 end_progress();
1476 if (debug_level) {
1477 if (dt_depth) {
1478 printf("Unclosed host downtimes:\n");
1479 puts("------------------------");
1480 hash_walk_data(host_downtime, print_downtime);
1481 printf("Unclosed service downtimes:\n");
1482 puts("---------------------------");
1483 hash_walk_data(service_downtime, print_downtime);
1485 printf("dt_depth: %d\n", dt_depth);
1487 printf("purged downtimes: %d\n", purged_downtimes);
1488 printf("max simultaneous host downtime hashes: %u\n",
1489 hash_entries_max(host_downtime));
1490 printf("max simultaneous service downtime hashes: %u\n",
1491 hash_entries_max(service_downtime));
1492 printf("max downtime depth: %u\n", max_dt_depth);
1495 if (use_sql) {
1496 enable_indexes();
1497 sql_close();
1500 if (warnings && debug_level)
1501 fprintf(stderr, "Total warnings: %d\n", warnings);
1503 if (debug_level || dt_start > dt_stop) {
1504 uint count;
1505 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1506 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop, dt_depth, dt_skip);
1507 hash_debug_table(host_downtime, 0);
1508 hash_debug_table(service_downtime, 0);
1509 if ((count = hash_entries(host_downtime))) {
1510 fprintf(stderr, "host_downtime as %u entries remaining\n", count);
1512 if ((count = hash_entries(service_downtime))) {
1513 fprintf(stderr, "service_downtime has %u entries remaining\n", count);
1517 print_unhandled_events();
1519 return 0;