import: Don't bother enabling indexes if we're more than 95% done
[nagios-reports-module.git] / import.c
blobba87552a2248161f6da8ea7d5391546f2d52526b
1 #define _GNU_SOURCE 1
2 #include <sys/types.h>
3 #include <signal.h>
5 #include "nagios/broker.h"
6 #include "nagios/nebcallbacks.h"
7 #include "sql.h"
8 #include "hooks.h"
9 #include "logging.h"
10 #include "hash.h"
11 #include "lparse.h"
12 #include "logutils.h"
13 #include "cfgfile.h"
/* Event code for log lines that are recognized but deliberately skipped */
15 #define IGNORE_LINE 0
/*
 * Offsets added to a NEBTYPE_* value so the host and service variants
 * of the same event get distinct codes (see the event_codes table)
 */
17 #define CONCERNS_HOST 50
18 #define CONCERNS_SERVICE 60
/* number of slots allocated for the global string vector in main() */
20 #define MAX_NVECS 16
21 #define HASH_TABLE_SIZE 128
23 /* for some reason these aren't defined inside Nagios' headers */
24 #define SERVICE_OK 0
25 #define SERVICE_WARNING 1
26 #define SERVICE_CRITICAL 2
27 #define SERVICE_UNKNOWN 3
29 #define PROGRESS_INTERVAL 500 /* lines to parse between progress updates */
/* when set, only notification events are written to the database */
32 static int only_notifications;
/* byte and line counters feeding the progress and summary output */
33 static uint64_t imported, totsize, totlines, skipped;
34 static int lines_since_progress, do_progress;
35 static struct timeval import_start;
/* log timestamps of the most recent daemon start/stop events seen */
36 static time_t daemon_start, daemon_stop, incremental;
37 static int daemon_is_running;
38 static uint max_dt_depth, skipped_files;
40 static time_t next_dt_purge; /* when next to purge expired downtime */
41 #define DT_PURGE_GRACETIME 300 /* seconds to add to next_dt_purge */
43 static time_t ltime; /* the timestamp from the current log-line */
/* counters for downtime start/stop events and skipped downtime entries */
45 static int dt_start, dt_stop, dt_skip;
/* downtime start events not yet balanced by a stop event */
46 #define dt_depth (dt_start - dt_stop)
/* currently active downtimes, keyed by host and by host + service */
47 static hash_table *host_downtime;
48 static hash_table *service_downtime;
49 static int downtime_id;
/* set to the daemon start time and cleared again a second or so later */
50 static time_t probably_ignore_downtime;
/*
 * One downtime record. Used both for scheduling commands stashed in
 * the dentry buckets and for active downtimes stored in the
 * host_downtime / service_downtime hash tables.
 */
52 struct downtime_entry {
53 int id; /* downtime id handed out by insert_downtime() */
54 int code; /* code of the external command that scheduled it */
55 char *host; /* strdup()'ed host name, may be unset for group commands */
56 char *service; /* strdup()'ed service description, NULL for hosts */
57 time_t start; /* scheduled start, parsed from the command */
58 time_t stop; /* scheduled end, parsed from the command */
59 int fixed; /* the command's "fixed" flag */
60 time_t duration; /* duration in seconds */
61 time_t started; /* log timestamp at which the downtime was entered */
62 time_t ended;
63 int purged; /* set once remove_downtime() has handled the entry */
64 int trigger; /* non-zero if the command named a triggering downtime */
65 int slot; /* index of the dentry bucket holding this entry */
66 struct downtime_entry *next; /* next entry in the same bucket */
/* number of buckets in the dentry table of stashed downtime commands */
69 #define NUM_DENTRIES 1024
/* buckets of downtime commands, indexed by start time % NUM_DENTRIES */
70 static struct downtime_entry **dentry;
/* log timestamp of the most recent downtime start event */
71 static time_t last_downtime_start;
/*
 * Log-line prefixes understood by the importer. For add_code() the
 * first argument appears to be the number of ';'-separated fields the
 * line must contain (parse_line() compares it with what
 * vectorize_string() returns) and the last is the event code that
 * parse_line() dispatches on. The all-zero entry ends the table.
 */
73 static struct string_code event_codes[] = {
74 add_ignored("Error"),
75 add_ignored("Warning"),
76 add_ignored("LOG ROTATION"),
77 add_ignored("HOST FLAPPING ALERT"),
78 add_ignored("SERVICE FLAPPING ALERT"),
79 add_ignored("SERVICE EVENT HANDLER"),
80 add_ignored("HOST EVENT HANDLER"),
81 add_ignored("LOG VERSION"),
83 add_code(5, "HOST NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_HOST),
84 add_code(6, "SERVICE NOTIFICATION", NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE),
85 add_code(3, "PASSIVE HOST CHECK", NEBTYPE_HOSTCHECK_PROCESSED),
86 add_code(4, "PASSIVE SERVICE CHECK", NEBTYPE_SERVICECHECK_PROCESSED),
87 add_code(0, "EXTERNAL COMMAND", NEBTYPE_EXTERNALCOMMAND_END),
88 add_code(5, "HOST ALERT", NEBTYPE_HOSTCHECK_PROCESSED),
89 add_code(5, "INITIAL HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
90 add_code(5, "CURRENT HOST STATE", NEBTYPE_HOSTCHECK_PROCESSED),
91 add_code(6, "SERVICE ALERT", NEBTYPE_SERVICECHECK_PROCESSED),
92 add_code(6, "INITIAL SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
93 add_code(6, "CURRENT SERVICE STATE", NEBTYPE_SERVICECHECK_PROCESSED),
94 add_code(3, "HOST DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST),
95 add_code(4, "SERVICE DOWNTIME ALERT", NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE),
96 { 0, NULL, 0, 0 },
/*
 * External commands the importer reacts to: downtime scheduling and
 * deletion plus problem acknowledgements. As with event_codes, the
 * first add_cdef() argument looks like the expected field count.
 * The all-zero entry ends the table.
 */
99 static struct string_code command_codes[] = {
100 add_cdef(1, DEL_HOST_DOWNTIME),
101 add_cdef(1, DEL_SVC_DOWNTIME),
102 add_cdef(8, SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME),
103 add_cdef(8, SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME),
104 add_cdef(8, SCHEDULE_HOSTGROUP_HOST_DOWNTIME),
105 add_cdef(8, SCHEDULE_HOSTGROUP_SVC_DOWNTIME),
106 add_cdef(8, SCHEDULE_HOST_DOWNTIME),
107 add_cdef(8, SCHEDULE_HOST_SVC_DOWNTIME),
108 add_cdef(8, SCHEDULE_SERVICEGROUP_HOST_DOWNTIME),
109 add_cdef(8, SCHEDULE_SERVICEGROUP_SVC_DOWNTIME),
110 add_cdef(8, SCHEDULE_SVC_DOWNTIME),
113 * These really have one more field than listed here. We omit one
114 * to make author and comment concatenated with a semi-colon by default.
116 add_cdef(6, ACKNOWLEDGE_SVC_PROBLEM),
117 add_cdef(5, ACKNOWLEDGE_HOST_PROBLEM),
118 { 0, NULL, 0, 0 },
/*
 * Debugging aid: dump the first 'n' entries of the string vector 'v'
 * to stdout, one "v[ i]: value" line per entry.
 */
static inline void print_strvec(char **v, int n)
{
	int idx = 0;

	while (idx < n) {
		printf("v[%2d]: %s\n", idx, v[idx]);
		idx++;
	}
}
/*
 * Format a byte count as a human readable string ("512 bytes",
 * "1.50 KiB", "2.00 GiB", ...).
 *
 * The result lives in one of two static buffers that are used
 * alternately, so up to two results can be passed to the same
 * printf() call. Not reentrant.
 */
static const char *tobytes(uint64_t n)
{
	/* a real array, so that sizeof() yields the number of suffixes */
	static const char suffix[] = "KMGTPE";
	static char tbuf[2][30];
	static int t = 0;
	size_t shift = 1;

	t ^= 1;
	if (n < 1024) {
		snprintf(tbuf[t], sizeof(tbuf[t]), "%llu bytes",
			 (unsigned long long)n);
		return tbuf[t];
	}

	/* step up one unit for as long as the value is at least 1024 of it */
	while ((n >> (shift * 10)) >= 1024 && shift < sizeof(suffix) - 1)
		shift++;

	/* the divisor must be 64 bits wide; 1 << 40 overflows a plain int */
	snprintf(tbuf[t], sizeof(tbuf[t]), "%0.2f %ciB",
		 (double)n / (double)((uint64_t)1 << (shift * 10)),
		 suffix[shift - 1]);

	return tbuf[t];
}
/*
 * Format the time elapsed between 'start' and 'stop' as a string such
 * as "3.142s", "2m 1.500s" or "1d 0h 3m 0.250s".
 *
 * The microseconds are folded in before the value is split into
 * days/hours/minutes, so a borrow from the seconds is carried through
 * properly. A negative interval is reported as zero.
 *
 * Returns a pointer to a static buffer; not reentrant.
 */
static const char *tv_delta(struct timeval *start, struct timeval *stop)
{
	static char buf[30];
	double secs;
	unsigned int days, hours, mins;

	secs = (double)(stop->tv_sec - start->tv_sec)
		+ (double)(stop->tv_usec - start->tv_usec) / 1000000.0;
	if (secs < 0)
		secs = 0;

	days = (unsigned int)(secs / 86400);
	secs -= days * 86400.0;
	hours = (unsigned int)(secs / 3600);
	secs -= hours * 3600.0;
	mins = (unsigned int)(secs / 60);
	secs -= mins * 60.0;

	if (!mins && !hours && !days) {
		snprintf(buf, sizeof(buf), "%.3lfs", secs);
	} else if (!hours && !days) {
		snprintf(buf, sizeof(buf), "%um %.3lfs", mins, secs);
	} else if (!days) {
		snprintf(buf, sizeof(buf), "%uh %um %.3lfs", hours, mins, secs);
	} else {
		snprintf(buf, sizeof(buf), "%ud %uh %um %.3lfs",
			 days, hours, mins, secs);
	}

	return buf;
}
183 static void show_progress(void)
185 time_t eta, elapsed;
186 float pct_done, real_pct_done;
188 totlines += lines_since_progress;
189 lines_since_progress = 0;
191 if (!do_progress)
192 return;
194 elapsed = time(NULL) - import_start.tv_sec;
195 if (!elapsed)
196 elapsed = 1;
198 real_pct_done = (float)imported / (float)(totsize - skipped) * 100;
199 pct_done = ((float)(imported + skipped) / (float)totsize) * 100;
200 eta = (elapsed / real_pct_done) * (100.0 - real_pct_done);
202 printf("\rImporting data: %.2f%% (%s) done ",
203 pct_done, tobytes(imported + skipped));
204 if (elapsed > 10) {
205 printf("ETA: ");
206 if (eta > 60)
207 printf("%lum%lus", eta / 60, eta % 60);
208 else
209 printf("%lus", eta);
211 printf(" ");
214 static void end_progress(void)
216 struct timeval tv;
218 gettimeofday(&tv, NULL);
221 * If any of the logfiles doesn't have a newline
222 * at end of file, imported will be slightly off.
223 * We set it hard here so as to make sure that
224 * the final progress output stops at exactly 100%
226 imported = totsize - skipped;
228 show_progress();
229 putchar('\n');
230 printf("%s, %llu lines imported in %s.",
231 tobytes(totsize), totlines, tv_delta(&import_start, &tv));
232 if (skipped)
233 printf(" %s in %u files skipped.", tobytes(skipped), skipped_files);
234 putchar('\n');
237 static int use_sql = 1, indexes_disabled;
238 static void disable_indexes(void)
240 if (indexes_disabled)
241 return;
244 * if we're more than 95% done before inserting anything,
245 * such as might be the case when running an incremental
246 * import, we might as well not bother with disabling
247 * the indexes, since enabling them again can take quite
248 * a long time
250 if (((float)(skipped + imported) / (float)totsize) * 100 >= 95.0)
251 return;
254 * We lock the table we'll be working with and disable
255 * indexes on it. Otherwise doing the actual inserts
256 * will take just about forever, as MySQL has to update
257 * and flush the index cache between each operation.
259 if (sql_query("ALTER TABLE %s DISABLE KEYS", sql_table_name()))
260 crash("Failed to disable keys: %s", sql_error());
261 if (sql_query("LOCK TABLES %s WRITE", sql_table_name()))
262 crash("Failed to lock table %s: %s", sql_table_name(), sql_error());
264 indexes_disabled = 1;
267 static void enable_indexes(void)
269 SQL_RESULT *res;
270 SQL_ROW row;
271 unsigned long entries;
272 time_t start;
274 sql_query("SELECT id FROM %s ORDER BY id DESC LIMIT 1", sql_table_name());
275 if (!(res = sql_get_result()))
276 entries = 0;
277 else {
278 row = sql_fetch_row(res);
279 entries = strtoul(row[0], NULL, 0);
280 sql_free_result(res);
283 signal(SIGINT, SIG_IGN);
284 sql_query("UNLOCK TABLES");
285 start = time(NULL);
286 printf("Creating sql table indexes. This will likely take ~%lu seconds\n",
287 (entries / 50000) + 1);
288 sql_query("ALTER TABLE %s ENABLE KEYS", sql_table_name());
289 printf("%lu database entries indexed in %lu seconds\n",
290 entries, time(NULL) - start);
293 static int insert_downtime_event(int type, char *host, char *service, int id)
295 nebstruct_downtime_data ds;
296 int result;
298 if (!is_interesting_service(host, service))
299 return 0;
301 dt_start += type == NEBTYPE_DOWNTIME_START;
302 dt_stop += type == NEBTYPE_DOWNTIME_STOP;
303 if (dt_depth > max_dt_depth)
304 max_dt_depth = dt_depth;
306 if (!use_sql || only_notifications)
307 return 0;
309 memset(&ds, 0, sizeof(ds));
311 ds.type = type;
312 ds.timestamp.tv_sec = ltime;
313 ds.host_name = host;
314 ds.service_description = service;
315 ds.downtime_id = id;
317 disable_indexes();
318 result = hook_downtime(NEBCALLBACK_DOWNTIME_DATA, (void *)&ds);
319 if (result < 0)
320 crash("Failed to insert downtime:\n type=%d, host=%s, service=%s, id=%d",
321 type, host, service, id);
323 return result;
326 typedef struct import_notification {
327 int type, reason, state;
328 } import_notification;
330 static int parse_import_notification(char *str, import_notification *n)
332 char *state_str = str;
334 n->reason = parse_notification_reason(str);
335 if (n->reason != NOTIFICATION_NORMAL) {
336 char *space, *paren;
338 space = strchr(str, ' ');
339 if (!space)
340 return -1;
341 paren = strchr(space, ')');
342 if (!paren)
343 return -1;
344 *paren = '\0';
346 state_str = space + 2;
349 n->type = SERVICE_NOTIFICATION;
350 n->state = parse_service_state_gently(state_str);
351 if (n->state < 0) {
352 n->type = HOST_NOTIFICATION;
353 n->state = parse_host_state_gently(state_str);
356 return 0;
359 static int insert_notification(struct string_code *sc)
361 int base_idx;
362 const char *desc;
363 struct import_notification n;
365 if (!only_notifications)
366 return 0;
368 if (sc->code - NEBTYPE_NOTIFICATION_END == CONCERNS_SERVICE) {
369 base_idx = 1;
370 desc = strv[2];
371 } else {
372 base_idx = 0;
373 desc = 0;
375 if (parse_import_notification(strv[base_idx + 2], &n) < 0) {
376 handle_unknown_event(strv[base_idx + 2]);
377 return 0;
380 if (!use_sql)
381 return 0;
383 disable_indexes();
384 return sql_query
385 ("INSERT INTO %s.%s("
386 "notification_type, start_time, end_time, contact_name, "
387 "host_name, service_description, "
388 "command_name, output, "
389 "state, reason_type) "
390 "VALUES("
391 "%d, %lu, %lu, '%s', "
392 "'%s', '%s', "
393 "'%s', '%s', "
394 "%d, %d)",
395 sql_db_name(), sql_table_name(),
396 n.type, ltime, ltime, sql_escape(strv[0]),
397 sql_escape(strv[1]), desc ? sql_escape(desc) : "",
398 sql_escape(strv[base_idx + 3]), sql_escape(strv[base_idx + 4]),
399 n.state, n.reason);
402 static int insert_service_check(struct string_code *sc)
404 nebstruct_service_check_data ds;
406 if (!is_interesting_service(strv[0], strv[1]))
407 return 0;
409 memset(&ds, 0, sizeof(ds));
411 ds.timestamp.tv_sec = ltime;
412 ds.type = sc->code;
413 ds.host_name = strv[0];
414 ds.service_description = strv[1];
415 if (sc->nvecs == 4) {
416 /* passive service check result */
417 if (*strv[2] >= '0' && *strv[2] <= '9')
418 ds.state = atoi(strv[2]);
419 else
420 ds.state = parse_service_state(strv[2]);
421 ds.state_type = HARD_STATE;
422 ds.current_attempt = 1;
423 ds.output = strv[3];
424 } else {
425 ds.state = parse_service_state(strv[2]);
426 ds.state_type = soft_hard(strv[3]);
427 ds.current_attempt = atoi(strv[4]);
428 ds.output = strv[5];
431 if (!use_sql || only_notifications)
432 return 0;
434 disable_indexes();
435 return hook_service_result(NEBCALLBACK_SERVICE_CHECK_DATA, (void *)&ds);
438 static int insert_host_check(struct string_code *sc)
440 nebstruct_host_check_data ds;
442 if (!is_interesting_host(strv[0]))
443 return 0;
445 memset(&ds, 0, sizeof(ds));
447 ds.timestamp.tv_sec = ltime;
448 ds.type = sc->code;
449 ds.host_name = strv[0];
450 if (sc->nvecs == 3) {
451 if (*strv[1] >= '0' && *strv[1] <= '9')
452 ds.state = atoi(strv[1]);
453 else
454 ds.state = parse_host_state(strv[1]);
455 /* passive host check result */
456 ds.output = strv[2];
457 ds.current_attempt = 1;
458 ds.state_type = HARD_STATE;
459 } else {
460 ds.state = parse_host_state(strv[1]);
461 ds.state_type = soft_hard(strv[2]);
462 ds.current_attempt = atoi(strv[3]);
463 ds.output = strv[4];
466 if (!use_sql || only_notifications)
467 return 0;
469 disable_indexes();
470 return hook_host_result(NEBCALLBACK_HOST_CHECK_DATA, (void *)&ds);
473 static int insert_process_event(int type)
475 nebstruct_process_data ds;
477 if (!use_sql || only_notifications)
478 return 0;
480 memset(&ds, 0, sizeof(ds));
481 ds.timestamp.tv_sec = ltime;
482 ds.type = type;
483 disable_indexes();
484 return hook_process_data(NEBCALLBACK_PROCESS_DATA, (void *)&ds);
/*
 * Placeholder for importing problem acknowledgements. The command is
 * recognized by parse_line() but nothing is stored. Always returns 0.
 */
static int insert_acknowledgement(struct string_code *sc)
{
	(void)sc;

	return 0;
}
492 static void dt_print(char *tpc, time_t when, struct downtime_entry *dt)
494 if (!debug_level)
495 return;
497 printf("%s: time=%lu started=%lu start=%lu stop=%lu duration=%lu id=%d ",
498 tpc, when, dt->started, dt->start, dt->stop, dt->duration, dt->id);
499 printf("%s", dt->host);
500 if (dt->service)
501 printf(";%s", dt->service);
502 putchar('\n');
505 static struct downtime_entry *last_dte;
506 static struct downtime_entry *del_dte;
508 static void remove_downtime(struct downtime_entry *dt);
509 static int del_matching_dt(void *data)
511 struct downtime_entry *dt = data;
513 if (del_dte->id == dt->id) {
514 dt_print("ALSO", 0, dt);
515 remove_downtime(dt);
516 return HASH_WALK_REMOVE;
519 return 0;
522 static void stash_downtime_command(struct downtime_entry *dt)
524 dt->slot = dt->start % NUM_DENTRIES;
525 dt->next = dentry[dt->slot];
526 dentry[dt->slot] = dt;
529 static void remove_downtime(struct downtime_entry *dt)
531 if (!is_interesting_service(dt->host, dt->service))
532 return;
534 insert_downtime_event(NEBTYPE_DOWNTIME_STOP, dt->host, dt->service, dt->id);
536 dt_print("RM_DT", ltime, dt);
537 dt->purged = 1;
540 static struct downtime_entry *
541 dt_matches_command(struct downtime_entry *dt, char *host, char *service)
543 for (; dt; dt = dt->next) {
544 time_t diff;
546 if (ltime > dt->stop || ltime < dt->start) {
547 continue;
550 switch (dt->code) {
551 case SCHEDULE_SVC_DOWNTIME:
552 if (service && strcmp(service, dt->service))
553 continue;
555 /* fallthrough */
556 case SCHEDULE_HOST_DOWNTIME:
557 case SCHEDULE_HOST_SVC_DOWNTIME:
558 if (strcmp(host, dt->host)) {
559 continue;
562 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
563 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
564 /* these two have host set in dt, but
565 * it will not match all the possible hosts */
567 /* fallthrough */
568 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
569 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
570 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
571 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
572 break;
573 default:
574 crash("dt->code not set properly\n");
578 * Once we get here all the various other criteria have
579 * been matched, so we need to check if the daemon was
580 * running when this downtime was supposed to have
581 * started, and otherwise use the daemon start time
582 * as the value to diff against
584 if (daemon_stop < dt->start && daemon_start > dt->start) {
585 debug("Adjusting dt->start (%lu) to (%lu)\n",
586 dt->start, daemon_start);
587 dt->start = daemon_start;
588 if (dt->trigger && dt->duration)
589 dt->stop = dt->start + dt->duration;
592 diff = ltime - dt->start;
593 if (diff < 3 || dt->trigger || !dt->fixed)
594 return dt;
597 return NULL;
600 static struct downtime_entry *
601 find_downtime_command(char *host, char *service)
603 int i;
604 struct downtime_entry *shortcut = NULL;
606 if (last_dte && last_dte->start == ltime) {
607 shortcut = last_dte;
608 // return last_dte;
610 for (i = 0; i < NUM_DENTRIES; i++) {
611 struct downtime_entry *dt;
612 dt = dt_matches_command(dentry[i], host, service);
613 if (dt) {
614 if (shortcut && dt != shortcut)
615 if (debug_level)
616 printf("FIND shortcut no good\n");
617 last_dte = dt;
618 return dt;
622 debug("FIND not\n");
623 return NULL;
626 static int print_downtime(void *data)
628 struct downtime_entry *dt = (struct downtime_entry *)data;
630 dt_print("UNCLOSED", ltime, dt);
632 return 0;
635 static inline void set_next_dt_purge(time_t base, time_t add)
637 if (!next_dt_purge || next_dt_purge > base + add)
638 next_dt_purge = base + add;
640 if (next_dt_purge <= ltime)
641 next_dt_purge = ltime + 1;
644 static inline void add_downtime(char *host, char *service, int id)
646 struct downtime_entry *dt, *cmd, *old;
648 if (!is_interesting_service(host, service))
649 return;
651 dt = malloc(sizeof(*dt));
652 cmd = find_downtime_command(host, service);
653 if (!cmd) {
654 warn("DT with no ext cmd? %lu %s;%s", ltime, host, service);
655 memset(dt, 0, sizeof(*dt));
656 dt->duration = 7200; /* the default downtime duration in nagios */
657 dt->start = ltime;
658 dt->stop = dt->start + dt->duration;
660 else
661 memcpy(dt, cmd, sizeof(*dt));
663 dt->host = strdup(host);
664 dt->id = id;
665 dt->started = ltime;
667 set_next_dt_purge(ltime, dt->duration);
669 if (!service) {
670 dt->service = NULL;
671 old = hash_update(host_downtime, dt->host, dt);
673 else {
674 dt->service = strdup(service);
675 old = hash_update2(service_downtime, dt->host, dt->service, dt);
678 if (old && old != dt) {
679 free(old->host);
680 if (old->service)
681 free(old->service);
682 free(old);
685 dt_print("IN_DT", ltime, dt);
686 insert_downtime_event(NEBTYPE_DOWNTIME_START, dt->host, dt->service, dt->id);
689 static time_t last_host_dt_del, last_svc_dt_del;
690 static int register_downtime_command(struct string_code *sc)
692 struct downtime_entry *dt;
693 char *start_time, *end_time, *duration = NULL;
694 char *host = NULL, *service = NULL, *fixed, *triggered_by = NULL;
695 time_t foo;
697 switch (sc->code) {
698 case DEL_HOST_DOWNTIME:
699 last_host_dt_del = ltime;
700 return 0;
701 case DEL_SVC_DOWNTIME:
702 last_svc_dt_del = ltime;
703 return 0;
705 case SCHEDULE_HOST_DOWNTIME:
706 if (strtotimet(strv[5], &foo))
707 duration = strv[4];
708 /* fallthrough */
709 case SCHEDULE_AND_PROPAGATE_HOST_DOWNTIME:
710 case SCHEDULE_AND_PROPAGATE_TRIGGERED_HOST_DOWNTIME:
711 case SCHEDULE_HOST_SVC_DOWNTIME:
712 host = strv[0];
713 /* fallthrough */
714 case SCHEDULE_HOSTGROUP_HOST_DOWNTIME:
715 case SCHEDULE_HOSTGROUP_SVC_DOWNTIME:
716 case SCHEDULE_SERVICEGROUP_HOST_DOWNTIME:
717 case SCHEDULE_SERVICEGROUP_SVC_DOWNTIME:
718 start_time = strv[1];
719 end_time = strv[2];
720 fixed = strv[3];
721 if (strtotimet(strv[5], &foo))
722 triggered_by = strv[4];
723 if (!duration)
724 duration = strv[5];
726 break;
728 case SCHEDULE_SVC_DOWNTIME:
729 host = strv[0];
730 service = strv[1];
731 start_time = strv[2];
732 end_time = strv[3];
733 fixed = strv[4];
734 if (strtotimet(strv[6], &foo)) {
735 triggered_by = strv[5];
736 duration = strv[6];
738 else {
739 duration = strv[5];
741 break;
743 default:
744 crash("Unknown downtime type: %d", sc->code);
747 if (!(dt = calloc(sizeof(*dt), 1)))
748 crash("calloc(%u, 1) failed: %s", (uint)sizeof(*dt), strerror(errno));
750 dt->code = sc->code;
751 if (host)
752 dt->host = strdup(host);
753 if (service)
754 dt->service = strdup(service);
756 dt->trigger = triggered_by ? !!(*triggered_by - '0') : 0;
757 if (strtotimet(start_time, &dt->start) || strtotimet(end_time, &dt->stop))
759 print_strvec(strv, sc->nvecs);
760 crash("strtotime(): type: %s; start_time='%s'; end_time='%s'; duration='%s';",
761 command_codes[sc->code - 1].str, start_time, end_time, duration);
765 * sometimes downtime commands can be logged according to
766 * log version 1, while the log still claims to be version 2.
767 * Apparently, this happens when using a daemon supporting
768 * version 2 logging but a downtime command is added that
769 * follows the version 1 standard.
770 * As such, we simply ignore the result of the "duration"
771 * field conversion and just accept that it might not work
773 (void)strtotimet(duration, &dt->duration);
774 dt->fixed = *fixed - '0';
777 * ignore downtime scheduled to take place in the future.
778 * It will be picked up by the module anyways
780 if (dt->start > time(NULL)) {
781 free(dt);
782 return 0;
785 if (dt->duration > time(NULL)) {
786 warn("Bizarrely large duration (%lu)", dt->duration);
788 if (dt->start < ltime) {
789 if (dt->duration && dt->duration > ltime - dt->start)
790 dt->duration -= ltime - dt->start;
792 dt->start = ltime;
794 if (dt->stop < ltime || dt->stop < dt->start) {
795 /* retroactively scheduled downtime, or just plain wrong */
796 dt->stop = dt->start;
797 dt->duration = 0;
800 if (dt->fixed && dt->duration != dt->stop - dt->start) {
801 // warn("duration doesn't match stop - start: (%lu : %lu)",
802 // dt->duration, dt->stop - dt->start);
804 dt->duration = dt->stop - dt->start;
806 else if (dt->duration > 86400 * 14) {
807 warn("Oddly long duration: %lu", dt->duration);
810 debug("start=%lu; stop=%lu; duration=%lu; fixed=%d; trigger=%d; host=%s service=%s\n",
811 dt->start, dt->stop, dt->duration, dt->fixed, dt->trigger, dt->host, dt->service);
813 stash_downtime_command(dt);
814 return 0;
817 static int insert_downtime(struct string_code *sc)
819 int type;
820 struct downtime_entry *dt = NULL;
821 int id = 0;
822 time_t dt_del_cmd;
823 char *host, *service = NULL;
825 host = strv[0];
826 if (sc->nvecs == 4) {
827 service = strv[1];
828 dt = hash_find2(service_downtime, host, service);
830 else
831 dt = hash_find(host_downtime, host);
834 * to stop a downtime we can either get STOPPED or
835 * CANCELLED. So far, I've only ever seen STARTED
836 * for when it actually starts though, and since
837 * the Nagios daemon is reponsible for launching
838 * it, it's unlikely there are more variants of
839 * that string
841 type = NEBTYPE_DOWNTIME_STOP;
842 if (!strcmp(strv[sc->nvecs - 2], "STARTED"))
843 type = NEBTYPE_DOWNTIME_START;
845 switch (type) {
846 case NEBTYPE_DOWNTIME_START:
847 if (dt) {
848 if (!probably_ignore_downtime)
849 dt_print("ALRDY", ltime, dt);
850 return 0;
853 if (probably_ignore_downtime)
854 debug("Should probably ignore this downtime: %lu : %lu %s;%s\n",
855 probably_ignore_downtime, ltime, host, service);
857 if (ltime - last_downtime_start > 1)
858 downtime_id++;
860 id = downtime_id;
861 add_downtime(host, service, id);
862 last_downtime_start = ltime;
863 break;
865 case NEBTYPE_DOWNTIME_STOP:
866 if (!dt) {
868 * this can happen when overlapping downtime entries
869 * occur, and the start event for the second (or nth)
870 * downtime starts before the first downtime has had
871 * a stop event. It basically means we've almost
872 * certainly done something wrong.
874 //printf("no dt. ds.host_name == '%s'\n", ds.host_name);
875 //fprintf(stderr, "CRASHING: %s;%s\n", ds.host_name, ds.service_description);
876 //crash("DOWNTIME_STOP without matching DOWNTIME_START");
877 dt_skip++;
878 return 0;
881 dt_del_cmd = !dt->service ? last_host_dt_del : last_svc_dt_del;
883 if ((ltime - dt_del_cmd) > 1 && dt->duration - (ltime - dt->started) > 60) {
884 debug("Short dt duration (%lu) for %s;%s (dt->duration=%lu)\n",
885 ltime - dt->started, dt->host, dt->service, dt->duration);
887 if (ltime - dt->started > dt->duration + DT_PURGE_GRACETIME)
888 dt_print("Long", ltime, dt);
890 remove_downtime(dt);
892 * Now delete whatever matching downtimes we can find.
893 * this must be here, or we'll recurse like crazy into
894 * remove_downtime(), possibly exhausting the stack
895 * frame buffer
897 del_dte = dt;
898 if (!dt->service)
899 hash_walk_data(host_downtime, del_matching_dt);
900 else
901 hash_walk_data(service_downtime, del_matching_dt);
902 break;
904 default:
905 return -1;
908 return 0;
911 static int dt_purged;
912 static int purge_expired_dt(void *data)
914 struct downtime_entry *dt = data;
916 if (dt->purged) {
917 dt_skip++;
918 return 0;
921 set_next_dt_purge(dt->started, dt->duration);
923 if (ltime + DT_PURGE_GRACETIME > dt->stop) {
924 dt_purged++;
925 debug("PURGE %lu: purging expired dt %d (start=%lu; started=%lu; stop=%lu; duration=%lu; host=%s; service=%s",
926 ltime, dt->id, dt->start, dt->started, dt->stop, dt->duration, dt->host, dt->service);
927 remove_downtime(dt);
928 return HASH_WALK_REMOVE;
930 else {
931 dt_print("PURGED_NOT_TIME", ltime, dt);
934 return 0;
937 static int purged_downtimes;
938 static void purge_expired_downtime(void)
940 int tot_purged = 0;
942 next_dt_purge = 0;
943 dt_purged = 0;
944 hash_walk_data(host_downtime, purge_expired_dt);
945 if (dt_purged)
946 debug("PURGE %d host downtimes purged", dt_purged);
947 tot_purged += dt_purged;
948 dt_purged = 0;
949 hash_walk_data(service_downtime, purge_expired_dt);
950 if (dt_purged)
951 debug("PURGE %d service downtimes purged", dt_purged);
952 tot_purged += dt_purged;
953 if (tot_purged)
954 debug("PURGE total %d entries purged", tot_purged);
956 if (next_dt_purge)
957 debug("PURGE next downtime purge supposed to run @ %lu, in %lu seconds",
958 next_dt_purge, next_dt_purge - ltime);
960 purged_downtimes += tot_purged;
963 static inline void handle_start_event(void)
965 if (!daemon_is_running)
966 insert_process_event(NEBTYPE_PROCESS_START);
968 probably_ignore_downtime = daemon_start = ltime;
969 daemon_is_running = 1;
972 static inline void handle_stop_event(void)
974 if (daemon_is_running) {
975 insert_process_event(NEBTYPE_PROCESS_SHUTDOWN);
976 daemon_is_running = 0;
978 daemon_stop = ltime;
981 static int parse_line(char *line, uint len)
983 char *ptr, *colon;
984 int nvecs = 0;
985 struct string_code *sc;
986 static time_t last_ltime = 0;
988 imported += len + 1; /* make up for 1 lost byte per newline */
990 /* ignore empty lines */
991 if (!len)
992 return 0;
994 if (++lines_since_progress >= PROGRESS_INTERVAL)
995 show_progress();
997 /* skip obviously bogus lines */
998 if (len < 12 || *line != '[') {
999 warn("line %d; len too short, or line doesn't start with '[' (%s)", line_no, line);
1000 return -1;
1003 ltime = strtoul(line + 1, &ptr, 10);
1004 if (line + 1 == ptr) {
1005 crash("Failed to parse log timestamp from '%s'. I can't handle malformed logdata", line);
1006 return -1;
1009 if (ltime < last_ltime) {
1010 // warn("ltime < last_ltime (%lu < %lu) by %lu. Compensating...",
1011 // ltime, last_ltime, last_ltime - ltime);
1012 ltime = last_ltime;
1014 else
1015 last_ltime = ltime;
1018 * Incremental will be 0 if not set, or 1 if set but
1019 * the database is currently empty.
1020 * Note that this will not always do the correct thing,
1021 * as downtime entries that might have been scheduled for
1022 * purging may never show up as "stopped" in the database
1023 * with this scheme. As such, incremental imports absolutely
1024 * require that nothing is in scheduled downtime when the
1025 * import is running (well, started really, but it amounts
1026 * to the same thing).
1028 if (ltime < incremental)
1029 return 0;
1031 if (next_dt_purge && ltime >= next_dt_purge)
1032 purge_expired_downtime();
1034 if (probably_ignore_downtime && ltime - probably_ignore_downtime > 1)
1035 probably_ignore_downtime = 0;
1037 while (*ptr == ']' || *ptr == ' ')
1038 ptr++;
1040 if (!is_interesting(ptr))
1041 return 0;
1043 if (!(colon = strchr(ptr, ':'))) {
1044 /* stupid heuristic, but might be good for something,
1045 * somewhere, sometime. if nothing else, it should suppress
1046 * annoying output */
1047 if (is_start_event(ptr)) {
1048 handle_start_event();
1049 return 0;
1051 if (is_stop_event(ptr)) {
1052 handle_stop_event();
1053 return 0;
1057 * An unhandled event. We should probably crash here
1059 handle_unknown_event(line);
1060 return -1;
1063 /* an event happened without us having gotten a start-event */
1064 if (!daemon_is_running) {
1065 insert_process_event(NEBTYPE_PROCESS_START);
1066 daemon_start = ltime;
1067 daemon_is_running = 1;
1070 if (!(sc = get_event_type(ptr, colon - ptr))) {
1071 handle_unknown_event(line);
1072 return -1;
1075 if (sc->code == IGNORE_LINE)
1076 return 0;
1078 *colon = 0;
1079 ptr = colon + 1;
1080 while (*ptr == ' ')
1081 ptr++;
1083 if (sc->nvecs) {
1084 int i;
1086 nvecs = vectorize_string(ptr, sc->nvecs);
1088 if (nvecs != sc->nvecs) {
1089 /* broken line */
1090 warn("Line %d in %s seems to not have all the fields it should",
1091 line_no, cur_file->path);
1092 return -1;
1095 for (i = 0; i < sc->nvecs; i++) {
1096 if (!strv[i]) {
1097 /* this should never happen */
1098 warn("Line %d in %s seems to be broken, or we failed to parse it into a vector",
1099 line_no, cur_file->path);
1100 return -1;
1105 switch (sc->code) {
1106 char *semi_colon;
1108 case NEBTYPE_EXTERNALCOMMAND_END:
1109 semi_colon = strchr(ptr, ';');
1110 if (!semi_colon)
1111 return 0;
1112 if (!(sc = get_command_type(ptr, semi_colon - ptr))) {
1113 return 0;
1115 if (sc->code == RESTART_PROGRAM) {
1116 handle_stop_event();
1117 return 0;
1120 nvecs = vectorize_string(semi_colon + 1, sc->nvecs);
1121 if (nvecs != sc->nvecs) {
1122 warn("nvecs discrepancy: %d vs %d (%s)\n", nvecs, sc->nvecs, ptr);
1124 if (sc->code != ACKNOWLEDGE_HOST_PROBLEM &&
1125 sc->code != ACKNOWLEDGE_SVC_PROBLEM)
1127 register_downtime_command(sc);
1128 } else {
1129 insert_acknowledgement(sc);
1131 break;
1133 case NEBTYPE_HOSTCHECK_PROCESSED:
1134 return insert_host_check(sc);
1136 case NEBTYPE_SERVICECHECK_PROCESSED:
1137 return insert_service_check(sc);
1139 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_HOST:
1140 case NEBTYPE_DOWNTIME_LOAD + CONCERNS_SERVICE:
1141 return insert_downtime(sc);
1143 case NEBTYPE_NOTIFICATION_END + CONCERNS_HOST:
1144 case NEBTYPE_NOTIFICATION_END + CONCERNS_SERVICE:
1145 return insert_notification(sc);
1147 case IGNORE_LINE:
1148 return 0;
1151 return 0;
1154 static int parse_one_line(char *str, uint len)
1156 if (parse_line(str, len) && use_sql && sql_errno())
1157 crash("sql error: %s", sql_error());
1159 return 0;
1162 static int hash_one_line(char *line, uint len)
1164 return add_interesting_object(line);
1167 static int hash_interesting(const char *path)
1169 struct stat st;
1171 if (stat(path, &st) < 0)
1172 crash("failed to stat %s: %s", path, strerror(errno));
1174 lparse_path(path, st.st_size, hash_one_line);
1176 return 0;
extern const char *__progname;

/*
 * Print an optional printf-style message followed by the usage text,
 * then exit. The exit code is 1 if a non-empty message was given and
 * 0 otherwise, as when help was asked for.
 */
__attribute__((__format__(__printf__, 1, 2)))
static void usage(const char *fmt, ...)
{
	static const char *const help_text[] = {
		" [logfiles] refers to all the nagios logfiles you want to import\n",
		" If --nagios-cfg is given or can be inferred no logfiles need to be supplied\n",
		"\nOptions:\n",
		" --help this cruft\n",
		" --no-progress don't display progress output\n",
		" --no-sql don't access the database\n",
		" --db-name database name\n",
		" --db-table database table name\n",
		" --db-user database user\n",
		" --db-pass database password\n",
		" --incremental perform an incremental import\n",
		" --truncate-db truncate database before importing\n",
		" --only-notifications only import notifications\n",
		" --nagios-cfg=</path/to/nagios.cfg> path to nagios.cfg\n",
		"\n\n",
	};
	int have_msg = fmt && *fmt;
	size_t i;

	if (have_msg) {
		va_list ap;

		va_start(ap, fmt);
		vfprintf(stdout, fmt, ap);
		va_end(ap);
	}

	printf("Usage %s [options] [logfiles]\n\n", __progname);
	for (i = 0; i < sizeof(help_text) / sizeof(help_text[0]); i++)
		fputs(help_text[i], stdout);

	exit(have_msg ? 1 : 0);
}
1214 int main(int argc, char **argv)
1216 int i, truncate_db = 0;
1217 const char *nagios_cfg = NULL;
1218 char *db_name, *db_user, *db_pass, *db_table;
1220 db_name = db_user = db_pass = db_table = NULL;
1222 do_progress = isatty(fileno(stdout));
1224 strv = calloc(sizeof(char *), MAX_NVECS);
1225 dentry = calloc(sizeof(*dentry), NUM_DENTRIES);
1226 if (!strv || !dentry)
1227 crash("Failed to alloc initial structs");
1230 for (num_nfile = 0,i = 1; i < argc; i++) {
1231 char *opt, *arg = argv[i];
1232 int arg_len, eq_opt = 0;
1234 if ((opt = strchr(arg, '='))) {
1235 *opt++ = '\0';
1236 eq_opt = 1;
1238 else if (i < argc - 1) {
1239 opt = argv[i + 1];
1242 if (!prefixcmp(arg, "-h") || !prefixcmp(arg, "--help")) {
1243 usage(NULL);
1245 if (!prefixcmp(arg, "--incremental")) {
1246 incremental = 1;
1249 * nifty for debugging --incremental skipping log-files
1250 * The value will be overwritten unless --no-sql is also
1251 * in effect
1253 if (eq_opt) {
1254 incremental = strtoul(opt, NULL, 0);
1255 if (!incremental)
1256 usage("--incremental= requires a parameter");
1258 continue;
1260 if (!prefixcmp(arg, "--no-sql")) {
1261 use_sql = 0;
1262 continue;
1264 if (!prefixcmp(arg, "--only-notifications")) {
1265 only_notifications = 1;
1266 db_name = db_name ? db_name : "merlin";
1267 db_user = db_user ? db_user : "merlin";
1268 db_pass = db_pass ? db_pass : "merlin";
1269 db_table = db_table ? db_table : "notification";
1270 continue;
1272 if (!prefixcmp(arg, "--no-progress")) {
1273 do_progress = 0;
1274 continue;
1276 if (!prefixcmp(arg, "--debug") || !prefixcmp(arg, "-d")) {
1277 do_progress = 0;
1278 debug_level++;
1279 continue;
1281 if (!prefixcmp(arg, "--truncate-db")) {
1282 truncate_db = 1;
1283 continue;
1285 if (!prefixcmp(arg, "--nagios-cfg")) {
1286 if (!opt || !*opt) {
1287 crash("%s requires the path to nagios.cfg as argument", arg);
1289 nagios_cfg = opt;
1290 if (opt && !eq_opt)
1291 i++;
1292 continue;
1294 if (!prefixcmp(arg, "--db-name")) {
1295 if (!opt || !*opt)
1296 crash("%s requires a database name as an argument", arg);
1297 db_name = opt;
1298 if (opt && !eq_opt)
1299 i++;
1300 continue;
1302 if (!prefixcmp(arg, "--db-user")) {
1303 if (!opt || !*opt)
1304 crash("%s requires a database username as argument", arg);
1305 db_user = opt;
1306 if (opt && !eq_opt)
1307 i++;
1308 continue;
1310 if (!prefixcmp(arg, "--db-pass")) {
1311 if (!opt || !*opt)
1312 crash("%s requires a database username as argument", arg);
1313 db_pass = opt;
1314 if (opt && !eq_opt)
1315 i++;
1316 continue;
1318 if (!prefixcmp(arg, "--db-table")) {
1319 if (!opt || !*opt)
1320 crash("%s requires a database table name as argument", arg);
1321 db_table = opt;
1322 if (opt && !eq_opt)
1323 i++;
1324 continue;
1326 if (!prefixcmp(arg, "--interesting") || !prefixcmp(arg, "-i")) {
1327 if (!opt || !*opt)
1328 crash("%s requires a filename as argument", arg);
1329 hash_interesting(opt);
1330 if (opt && !eq_opt)
1331 i++;
1332 continue;
1335 /* non-argument, so treat as a config- or log-file */
1336 arg_len = strlen(arg);
1337 if (arg_len >= 10 && !strcmp(&arg[arg_len - 10], "nagios.cfg")) {
1338 nagios_cfg = arg;
1339 } else {
1340 add_naglog_path(arg);
1344 /* fallback for op5 systems */
1345 if (!nagios_cfg && !num_nfile) {
1346 nagios_cfg = "/opt/monitor/etc/nagios.cfg";
1348 if (nagios_cfg) {
1349 struct cfg_comp *conf;
1350 conf = cfg_parse_file(nagios_cfg);
1351 for (i = 0; i < conf->vars; i++) {
1352 struct cfg_var *v = conf->vlist[i];
1353 if (!strcmp(v->key, "log_file")) {
1354 add_naglog_path(v->value);
1356 if (!strcmp(v->key, "log_archive_path")) {
1357 add_naglog_path(v->value);
1362 if (use_sql) {
1363 db_name = db_name ? db_name : "monitor_reports";
1364 db_user = db_user ? db_user : "monitor";
1365 db_pass = db_pass ? db_pass : "monitor";
1366 db_table = db_table ? db_table : "report_data";
1367 sql_config("db_database", db_name);
1368 sql_config("db_user", db_user);
1369 sql_config("db_pass", db_pass);
1370 sql_config("db_table", db_table);
1372 if (sql_init() < 0)
1373 crash("sql_init() failed");
1374 if (truncate_db)
1375 sql_query("TRUNCATE %s", sql_table_name());
1377 if (incremental) {
1378 MYSQL_RES *result;
1379 MYSQL_ROW row;
1380 sql_query("SELECT %s FROM %s.%s ORDER BY %s DESC LIMIT 1",
1381 only_notifications ? "end_time" : "timestamp",
1382 db_name, db_table,
1383 only_notifications ? "end_time" : "timestamp");
1385 if (!(result = sql_get_result()))
1386 crash("Failed to get last timestamp: %s\n", sql_error());
1388 /* someone might use --incremental with an empty
1389 * database. We shouldn't crash in that case */
1390 if ((row = sql_fetch_row(result)))
1391 incremental = strtoul(row[0], NULL, 0);
1393 sql_free_result(result);
1397 log_grok_var("logfile", "/dev/null");
1398 log_grok_var("log_levels", "warn");
1400 if (!num_nfile)
1401 crash("Usage: %s [--incremental] [--interesting <file>] [--truncate-db] logfiles\n",
1402 __progname);
1404 if (log_init() < 0)
1405 crash("log_init() failed");
1407 qsort(nfile, num_nfile, sizeof(*nfile), nfile_cmp);
1409 host_downtime = hash_init(HASH_TABLE_SIZE);
1410 service_downtime = hash_init(HASH_TABLE_SIZE);
1412 if (hook_init() < 0)
1413 crash("Failed to initialize hooks");
1415 /* go through them once to count the total size for progress output */
1416 for (i = 0; i < num_nfile; i++) {
1417 totsize += nfile[i].size;
1420 gettimeofday(&import_start, NULL);
1421 printf("Importing %s of data from %d files\n",
1422 tobytes(totsize), num_nfile);
1424 for (i = 0; i < num_nfile; i++) {
1425 struct naglog_file *nf = &nfile[i];
1426 cur_file = nf;
1427 show_progress();
1430 * skip parsing files if they're not interesting, such
1431 * as during incremental imports.
1432 * 'incremental' will be 0 if we're doing a full import,
1433 * 1 if we're doing an incremental but the database is
1434 * empty and will contain the timestamp of the latest
1435 * entry in the database if we're doing an incremental
1436 * import to a populated table.
1437 * Note that we can never skip the last file in the list,
1438 * although the lparse routine should sift through it
1439 * pretty quickly in case it has nothing interesting.
1441 if (i + 1 < num_nfile && incremental > nfile[i + 1].first) {
1442 skipped_files++;
1443 skipped += nf->size;
1444 continue;
1446 debug("importing from %s (%lu : %u)\n", nf->path, nf->first, nf->cmp);
1447 line_no = 0;
1448 lparse_path(nf->path, nf->size, parse_one_line);
1449 imported++; /* make up for one lost byte per file */
1452 ltime = time(NULL);
1453 purge_expired_downtime();
1454 end_progress();
1456 if (debug_level) {
1457 if (dt_depth) {
1458 printf("Unclosed host downtimes:\n");
1459 puts("------------------------");
1460 hash_walk_data(host_downtime, print_downtime);
1461 printf("Unclosed service downtimes:\n");
1462 puts("---------------------------");
1463 hash_walk_data(service_downtime, print_downtime);
1465 printf("dt_depth: %d\n", dt_depth);
1467 printf("purged downtimes: %d\n", purged_downtimes);
1468 printf("max simultaneous host downtime hashes: %u\n",
1469 hash_entries_max(host_downtime));
1470 printf("max simultaneous service downtime hashes: %u\n",
1471 hash_entries_max(service_downtime));
1472 printf("max downtime depth: %u\n", max_dt_depth);
1475 if (use_sql) {
1476 enable_indexes();
1477 sql_close();
1480 if (warnings && debug_level)
1481 fprintf(stderr, "Total warnings: %d\n", warnings);
1483 if (debug_level || dt_start > dt_stop) {
1484 uint count;
1485 fprintf(stderr, "Downtime data %s\n started: %d\n stopped: %d\n delta : %d\n skipped: %d\n",
1486 dt_depth ? "mismatch!" : "consistent", dt_start, dt_stop, dt_depth, dt_skip);
1487 hash_debug_table(host_downtime, 0);
1488 hash_debug_table(service_downtime, 0);
1489 if ((count = hash_entries(host_downtime))) {
1490 fprintf(stderr, "host_downtime as %u entries remaining\n", count);
1492 if ((count = hash_entries(service_downtime))) {
1493 fprintf(stderr, "service_downtime has %u entries remaining\n", count);
1497 print_unhandled_events();
1499 return 0;