move swap on/off variants into same file
[trinity.git] / watchdog.c
blob65a222fba785a905cb9e82e346c6c58ceab1d9f2
1 #include <time.h>
2 #include <errno.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/wait.h>
12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
15 #include "child.h"
16 #include "files.h"
17 #include "locks.h"
18 #include "log.h"
19 #include "params.h" // quiet_level
20 #include "pids.h"
21 #include "shm.h"
22 #include "syscall.h"
23 #include "tables.h"
24 #include "taint.h"
25 #include "trinity.h" //check_taint
/* PID of the watchdog process; written by init_watchdog() on both sides of the fork. */
pid_t watchdog_pid;

/* Highest per-child syscall count seen so far by the watchdog loop (starts at zero). */
static unsigned long hiscore;
31 static int check_shm_sanity(void)
33 unsigned int i;
35 if (shm->running_childs == 0)
36 return SHM_OK;
38 for_each_pidslot(i) {
39 pid_t pid;
41 pid = shm->pids[i];
42 if (pid == EMPTY_PIDSLOT)
43 continue;
45 if (pid_is_valid(pid) == FALSE) {
46 shm->exit_reason = EXIT_PID_OUT_OF_RANGE;
47 return SHM_CORRUPT;
51 // FIXME: The '500000' is magic, and should be dynamically calculated.
52 // On startup, we should figure out how many getpid()'s per second we can do,
53 // and use that.
54 if (shm->total_syscalls_done - shm->previous_count > 500000) {
55 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
56 shm->previous_count, shm->total_syscalls_done);
57 shm->exit_reason = EXIT_SHM_CORRUPTION;
59 shm->previous_count = shm->total_syscalls_done;
61 return SHM_OK;
64 static unsigned int reap_dead_kids(void)
66 unsigned int i;
67 unsigned int alive = 0;
68 unsigned int reaped = 0;
70 for_each_pidslot(i) {
71 pid_t pid;
72 int ret;
74 pid = shm->pids[i];
75 if (pid == EMPTY_PIDSLOT)
76 continue;
78 ret = kill(pid, 0);
79 /* If it disappeared, reap it. */
80 if (ret == -1) {
81 if (errno == ESRCH) {
82 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid);
83 reap_child(pid);
84 reaped++;
85 } else {
86 output(0, "problem checking on pid %d (%d:%s)\n", pid, errno, strerror(errno));
88 } else {
89 alive++;
92 if (shm->running_childs == 0)
93 return 0;
96 if (reaped != 0)
97 output(0, "Reaped %d dead children\n", reaped);
99 return alive;
102 static void kill_all_kids(void)
104 unsigned int i;
106 shm->spawn_no_more = TRUE;
108 /* Wait for all the children to exit. */
109 while (shm->running_childs > 0) {
110 unsigned int alive;
112 /* Make sure there's no dead kids lying around.
113 * We need to do this in case the oom killer has been killing them,
114 * otherwise we end up stuck here with no child processes.
116 alive = reap_dead_kids();
117 if (alive == 0)
118 return;
120 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
121 for_each_pidslot(i) {
122 pid_t pid;
124 pid = shm->pids[i];
125 if (pid == EMPTY_PIDSLOT)
126 continue;
128 kill(pid, SIGKILL);
131 /* wait a second to give kids a chance to exit. */
132 sleep(1);
134 if (check_shm_sanity()) {
135 // FIXME: If we get here, we over-wrote the real exit_reason.
136 // We should have saved that, and handled appropriately.
137 return;
141 /* Just to be sure, clear out the pid slots. */
142 for_each_pidslot(i) {
143 shm->pids[i] = EMPTY_PIDSLOT;
147 static bool __check_main(void)
149 int ret;
151 if (shm->mainpid == 0)
152 return FALSE;
154 ret = kill(shm->mainpid, 0);
155 if (ret == -1) {
156 if (errno == ESRCH) {
157 output(0, "main pid %d has disappeared.\n", shm->mainpid);
158 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
159 shm->mainpid = 0;
161 /* if main crashed while regenerating, we'll hang the watchdog,
162 * because nothing will ever set it back to FALSE. So we do it ourselves.
164 shm->regenerating = FALSE;
165 } else {
166 output(0, "problem checking on pid %d (%d:%s)\n", shm->mainpid, errno, strerror(errno));
168 return FALSE;
170 return TRUE;
173 static int check_main_alive(void)
175 int ret;
177 /* If we're in the process of exiting, wait, and return without checking. */
178 if (shm->exit_reason != STILL_RUNNING) {
179 while (shm->mainpid != 0) {
180 /* make sure main is still alive, to wait for kids. */
181 ret = __check_main();
182 if (ret == TRUE) {
183 sleep(1);
184 kill_all_kids();
187 return FALSE;
190 ret = __check_main();
191 return ret;
194 /* if the first arg was an fd, find out which one it was. */
195 unsigned int check_if_fd(unsigned int child)
197 unsigned int highest;
198 unsigned callno;
199 bool do32;
201 /* shortcut, if it's out of range, it's not going to be valid. */
202 if (shm->a1[child] > 1024)
203 return FALSE;
205 highest = highest_logfile();
206 if (shm->a1[child] < highest)
207 return FALSE;
209 lock(&shm->syscall_lock);
210 callno = shm->syscallno[child];
211 do32 = shm->do32bit[child];
212 unlock(&shm->syscall_lock);
214 if (biarch == FALSE) {
215 if (syscalls[callno].entry->arg1type == ARG_FD)
216 return TRUE;
217 return FALSE;
220 /* biarch case */
221 if (do32 == TRUE) {
222 if (syscalls_32bit[callno].entry->arg1type == ARG_FD)
223 return TRUE;
224 } else {
225 if (callno > max_nr_64bit_syscalls) {
226 output(0, "Weird, child:%d callno:%d (64bit max:%d)\n", child, callno, max_nr_64bit_syscalls);
227 return FALSE;
229 if (syscalls_64bit[callno].entry->arg1type == ARG_FD)
230 return TRUE;
233 return FALSE;
236 static void stuck_syscall_info(int childno)
238 unsigned int callno = shm->syscallno[childno];
239 char fdstr[20];
240 pid_t pid = shm->pids[childno];
242 memset(fdstr, 0, sizeof(fdstr));
244 if (check_if_fd(childno) == TRUE)
245 sprintf(fdstr, "(fd = %d)", (unsigned int) shm->a1[childno]);
247 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
248 pid, callno,
249 print_syscall_name(shm->syscallno[childno], shm->do32bit[childno]),
250 shm->do32bit[childno] ? " (32bit)" : "",
251 fdstr);
254 static void check_children(void)
256 struct timeval tv;
257 time_t diff;
258 time_t old, now;
259 unsigned int i;
261 for_each_pidslot(i) {
262 pid_t pid;
264 pid = shm->pids[i];
266 if (pid == EMPTY_PIDSLOT)
267 continue;
269 old = shm->tv[i].tv_sec;
271 if (old == 0)
272 continue;
274 gettimeofday(&tv, NULL);
275 now = tv.tv_sec;
277 /* if we wrapped, just reset it, we'll pick it up next time around. */
278 if (old > (now + 3)) {
279 output(1, "child %u wrapped! old=%lu now=%lu\n", i, old, now);
280 shm->tv[i].tv_sec = now;
281 continue;
284 diff = now - old;
286 /* if we're way off, we're comparing garbage. Reset it. */
287 if (diff > 1000) {
288 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i, pid, old, now, diff);
289 shm->tv[i].tv_sec = now;
290 continue;
293 /* After 30 seconds of no progress, send a kill signal. */
294 if (diff == 30) {
295 stuck_syscall_info(i);
296 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
297 pid, old, now, diff);
300 if (diff >= 30) {
301 int ret;
303 if (shm->kill_count[i] > 1) {
304 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
305 pid, shm->kill_count[i], diff);
306 } else {
307 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
308 pid, diff);
310 shm->kill_count[i]++;
311 ret = kill(pid, SIGKILL);
312 if (ret != 0) {
313 output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
315 sleep(1); // give child time to exit.
320 static void watchdog(void)
322 static const char watchdogname[17]="trinity-watchdog";
323 static unsigned long lastcount = 0;
324 bool watchdog_exit = FALSE;
325 int ret = 0;
327 while (shm->ready == FALSE) {
328 sleep(1);
329 if (shm->exit_reason != STILL_RUNNING)
330 return;
333 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid);
335 prctl(PR_SET_NAME, (unsigned long) &watchdogname);
336 (void)signal(SIGSEGV, SIG_DFL);
338 while (watchdog_exit == FALSE) {
340 if (check_shm_sanity() == SHM_CORRUPT)
341 goto corrupt;
343 if (check_main_alive() == FALSE)
344 goto main_dead;
346 if (shm->regenerating == FALSE) {
347 unsigned int i;
349 reap_dead_kids();
351 check_children();
353 if (syscalls_todo && (shm->total_syscalls_done >= syscalls_todo)) {
354 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo);
355 shm->exit_reason = EXIT_REACHED_COUNT;
358 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
359 if (shm->total_syscalls_done % 1000 == 0)
360 synclogs();
362 for_each_pidslot(i) {
363 if (shm->child_syscall_count[i] > hiscore)
364 hiscore = shm->child_syscall_count[i];
367 if (shm->total_syscalls_done > 1) {
368 if (shm->total_syscalls_done - lastcount > 10000) {
369 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
370 shm->total_syscalls_done,
371 shm->failures, shm->successes,
372 hiscore);
373 lastcount = shm->total_syscalls_done;
379 /* Only check taint if it mask allows it */
380 if (kernel_taint_mask != 0) {
381 ret = check_tainted();
382 if (((ret & kernel_taint_mask) & (~kernel_taint_initial)) != 0) {
383 gettimeofday(&shm->taint_tv, NULL);
385 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret, kernel_taint_initial, shm->seed);
386 shm->exit_reason = EXIT_KERNEL_TAINTED;
390 if (shm->need_reseed == FALSE) {
391 shm->reseed_counter++;
392 /* If we haven't reseeded in five minutes, trigger one. */
393 if (shm->reseed_counter == 300) {
394 output(0, "Triggering periodic reseed.\n");
395 shm->need_reseed = TRUE;
396 shm->reseed_counter = 0;
400 main_dead:
401 /* Are we done ? */
402 if (shm->exit_reason != STILL_RUNNING) {
403 /* Give children a chance to exit. */
404 sleep(1);
406 /* Are there still children running ? */
407 if (pidmap_empty() == TRUE)
408 watchdog_exit = TRUE;
409 else {
410 output(0, "exit_reason=%d, but %d children still running.\n",
411 shm->exit_reason, shm->running_childs);
412 kill_all_kids();
416 sleep(1);
419 corrupt:
420 /* We don't want to ever exit before main is waiting for us. */
421 while (shm->regenerating == TRUE)
422 sleep(1);
424 kill_all_kids();
427 void init_watchdog(void)
429 pid_t pid;
431 fflush(stdout);
432 pid = fork();
434 if (pid == 0) {
435 watchdog_pid = getpid();
436 watchdog();
437 output(0, "[%d] Watchdog exiting\n", watchdog_pid);
438 _exit(EXIT_SUCCESS);
440 } else {
441 watchdog_pid = pid;
442 output(0, "Started watchdog process, PID is %d\n", watchdog_pid);