note a leak that needs fixing eventually
[trinity.git] / watchdog.c
blob79de75764b41f65a2aa1c1f003b41057be1c110f
1 #include <time.h>
2 #include <errno.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/wait.h>
12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
15 #include "child.h"
16 #include "files.h"
17 #include "locks.h"
18 #include "log.h"
19 #include "params.h" // quiet_level
20 #include "pids.h"
21 #include "shm.h"
22 #include "syscall.h"
23 #include "tables.h"
24 #include "taint.h"
25 #include "trinity.h" //check_taint
/* PID of the watchdog process; init_watchdog() stores it in both the
 * parent and the forked child. */
pid_t watchdog_pid;

/* Largest per-child operation count the watchdog loop has seen so far.
 * Static storage, so it starts out as zero. */
static unsigned long hiscore;
31 static int check_shm_sanity(void)
33 unsigned int i;
35 if (shm->running_childs == 0)
36 return SHM_OK;
38 for_each_pidslot(i) {
39 pid_t pid;
41 pid = shm->pids[i];
42 if (pid == EMPTY_PIDSLOT)
43 continue;
45 if (pid_is_valid(pid) == FALSE) {
46 shm->exit_reason = EXIT_PID_OUT_OF_RANGE;
47 return SHM_CORRUPT;
51 // FIXME: The '500000' is magic, and should be dynamically calculated.
52 // On startup, we should figure out how many getpid()'s per second we can do,
53 // and use that.
54 if (shm->total_syscalls_done - shm->previous_op_count > 500000) {
55 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
56 shm->previous_op_count, shm->total_syscalls_done);
57 shm->exit_reason = EXIT_SHM_CORRUPTION;
59 shm->previous_op_count = shm->total_syscalls_done;
61 return SHM_OK;
64 static unsigned int reap_dead_kids(void)
66 unsigned int i;
67 unsigned int alive = 0;
68 unsigned int reaped = 0;
70 for_each_pidslot(i) {
71 pid_t pid;
72 int ret;
74 pid = shm->pids[i];
75 if (pid == EMPTY_PIDSLOT)
76 continue;
78 ret = kill(pid, 0);
79 /* If it disappeared, reap it. */
80 if (ret == -1) {
81 if (errno == ESRCH) {
82 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid);
83 reap_child(pid);
84 reaped++;
85 } else {
86 output(0, "problem checking on pid %d (%d:%s)\n", pid, errno, strerror(errno));
88 } else {
89 alive++;
92 if (shm->running_childs == 0)
93 return 0;
96 if (reaped != 0)
97 output(0, "Reaped %d dead children\n", reaped);
99 return alive;
102 static void kill_all_kids(void)
104 unsigned int i;
106 shm->spawn_no_more = TRUE;
108 /* Wait for all the children to exit. */
109 while (shm->running_childs > 0) {
110 unsigned int alive;
112 /* Make sure there's no dead kids lying around.
113 * We need to do this in case the oom killer has been killing them,
114 * otherwise we end up stuck here with no child processes.
116 alive = reap_dead_kids();
117 if (alive == 0)
118 return;
120 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
121 for_each_pidslot(i) {
122 pid_t pid;
124 pid = shm->pids[i];
125 if (pid == EMPTY_PIDSLOT)
126 continue;
128 kill(pid, SIGKILL);
131 /* wait a second to give kids a chance to exit. */
132 sleep(1);
134 if (check_shm_sanity()) {
135 // FIXME: If we get here, we over-wrote the real exit_reason.
136 // We should have saved that, and handled appropriately.
137 return;
141 /* Just to be sure, clear out the pid slots. */
142 for_each_pidslot(i) {
143 shm->pids[i] = EMPTY_PIDSLOT;
147 static bool __check_main(void)
149 int ret;
151 if (shm->mainpid == 0)
152 return FALSE;
154 ret = kill(shm->mainpid, 0);
155 if (ret == -1) {
156 if (errno == ESRCH) {
157 output(0, "main pid %d has disappeared.\n", shm->mainpid);
158 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
159 shm->mainpid = 0;
161 /* if main crashed while regenerating, we'll hang the watchdog,
162 * because nothing will ever set it back to FALSE. So we do it ourselves.
164 shm->regenerating = FALSE;
165 } else {
166 output(0, "problem checking on pid %d (%d:%s)\n", shm->mainpid, errno, strerror(errno));
168 return FALSE;
170 return TRUE;
173 static int check_main_alive(void)
175 int ret;
177 /* If we're in the process of exiting, wait, and return without checking. */
178 if (shm->exit_reason != STILL_RUNNING) {
179 while (shm->mainpid != 0) {
180 /* make sure main is still alive, to wait for kids. */
181 ret = __check_main();
182 if (ret == TRUE) {
183 sleep(1);
184 kill_all_kids();
187 return FALSE;
190 ret = __check_main();
191 return ret;
194 /* if the first arg was an fd, find out which one it was. */
195 unsigned int check_if_fd(unsigned int child)
197 unsigned int fd = shm->syscall[child].a1;
198 unsigned int highest;
199 unsigned callno;
200 bool do32;
202 /* shortcut, if it's out of range, it's not going to be valid. */
203 if (fd > 1024)
204 return FALSE;
206 highest = highest_logfile();
207 if (fd < highest)
208 return FALSE;
210 lock(&shm->syscall_lock);
211 callno = shm->syscall[child].nr;
212 do32 = shm->syscall[child].do32bit;
213 unlock(&shm->syscall_lock);
215 if (biarch == FALSE) {
216 if (syscalls[callno].entry->arg1type == ARG_FD)
217 return TRUE;
218 return FALSE;
221 /* biarch case */
222 if (do32 == TRUE) {
223 if (syscalls_32bit[callno].entry->arg1type == ARG_FD)
224 return TRUE;
225 } else {
226 if (callno > max_nr_64bit_syscalls) {
227 output(0, "Weird, child:%d callno:%d (64bit max:%d)\n", child, callno, max_nr_64bit_syscalls);
228 return FALSE;
230 if (syscalls_64bit[callno].entry->arg1type == ARG_FD)
231 return TRUE;
234 return FALSE;
237 static void stuck_syscall_info(int childno)
239 unsigned int callno = shm->syscall[childno].nr;
240 char fdstr[20];
241 pid_t pid = shm->pids[childno];
243 memset(fdstr, 0, sizeof(fdstr));
245 if (check_if_fd(childno) == TRUE)
246 sprintf(fdstr, "(fd = %d)", (unsigned int) shm->syscall[childno].a1);
248 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
249 pid, callno,
250 print_syscall_name(callno, shm->syscall[childno].do32bit),
251 shm->syscall[childno].do32bit ? " (32bit)" : "",
252 fdstr);
255 static void check_children(void)
257 struct timeval tv;
258 time_t diff;
259 time_t old, now;
260 unsigned int i;
262 for_each_pidslot(i) {
263 pid_t pid;
265 pid = shm->pids[i];
267 if (pid == EMPTY_PIDSLOT)
268 continue;
270 old = shm->tv[i].tv_sec;
272 if (old == 0)
273 continue;
275 gettimeofday(&tv, NULL);
276 now = tv.tv_sec;
278 /* if we wrapped, just reset it, we'll pick it up next time around. */
279 if (old > (now + 3)) {
280 output(1, "child %u wrapped! old=%lu now=%lu\n", i, old, now);
281 shm->tv[i].tv_sec = now;
282 continue;
285 diff = now - old;
287 /* if we're way off, we're comparing garbage. Reset it. */
288 if (diff > 1000) {
289 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i, pid, old, now, diff);
290 shm->tv[i].tv_sec = now;
291 continue;
294 /* After 30 seconds of no progress, send a kill signal. */
295 if (diff == 30) {
296 stuck_syscall_info(i);
297 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
298 pid, old, now, diff);
301 if (diff >= 30) {
302 int ret;
304 if (shm->kill_count[i] > 1) {
305 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
306 pid, shm->kill_count[i], diff);
307 } else {
308 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
309 pid, diff);
311 shm->kill_count[i]++;
312 ret = kill(pid, SIGKILL);
313 if (ret != 0) {
314 output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
316 sleep(1); // give child time to exit.
321 static void watchdog(void)
323 static const char watchdogname[17]="trinity-watchdog";
324 static unsigned long lastcount = 0;
325 bool watchdog_exit = FALSE;
326 int ret = 0;
328 while (shm->ready == FALSE) {
329 sleep(1);
330 if (shm->exit_reason != STILL_RUNNING)
331 return;
334 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid);
336 prctl(PR_SET_NAME, (unsigned long) &watchdogname);
337 (void)signal(SIGSEGV, SIG_DFL);
339 while (watchdog_exit == FALSE) {
341 if (check_shm_sanity() == SHM_CORRUPT)
342 goto corrupt;
344 if (check_main_alive() == FALSE)
345 goto main_dead;
347 if (shm->regenerating == FALSE) {
348 unsigned int i;
350 reap_dead_kids();
352 check_children();
354 if (syscalls_todo && (shm->total_syscalls_done >= syscalls_todo)) {
355 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo);
356 shm->exit_reason = EXIT_REACHED_COUNT;
359 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
360 if (shm->total_syscalls_done % 1000 == 0)
361 synclogs();
363 for_each_pidslot(i) {
364 if (shm->child_op_count[i] > hiscore)
365 hiscore = shm->child_op_count[i];
368 if (shm->total_syscalls_done > 1) {
369 if (shm->total_syscalls_done - lastcount > 10000) {
370 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
371 shm->total_syscalls_done,
372 shm->failures, shm->successes,
373 hiscore);
374 lastcount = shm->total_syscalls_done;
380 /* Only check taint if it mask allows it */
381 if (kernel_taint_mask != 0) {
382 ret = check_tainted();
383 if (((ret & kernel_taint_mask) & (~kernel_taint_initial)) != 0) {
384 gettimeofday(&shm->taint_tv, NULL);
386 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret, kernel_taint_initial, shm->seed);
387 shm->exit_reason = EXIT_KERNEL_TAINTED;
391 main_dead:
392 /* Are we done ? */
393 if (shm->exit_reason != STILL_RUNNING) {
394 /* Give children a chance to exit. */
395 sleep(1);
397 /* Are there still children running ? */
398 if (pidmap_empty() == TRUE)
399 watchdog_exit = TRUE;
400 else {
401 output(0, "exit_reason=%d, but %d children still running.\n",
402 shm->exit_reason, shm->running_childs);
403 kill_all_kids();
407 sleep(1);
410 corrupt:
411 /* We don't want to ever exit before main is waiting for us. */
412 while (shm->regenerating == TRUE)
413 sleep(1);
415 kill_all_kids();
418 void init_watchdog(void)
420 pid_t pid;
422 fflush(stdout);
423 pid = fork();
425 if (pid == 0) {
426 watchdog_pid = getpid();
427 watchdog();
428 output(0, "[%d] Watchdog exiting\n", watchdog_pid);
429 _exit(EXIT_SUCCESS);
431 } else {
432 watchdog_pid = pid;
433 output(0, "Started watchdog process, PID is %d\n", watchdog_pid);