parse args before setting up the shm.
[trinity.git] / watchdog.c
blob8d9a3ba0d1461de1b620fc30030796accc1045d1
1 #include <time.h>
2 #include <errno.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/wait.h>
12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
15 #include "child.h"
16 #include "files.h"
17 #include "log.h"
18 #include "params.h" // quiet_level
19 #include "pids.h"
20 #include "shm.h"
21 #include "syscall.h"
22 #include "tables.h"
23 #include "trinity.h" //check_taint
/* PID of the watchdog process. init_watchdog() stores fork()'s return value
 * here in the parent and getpid() in the watchdog child itself. */
25 pid_t watchdog_pid;
/* Largest shm->child_syscall_count[] value the watchdog loop has seen so far;
 * printed as "HI" in the periodic progress line. */
27 static unsigned long hiscore = 0;
29 static int check_shm_sanity(void)
31 unsigned int i;
32 pid_t pid;
34 if (shm->running_childs == 0)
35 return SHM_OK;
37 for_each_pidslot(i) {
38 pid = shm->pids[i];
39 if (pid == EMPTY_PIDSLOT)
40 continue;
42 if (pid_is_valid(pid) == FALSE) {
43 shm->exit_reason = EXIT_PID_OUT_OF_RANGE;
44 return SHM_CORRUPT;
48 // FIXME: The '500000' is magic, and should be dynamically calculated.
49 // On startup, we should figure out how many getpid()'s per second we can do,
50 // and use that.
51 if (shm->total_syscalls_done - shm->previous_count > 500000) {
52 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
53 shm->previous_count, shm->total_syscalls_done);
54 shm->exit_reason = EXIT_SHM_CORRUPTION;
56 shm->previous_count = shm->total_syscalls_done;
58 return SHM_OK;
61 static unsigned int reap_dead_kids(void)
63 unsigned int i;
64 unsigned int alive = 0;
65 unsigned int reaped = 0;
67 for_each_pidslot(i) {
68 pid_t pid;
69 int ret;
71 pid = shm->pids[i];
72 if (pid == EMPTY_PIDSLOT)
73 continue;
75 ret = kill(pid, 0);
76 /* If it disappeared, reap it. */
77 if (ret == -1) {
78 if (errno == ESRCH) {
79 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid);
80 reap_child(pid);
81 reaped++;
82 } else {
83 output(0, "problem checking on pid %d (%d:%s)\n", pid, errno, strerror(errno));
85 } else {
86 alive++;
89 if (shm->running_childs == 0)
90 return 0;
93 if (reaped != 0)
94 output(0, "Reaped %d dead children\n", reaped);
96 return alive;
99 static void kill_all_kids(void)
101 unsigned int i;
103 shm->spawn_no_more = TRUE;
105 /* Wait for all the children to exit. */
106 while (shm->running_childs > 0) {
107 unsigned int alive;
109 /* Make sure there's no dead kids lying around.
110 * We need to do this in case the oom killer has been killing them,
111 * otherwise we end up stuck here with no child processes.
113 alive = reap_dead_kids();
114 if (alive == 0)
115 return;
117 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
118 for_each_pidslot(i) {
119 pid_t pid;
121 pid = shm->pids[i];
122 if (pid == EMPTY_PIDSLOT)
123 continue;
125 kill(pid, SIGKILL);
128 /* wait a second to give kids a chance to exit. */
129 sleep(1);
131 if (check_shm_sanity()) {
132 // FIXME: If we get here, we over-wrote the real exit_reason.
133 // We should have saved that, and handled appropriately.
134 return;
138 /* Just to be sure, clear out the pid slots. */
139 for_each_pidslot(i) {
140 shm->pids[i] = EMPTY_PIDSLOT;
144 static bool __check_main(void)
146 int ret;
148 if (shm->mainpid == 0)
149 return FALSE;
151 ret = kill(shm->mainpid, 0);
152 if (ret == -1) {
153 if (errno == ESRCH) {
154 output(0, "main pid %d has disappeared.\n", shm->mainpid);
155 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
156 shm->mainpid = 0;
158 /* if main crashed while regenerating, we'll hang the watchdog,
159 * because nothing will ever set it back to FALSE. So we do it ourselves.
161 shm->regenerating = FALSE;
162 } else {
163 output(0, "problem checking on pid %d (%d:%s)\n", shm->mainpid, errno, strerror(errno));
165 return FALSE;
167 return TRUE;
170 static int check_main_alive(void)
172 int ret;
174 /* If we're in the process of exiting, wait, and return without checking. */
175 if (shm->exit_reason != STILL_RUNNING) {
176 while (shm->mainpid != 0) {
177 /* make sure main is still alive, to wait for kids. */
178 ret = __check_main();
179 if (ret == TRUE) {
180 sleep(1);
181 kill_all_kids();
184 return FALSE;
187 ret = __check_main();
188 return ret;
191 /* if the first arg was an fd, find out which one it was. */
192 unsigned int check_if_fd(unsigned int child)
194 unsigned int highest;
195 unsigned callno = shm->syscallno[child];
197 /* shortcut, if it's out of range, it's not going to be valid. */
198 if (shm->a1[child] > 1024)
199 return FALSE;
201 highest = highest_logfile();
202 if (shm->a1[child] < highest)
203 return FALSE;
205 if (biarch == FALSE) {
206 if (syscalls[callno].entry->arg1type == ARG_FD)
207 return TRUE;
208 return FALSE;
211 /* biarch case */
212 if (shm->do32bit[child] == TRUE) {
213 if (syscalls_32bit[callno].entry->arg1type == ARG_FD)
214 return TRUE;
215 } else {
216 if (syscalls_64bit[callno].entry->arg1type == ARG_FD)
217 return TRUE;
220 return FALSE;
223 static void stuck_syscall_info(int childno)
225 unsigned int callno = shm->syscallno[childno];
226 char fdstr[20];
227 pid_t pid = shm->pids[childno];
229 memset(fdstr, 0, sizeof(fdstr));
231 if (check_if_fd(childno) == TRUE)
232 sprintf(fdstr, "(fd = %d)", (unsigned int) shm->a1[childno]);
234 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
235 pid, callno,
236 print_syscall_name(shm->syscallno[childno], shm->do32bit[childno]),
237 shm->do32bit[childno] ? " (32bit)" : "",
238 fdstr);
241 static void check_children(void)
243 struct timeval tv;
244 time_t diff;
245 time_t old, now;
246 pid_t pid;
247 unsigned int i;
249 for_each_pidslot(i) {
250 pid = shm->pids[i];
252 if (pid == EMPTY_PIDSLOT)
253 continue;
255 old = shm->tv[i].tv_sec;
257 if (old == 0)
258 continue;
260 gettimeofday(&tv, NULL);
261 now = tv.tv_sec;
263 /* if we wrapped, just reset it, we'll pick it up next time around. */
264 if (old > (now + 3)) {
265 output(1, "child %d wrapped! old=%ld now=%ld\n", i, old, now);
266 shm->tv[i].tv_sec = now;
267 continue;
270 diff = now - old;
272 /* if we're way off, we're comparing garbage. Reset it. */
273 if (diff > 1000) {
274 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i, pid, old, now, diff);
275 shm->tv[i].tv_sec = now;
276 continue;
279 /* After 30 seconds of no progress, send a kill signal. */
280 if (diff == 30) {
281 stuck_syscall_info(i);
282 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
283 pid, old, now, diff);
286 if (diff >= 30) {
287 int ret;
289 if (shm->kill_count[i] > 1) {
290 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
291 pid, shm->kill_count[i], diff);
292 } else {
293 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
294 pid, diff);
296 shm->kill_count[i]++;
297 ret = kill(pid, SIGKILL);
298 if (ret != 0) {
299 output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
301 sleep(1); // give child time to exit.
/*
 * Body of the watchdog process.
 *
 * Waits for shm->ready, then loops: checks shm sanity, checks that main is
 * alive, polices the children, enforces the syscall limit, watches the kernel
 * taint flags and drives the periodic reseed. The loop ends once an exit
 * reason is set and the pid map is empty; remaining children are then killed.
 *
 * NOTE(review): brace-only and blank lines are absent from this listing, so
 * the exact block nesting must be confirmed against the pristine file.
 */
306 static void watchdog(void)
308 static const char watchdogname[17]="trinity-watchdog";
/* total_syscalls_done value at the time of the last progress line. */
309 static unsigned long lastcount = 0;
310 bool watchdog_exit = FALSE;
311 int ret = 0;
/* Block until shm->ready is set; give up if an exit reason shows up first. */
313 while (shm->ready == FALSE) {
314 sleep(1);
315 if (shm->exit_reason != STILL_RUNNING)
316 return;
319 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid);
/* Rename this process via PR_SET_NAME and restore default SIGSEGV handling. */
321 prctl(PR_SET_NAME, (unsigned long) &watchdogname);
322 (void)signal(SIGSEGV, SIG_DFL);
324 while (watchdog_exit == FALSE) {
/* Corrupt shm: leave the loop and go straight to the final cleanup. */
326 if (check_shm_sanity() == SHM_CORRUPT)
327 goto corrupt;
/* Main not alive: skip the regular checks and go to the exit handling. */
329 if (check_main_alive() == FALSE)
330 goto main_dead;
/* The child checks below are skipped while shm->regenerating is set. */
332 if (shm->regenerating == FALSE) {
333 unsigned int i;
335 reap_dead_kids();
337 check_children();
/* A syscalls_todo of zero means no limit is enforced. */
339 if (syscalls_todo && (shm->total_syscalls_done >= syscalls_todo)) {
340 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo);
341 shm->exit_reason = EXIT_REACHED_COUNT;
344 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
345 if (shm->total_syscalls_done % 1000 == 0)
346 synclogs();
/* Track the highest per-child syscall count seen so far. */
348 for_each_pidslot(i) {
349 if (shm->child_syscall_count[i] > hiscore)
350 hiscore = shm->child_syscall_count[i];
/* Print a progress line each time the total advances by more than 10000. */
353 if (shm->total_syscalls_done > 1) {
354 if (shm->total_syscalls_done - lastcount > 10000) {
355 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
356 shm->total_syscalls_done,
357 shm->failures, shm->successes,
358 hiscore);
359 lastcount = shm->total_syscalls_done;
/* Taint bits count only if selected by kernel_taint_mask and not already
 * present in kernel_taint_initial. */
365 /* Only check taint if it mask allows it */
366 if (kernel_taint_mask != 0) {
367 ret = check_tainted();
368 if (((ret & kernel_taint_mask) & (~kernel_taint_initial)) != 0) {
369 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret, kernel_taint_initial, shm->seed);
370 shm->exit_reason = EXIT_KERNEL_TAINTED;
/* reseed_counter advances once per pass while no reseed is pending.
 * NOTE(review): "five minutes" presumes one pass per second; passes that
 * sleep elsewhere take longer — confirm. */
374 if (shm->need_reseed == FALSE) {
375 shm->reseed_counter++;
376 /* If we haven't reseeded in five minutes, trigger one. */
377 if (shm->reseed_counter == 300) {
378 output(0, "Triggering periodic reseed.\n");
379 shm->need_reseed = TRUE;
380 shm->reseed_counter = 0;
384 main_dead:
385 /* Are we done ? */
386 if (shm->exit_reason != STILL_RUNNING) {
387 /* Give children a chance to exit. */
388 sleep(1);
390 /* Are there still children running ? */
391 if (pidmap_empty() == TRUE)
392 watchdog_exit = TRUE;
393 else {
394 output(0, "exit_reason=%d, but %d children still running.\n",
395 shm->exit_reason, shm->running_childs);
396 kill_all_kids();
400 sleep(1);
/* Reached by the goto above and also by normal loop termination. */
403 corrupt:
404 /* We don't want to ever exit before main is waiting for us. */
405 while (shm->regenerating == TRUE)
406 sleep(1);
408 kill_all_kids();
411 void init_watchdog(void)
413 pid_t pid;
415 fflush(stdout);
416 pid = fork();
418 if (pid == 0) {
419 watchdog_pid = getpid();
420 watchdog();
421 output(0, "[%d] Watchdog exiting\n", watchdog_pid);
422 _exit(EXIT_SUCCESS);
424 } else {
425 watchdog_pid = pid;
426 output(0, "Started watchdog process, PID is %d\n", watchdog_pid);