only do the guard page on single pages.
[trinity.git] / watchdog.c
blob71a4f175579fd58505eeae947f4fed2035dd21a0
1 #include <time.h>
2 #include <errno.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/wait.h>
12 #include <sys/ptrace.h>
14 #include "trinity.h" //check_taint and biarch
15 #include "shm.h"
16 #include "files.h"
17 #include "syscall.h"
18 #include "pids.h"
19 #include "params.h" // quiet_level
20 #include "log.h"
21 #include "child.h"
22 #include "tables.h"
/* PID of the watchdog process; init_watchdog() stores it in both the parent and the child. */
pid_t watchdog_pid;

/* Largest per-child syscall count the watchdog loop has seen so far (static storage starts at zero). */
static unsigned long hiscore;
28 static int check_shm_sanity(void)
30 unsigned int i;
31 pid_t pid;
33 if (shm->running_childs == 0)
34 return SHM_OK;
36 for_each_pidslot(i) {
37 pid = shm->pids[i];
38 if (pid == EMPTY_PIDSLOT)
39 continue;
41 if (pid_is_valid(pid) == FALSE) {
42 shm->exit_reason = EXIT_PID_OUT_OF_RANGE;
43 return SHM_CORRUPT;
47 // FIXME: The '500000' is magic, and should be dynamically calculated.
48 // On startup, we should figure out how many getpid()'s per second we can do,
49 // and use that.
50 if (shm->total_syscalls_done - shm->previous_count > 500000) {
51 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
52 shm->previous_count, shm->total_syscalls_done);
53 shm->exit_reason = EXIT_SHM_CORRUPTION;
55 shm->previous_count = shm->total_syscalls_done;
57 return SHM_OK;
60 static unsigned int reap_dead_kids(void)
62 unsigned int i;
63 unsigned int alive = 0;
64 unsigned int reaped = 0;
66 for_each_pidslot(i) {
67 pid_t pid;
68 int ret;
70 pid = shm->pids[i];
71 if (pid == EMPTY_PIDSLOT)
72 continue;
74 ret = kill(pid, 0);
75 /* If it disappeared, reap it. */
76 if (ret == -1) {
77 if (errno == ESRCH) {
78 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid);
79 reap_child(pid);
80 reaped++;
81 } else {
82 output(0, "problem checking on pid %d (%d:%s)\n", pid, errno, strerror(errno));
84 } else {
85 alive++;
88 if (shm->running_childs == 0)
89 return 0;
92 if (reaped != 0)
93 output(0, "Reaped %d dead children\n", reaped);
95 return alive;
98 static void kill_all_kids(void)
100 unsigned int i;
102 shm->spawn_no_more = TRUE;
104 /* Wait for all the children to exit. */
105 while (shm->running_childs > 0) {
106 unsigned int alive;
108 /* Make sure there's no dead kids lying around.
109 * We need to do this in case the oom killer has been killing them,
110 * otherwise we end up stuck here with no child processes.
112 alive = reap_dead_kids();
113 if (alive == 0)
114 return;
116 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
117 for_each_pidslot(i) {
118 pid_t pid;
120 pid = shm->pids[i];
121 if (pid == EMPTY_PIDSLOT)
122 continue;
124 kill(pid, SIGKILL);
127 /* wait a second to give kids a chance to exit. */
128 sleep(1);
130 if (check_shm_sanity()) {
131 // FIXME: If we get here, we over-wrote the real exit_reason.
132 // We should have saved that, and handled appropriately.
133 return;
137 /* Just to be sure, clear out the pid slots. */
138 for_each_pidslot(i) {
139 shm->pids[i] = EMPTY_PIDSLOT;
143 static int check_main_alive(void)
145 int ret;
147 /* If we're in the process of exiting, wait, and return without checking. */
148 if (shm->exit_reason != STILL_RUNNING) {
149 while (shm->mainpid != 0) {
150 sleep(1);
151 kill_all_kids();
153 return FALSE;
156 if (shm->mainpid == 0) {
157 output(0, "main pid was zero!\n");
158 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
159 return FALSE;
162 ret = kill(shm->mainpid, 0);
163 if (ret == -1) {
164 if (errno == ESRCH) {
165 output(0, "main pid %d has disappeared.\n", shm->mainpid);
166 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
168 /* if main crashed while regenerating, we'll hang the watchdog,
169 * because nothing will ever set it back to FALSE. So we do it ourselves.
171 shm->regenerating = FALSE;
172 } else {
173 output(0, "problem checking on pid %d (%d:%s)\n", shm->mainpid, errno, strerror(errno));
175 return FALSE;
177 return TRUE;
180 /* if the first arg was an fd, find out which one it was. */
181 unsigned int check_if_fd(unsigned int child)
183 unsigned int highest;
184 unsigned callno = shm->syscallno[child];
186 /* shortcut, if it's out of range, it's not going to be valid. */
187 if (shm->a1[child] > 1024)
188 return FALSE;
190 highest = highest_logfile();
191 if (shm->a1[child] < highest)
192 return FALSE;
194 if (biarch == FALSE) {
195 if (syscalls[callno].entry->arg1type == ARG_FD)
196 return TRUE;
197 return FALSE;
200 /* biarch case */
201 if (shm->do32bit[child] == TRUE) {
202 if (syscalls_32bit[callno].entry->arg1type == ARG_FD)
203 return TRUE;
204 } else {
205 if (syscalls_64bit[callno].entry->arg1type == ARG_FD)
206 return TRUE;
209 return FALSE;
212 static void stuck_syscall_info(int childno)
214 unsigned int callno = shm->syscallno[childno];
215 char fdstr[20];
216 pid_t pid = shm->pids[childno];
218 memset(fdstr, 0, sizeof(fdstr));
220 if (check_if_fd(childno) == TRUE)
221 sprintf(fdstr, "(fd = %d)", (unsigned int) shm->a1[childno]);
223 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
224 pid, callno,
225 print_syscall_name(shm->syscallno[childno], shm->do32bit[childno]),
226 shm->do32bit[childno] ? " (32bit)" : "",
227 fdstr);
230 static void check_children(void)
232 struct timeval tv;
233 time_t diff;
234 time_t old, now;
235 pid_t pid;
236 unsigned int i;
238 gettimeofday(&tv, NULL);
239 now = tv.tv_sec;
241 for_each_pidslot(i) {
242 pid = shm->pids[i];
244 if (pid == EMPTY_PIDSLOT)
245 continue;
247 old = shm->tv[i].tv_sec;
249 if (old == 0)
250 continue;
252 /* if we wrapped, just reset it, we'll pick it up next time around. */
253 if (old > (now + 3)) {
254 output(1, "child %d wrapped! old=%ld now=%ld\n", i, old, now);
255 shm->tv[i].tv_sec = now;
256 continue;
259 diff = now - old;
261 /* if we're way off, we're comparing garbage. Reset it. */
262 if (diff > 1000) {
263 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i, pid, old, now, diff);
264 shm->tv[i].tv_sec = now;
265 continue;
268 /* After 30 seconds of no progress, send a kill signal. */
269 if (diff == 30) {
270 stuck_syscall_info(i);
271 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
272 pid, old, now, diff);
275 if (diff >= 30) {
276 int ret;
278 if (shm->kill_count[i] > 1) {
279 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
280 pid, shm->kill_count[i], diff);
281 } else {
282 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
283 pid, diff);
285 shm->kill_count[i]++;
286 ret = kill(pid, SIGKILL);
287 if (ret != 0) {
288 output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
290 sleep(1); // give child time to exit.
295 static void watchdog(void)
297 static const char watchdogname[17]="trinity-watchdog";
298 static unsigned long lastcount = 0;
299 bool watchdog_exit = FALSE;
300 int ret = 0;
302 while (shm->ready == FALSE) {
303 sleep(1);
304 if (shm->exit_reason != STILL_RUNNING)
305 return;
308 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid);
310 prctl(PR_SET_NAME, (unsigned long) &watchdogname);
311 (void)signal(SIGSEGV, SIG_DFL);
313 while (watchdog_exit == FALSE) {
315 if (check_shm_sanity() == SHM_CORRUPT)
316 goto corrupt;
318 if (check_main_alive() == FALSE)
319 goto main_dead;
321 if (shm->regenerating == FALSE) {
322 unsigned int i;
324 reap_dead_kids();
326 check_children();
328 if (syscalls_todo && (shm->total_syscalls_done >= syscalls_todo)) {
329 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo);
330 shm->exit_reason = EXIT_REACHED_COUNT;
333 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
334 if (shm->total_syscalls_done % 1000 == 0)
335 synclogs();
337 for_each_pidslot(i) {
338 if (shm->child_syscall_count[i] > hiscore)
339 hiscore = shm->child_syscall_count[i];
342 if (shm->total_syscalls_done > 1) {
343 if (shm->total_syscalls_done - lastcount > 10000) {
344 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
345 shm->total_syscalls_done,
346 shm->failures, shm->successes,
347 hiscore);
348 lastcount = shm->total_syscalls_done;
354 /* Only check taint if it mask allows it */
355 if (kernel_taint_mask != 0) {
356 ret = check_tainted();
357 if (((ret & kernel_taint_mask) & (~kernel_taint_initial)) != 0) {
358 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret, kernel_taint_initial, shm->seed);
359 shm->exit_reason = EXIT_KERNEL_TAINTED;
363 if (shm->need_reseed == FALSE) {
364 shm->reseed_counter++;
365 /* If we haven't reseeded in five minutes, trigger one. */
366 if (shm->reseed_counter == 300) {
367 output(0, "Triggering periodic reseed.\n");
368 shm->need_reseed = TRUE;
369 shm->reseed_counter = 0;
373 main_dead:
374 /* Are we done ? */
375 if (shm->exit_reason != STILL_RUNNING) {
376 /* Give children a chance to exit. */
377 sleep(1);
379 /* Are there still children running ? */
380 if (pidmap_empty() == TRUE)
381 watchdog_exit = TRUE;
382 else {
383 output(0, "exit_reason=%d, but %d children still running.\n",
384 shm->exit_reason, shm->running_childs);
385 kill_all_kids();
389 sleep(1);
392 corrupt:
393 /* We don't want to ever exit before main is waiting for us. */
394 while (shm->regenerating == TRUE)
395 sleep(1);
397 kill_all_kids();
400 void init_watchdog(void)
402 pid_t pid;
404 fflush(stdout);
405 pid = fork();
407 if (pid == 0) {
408 watchdog_pid = getpid();
409 watchdog();
410 output(0, "[%d] Watchdog exiting\n", watchdog_pid);
411 _exit(EXIT_SUCCESS);
413 } else {
414 watchdog_pid = pid;
415 output(0, "Started watchdog process, PID is %d\n", watchdog_pid);