load correct syscall table for s390 and s390x
[trinity.git] / watchdog.c
bloba58ee3fb4242f6a54f8981caf560eaa58bc6eeb3
1 #include <time.h>
2 #include <errno.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <string.h>
6 #include <stdlib.h>
7 #include <signal.h>
8 #include <sys/prctl.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <sys/wait.h>
12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
15 #include "child.h"
16 #include "files.h"
17 #include "log.h"
18 #include "params.h" // quiet_level
19 #include "pids.h"
20 #include "shm.h"
21 #include "syscall.h"
22 #include "tables.h"
23 #include "trinity.h" //check_taint
/* PID of the watchdog process; assigned on both sides of the fork in init_watchdog(). */
pid_t watchdog_pid;

/* Largest per-child syscall count observed so far by the watchdog loop. */
static unsigned long hiscore;
29 static int check_shm_sanity(void)
31 unsigned int i;
32 pid_t pid;
34 if (shm->running_childs == 0)
35 return SHM_OK;
37 for_each_pidslot(i) {
38 pid = shm->pids[i];
39 if (pid == EMPTY_PIDSLOT)
40 continue;
42 if (pid_is_valid(pid) == FALSE) {
43 shm->exit_reason = EXIT_PID_OUT_OF_RANGE;
44 return SHM_CORRUPT;
48 // FIXME: The '500000' is magic, and should be dynamically calculated.
49 // On startup, we should figure out how many getpid()'s per second we can do,
50 // and use that.
51 if (shm->total_syscalls_done - shm->previous_count > 500000) {
52 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
53 shm->previous_count, shm->total_syscalls_done);
54 shm->exit_reason = EXIT_SHM_CORRUPTION;
56 shm->previous_count = shm->total_syscalls_done;
58 return SHM_OK;
61 static unsigned int reap_dead_kids(void)
63 unsigned int i;
64 unsigned int alive = 0;
65 unsigned int reaped = 0;
67 for_each_pidslot(i) {
68 pid_t pid;
69 int ret;
71 pid = shm->pids[i];
72 if (pid == EMPTY_PIDSLOT)
73 continue;
75 ret = kill(pid, 0);
76 /* If it disappeared, reap it. */
77 if (ret == -1) {
78 if (errno == ESRCH) {
79 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid);
80 reap_child(pid);
81 reaped++;
82 } else {
83 output(0, "problem checking on pid %d (%d:%s)\n", pid, errno, strerror(errno));
85 } else {
86 alive++;
89 if (shm->running_childs == 0)
90 return 0;
93 if (reaped != 0)
94 output(0, "Reaped %d dead children\n", reaped);
96 return alive;
99 static void kill_all_kids(void)
101 unsigned int i;
103 shm->spawn_no_more = TRUE;
105 /* Wait for all the children to exit. */
106 while (shm->running_childs > 0) {
107 unsigned int alive;
109 /* Make sure there's no dead kids lying around.
110 * We need to do this in case the oom killer has been killing them,
111 * otherwise we end up stuck here with no child processes.
113 alive = reap_dead_kids();
114 if (alive == 0)
115 return;
117 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
118 for_each_pidslot(i) {
119 pid_t pid;
121 pid = shm->pids[i];
122 if (pid == EMPTY_PIDSLOT)
123 continue;
125 kill(pid, SIGKILL);
128 /* wait a second to give kids a chance to exit. */
129 sleep(1);
131 if (check_shm_sanity()) {
132 // FIXME: If we get here, we over-wrote the real exit_reason.
133 // We should have saved that, and handled appropriately.
134 return;
138 /* Just to be sure, clear out the pid slots. */
139 for_each_pidslot(i) {
140 shm->pids[i] = EMPTY_PIDSLOT;
144 static int check_main_alive(void)
146 int ret;
148 /* If we're in the process of exiting, wait, and return without checking. */
149 if (shm->exit_reason != STILL_RUNNING) {
150 while (shm->mainpid != 0) {
151 sleep(1);
152 kill_all_kids();
154 return FALSE;
157 if (shm->mainpid == 0) {
158 output(0, "main pid was zero! (exit_reason:%d)\n", shm->exit_reason);
159 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
160 return FALSE;
163 ret = kill(shm->mainpid, 0);
164 if (ret == -1) {
165 if (errno == ESRCH) {
166 output(0, "main pid %d has disappeared.\n", shm->mainpid);
167 shm->exit_reason = EXIT_MAIN_DISAPPEARED;
169 /* if main crashed while regenerating, we'll hang the watchdog,
170 * because nothing will ever set it back to FALSE. So we do it ourselves.
172 shm->regenerating = FALSE;
173 } else {
174 output(0, "problem checking on pid %d (%d:%s)\n", shm->mainpid, errno, strerror(errno));
176 return FALSE;
178 return TRUE;
181 /* if the first arg was an fd, find out which one it was. */
182 unsigned int check_if_fd(unsigned int child)
184 unsigned int highest;
185 unsigned callno = shm->syscallno[child];
187 /* shortcut, if it's out of range, it's not going to be valid. */
188 if (shm->a1[child] > 1024)
189 return FALSE;
191 highest = highest_logfile();
192 if (shm->a1[child] < highest)
193 return FALSE;
195 if (biarch == FALSE) {
196 if (syscalls[callno].entry->arg1type == ARG_FD)
197 return TRUE;
198 return FALSE;
201 /* biarch case */
202 if (shm->do32bit[child] == TRUE) {
203 if (syscalls_32bit[callno].entry->arg1type == ARG_FD)
204 return TRUE;
205 } else {
206 if (syscalls_64bit[callno].entry->arg1type == ARG_FD)
207 return TRUE;
210 return FALSE;
213 static void stuck_syscall_info(int childno)
215 unsigned int callno = shm->syscallno[childno];
216 char fdstr[20];
217 pid_t pid = shm->pids[childno];
219 memset(fdstr, 0, sizeof(fdstr));
221 if (check_if_fd(childno) == TRUE)
222 sprintf(fdstr, "(fd = %d)", (unsigned int) shm->a1[childno]);
224 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
225 pid, callno,
226 print_syscall_name(shm->syscallno[childno], shm->do32bit[childno]),
227 shm->do32bit[childno] ? " (32bit)" : "",
228 fdstr);
231 static void check_children(void)
233 struct timeval tv;
234 time_t diff;
235 time_t old, now;
236 pid_t pid;
237 unsigned int i;
239 for_each_pidslot(i) {
240 pid = shm->pids[i];
242 if (pid == EMPTY_PIDSLOT)
243 continue;
245 old = shm->tv[i].tv_sec;
247 if (old == 0)
248 continue;
250 gettimeofday(&tv, NULL);
251 now = tv.tv_sec;
253 /* if we wrapped, just reset it, we'll pick it up next time around. */
254 if (old > (now + 3)) {
255 output(1, "child %d wrapped! old=%ld now=%ld\n", i, old, now);
256 shm->tv[i].tv_sec = now;
257 continue;
260 diff = now - old;
262 /* if we're way off, we're comparing garbage. Reset it. */
263 if (diff > 1000) {
264 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i, pid, old, now, diff);
265 shm->tv[i].tv_sec = now;
266 continue;
269 /* After 30 seconds of no progress, send a kill signal. */
270 if (diff == 30) {
271 stuck_syscall_info(i);
272 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
273 pid, old, now, diff);
276 if (diff >= 30) {
277 int ret;
279 if (shm->kill_count[i] > 1) {
280 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
281 pid, shm->kill_count[i], diff);
282 } else {
283 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
284 pid, diff);
286 shm->kill_count[i]++;
287 ret = kill(pid, SIGKILL);
288 if (ret != 0) {
289 output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
291 sleep(1); // give child time to exit.
296 static void watchdog(void)
298 static const char watchdogname[17]="trinity-watchdog";
299 static unsigned long lastcount = 0;
300 bool watchdog_exit = FALSE;
301 int ret = 0;
303 while (shm->ready == FALSE) {
304 sleep(1);
305 if (shm->exit_reason != STILL_RUNNING)
306 return;
309 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid);
311 prctl(PR_SET_NAME, (unsigned long) &watchdogname);
312 (void)signal(SIGSEGV, SIG_DFL);
314 while (watchdog_exit == FALSE) {
316 if (check_shm_sanity() == SHM_CORRUPT)
317 goto corrupt;
319 if (check_main_alive() == FALSE)
320 goto main_dead;
322 if (shm->regenerating == FALSE) {
323 unsigned int i;
325 reap_dead_kids();
327 check_children();
329 if (syscalls_todo && (shm->total_syscalls_done >= syscalls_todo)) {
330 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo);
331 shm->exit_reason = EXIT_REACHED_COUNT;
334 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
335 if (shm->total_syscalls_done % 1000 == 0)
336 synclogs();
338 for_each_pidslot(i) {
339 if (shm->child_syscall_count[i] > hiscore)
340 hiscore = shm->child_syscall_count[i];
343 if (shm->total_syscalls_done > 1) {
344 if (shm->total_syscalls_done - lastcount > 10000) {
345 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
346 shm->total_syscalls_done,
347 shm->failures, shm->successes,
348 hiscore);
349 lastcount = shm->total_syscalls_done;
355 /* Only check taint if it mask allows it */
356 if (kernel_taint_mask != 0) {
357 ret = check_tainted();
358 if (((ret & kernel_taint_mask) & (~kernel_taint_initial)) != 0) {
359 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret, kernel_taint_initial, shm->seed);
360 shm->exit_reason = EXIT_KERNEL_TAINTED;
364 if (shm->need_reseed == FALSE) {
365 shm->reseed_counter++;
366 /* If we haven't reseeded in five minutes, trigger one. */
367 if (shm->reseed_counter == 300) {
368 output(0, "Triggering periodic reseed.\n");
369 shm->need_reseed = TRUE;
370 shm->reseed_counter = 0;
374 main_dead:
375 /* Are we done ? */
376 if (shm->exit_reason != STILL_RUNNING) {
377 /* Give children a chance to exit. */
378 sleep(1);
380 /* Are there still children running ? */
381 if (pidmap_empty() == TRUE)
382 watchdog_exit = TRUE;
383 else {
384 output(0, "exit_reason=%d, but %d children still running.\n",
385 shm->exit_reason, shm->running_childs);
386 kill_all_kids();
390 sleep(1);
393 corrupt:
394 /* We don't want to ever exit before main is waiting for us. */
395 while (shm->regenerating == TRUE)
396 sleep(1);
398 kill_all_kids();
401 void init_watchdog(void)
403 pid_t pid;
405 fflush(stdout);
406 pid = fork();
408 if (pid == 0) {
409 watchdog_pid = getpid();
410 watchdog();
411 output(0, "[%d] Watchdog exiting\n", watchdog_pid);
412 _exit(EXIT_SUCCESS);
414 } else {
415 watchdog_pid = pid;
416 output(0, "Started watchdog process, PID is %d\n", watchdog_pid);