12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
19 #include "params.h" // quiet_level
25 #include "trinity.h" //check_taint
29 static unsigned long hiscore
= 0;
31 static int check_shm_sanity(void)
35 if (shm
->running_childs
== 0)
42 if (pid
== EMPTY_PIDSLOT
)
45 if (pid_is_valid(pid
) == FALSE
) {
46 shm
->exit_reason
= EXIT_PID_OUT_OF_RANGE
;
51 // FIXME: The '500000' is magic, and should be dynamically calculated.
52 // On startup, we should figure out how many getpid()'s per second we can do,
54 if (shm
->total_syscalls_done
- shm
->previous_count
> 500000) {
55 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
56 shm
->previous_count
, shm
->total_syscalls_done
);
57 shm
->exit_reason
= EXIT_SHM_CORRUPTION
;
59 shm
->previous_count
= shm
->total_syscalls_done
;
64 static unsigned int reap_dead_kids(void)
67 unsigned int alive
= 0;
68 unsigned int reaped
= 0;
75 if (pid
== EMPTY_PIDSLOT
)
79 /* If it disappeared, reap it. */
82 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid
);
86 output(0, "problem checking on pid %d (%d:%s)\n", pid
, errno
, strerror(errno
));
92 if (shm
->running_childs
== 0)
97 output(0, "Reaped %d dead children\n", reaped
);
102 static void kill_all_kids(void)
106 shm
->spawn_no_more
= TRUE
;
108 /* Wait for all the children to exit. */
109 while (shm
->running_childs
> 0) {
112 /* Make sure there are no dead kids lying around.
113 * We need to do this in case the oom killer has been killing them,
114 * otherwise we end up stuck here with no child processes.
116 alive
= reap_dead_kids();
120 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
121 for_each_pidslot(i
) {
125 if (pid
== EMPTY_PIDSLOT
)
131 /* wait a second to give kids a chance to exit. */
134 if (check_shm_sanity()) {
135 // FIXME: If we get here, we over-wrote the real exit_reason.
136 // We should have saved that, and handled appropriately.
141 /* Just to be sure, clear out the pid slots. */
142 for_each_pidslot(i
) {
143 shm
->pids
[i
] = EMPTY_PIDSLOT
;
147 static bool __check_main(void)
151 if (shm
->mainpid
== 0)
154 ret
= kill(shm
->mainpid
, 0);
156 if (errno
== ESRCH
) {
157 output(0, "main pid %d has disappeared.\n", shm
->mainpid
);
158 shm
->exit_reason
= EXIT_MAIN_DISAPPEARED
;
161 /* If main crashed while regenerating, we'll hang the watchdog,
162 * because nothing will ever set shm->regenerating back to FALSE. So we do it ourselves.
164 shm
->regenerating
= FALSE
;
166 output(0, "problem checking on pid %d (%d:%s)\n", shm
->mainpid
, errno
, strerror(errno
));
173 static int check_main_alive(void)
177 /* If we're in the process of exiting, wait, and return without checking. */
178 if (shm
->exit_reason
!= STILL_RUNNING
) {
179 while (shm
->mainpid
!= 0) {
180 /* make sure main is still alive, to wait for kids. */
181 ret
= __check_main();
190 ret
= __check_main();
194 /* if the first arg was an fd, find out which one it was. */
195 unsigned int check_if_fd(unsigned int child
)
197 unsigned int fd
= shm
->syscall
[child
].a1
;
198 unsigned int highest
;
202 /* shortcut, if it's out of range, it's not going to be valid. */
206 highest
= highest_logfile();
210 lock(&shm
->syscall_lock
);
211 callno
= shm
->syscall
[child
].nr
;
212 do32
= shm
->syscall
[child
].do32bit
;
213 unlock(&shm
->syscall_lock
);
215 if (biarch
== FALSE
) {
216 if (syscalls
[callno
].entry
->arg1type
== ARG_FD
)
223 if (syscalls_32bit
[callno
].entry
->arg1type
== ARG_FD
)
226 if (callno
> max_nr_64bit_syscalls
) {
227 output(0, "Weird, child:%d callno:%d (64bit max:%d)\n", child
, callno
, max_nr_64bit_syscalls
);
230 if (syscalls_64bit
[callno
].entry
->arg1type
== ARG_FD
)
237 static void stuck_syscall_info(int childno
)
239 unsigned int callno
= shm
->syscall
[childno
].nr
;
241 pid_t pid
= shm
->pids
[childno
];
243 memset(fdstr
, 0, sizeof(fdstr
));
245 if (check_if_fd(childno
) == TRUE
)
246 sprintf(fdstr
, "(fd = %d)", (unsigned int) shm
->syscall
[childno
].a1
);
248 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
250 print_syscall_name(callno
, shm
->syscall
[childno
].do32bit
),
251 shm
->syscall
[childno
].do32bit
? " (32bit)" : "",
255 static void check_children(void)
262 for_each_pidslot(i
) {
267 if (pid
== EMPTY_PIDSLOT
)
270 old
= shm
->tv
[i
].tv_sec
;
275 gettimeofday(&tv
, NULL
);
278 /* if we wrapped, just reset it, we'll pick it up next time around. */
279 if (old
> (now
+ 3)) {
280 output(1, "child %u wrapped! old=%lu now=%lu\n", i
, old
, now
);
281 shm
->tv
[i
].tv_sec
= now
;
287 /* if we're way off, we're comparing garbage. Reset it. */
289 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i
, pid
, old
, now
, diff
);
290 shm
->tv
[i
].tv_sec
= now
;
294 /* After 30 seconds of no progress, send a kill signal. */
296 stuck_syscall_info(i
);
297 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
298 pid
, old
, now
, diff
);
304 if (shm
->kill_count
[i
] > 1) {
305 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
306 pid
, shm
->kill_count
[i
], diff
);
308 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
311 shm
->kill_count
[i
]++;
312 ret
= kill(pid
, SIGKILL
);
314 output(0, "couldn't kill pid %d [%s]\n", pid
, strerror(errno
));
316 sleep(1); // give child time to exit.
321 static void watchdog(void)
323 static const char watchdogname
[17]="trinity-watchdog";
324 static unsigned long lastcount
= 0;
325 bool watchdog_exit
= FALSE
;
328 while (shm
->ready
== FALSE
) {
330 if (shm
->exit_reason
!= STILL_RUNNING
)
334 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid
);
336 prctl(PR_SET_NAME
, (unsigned long) &watchdogname
);
337 (void)signal(SIGSEGV
, SIG_DFL
);
339 while (watchdog_exit
== FALSE
) {
341 if (check_shm_sanity() == SHM_CORRUPT
)
344 if (check_main_alive() == FALSE
)
347 if (shm
->regenerating
== FALSE
) {
354 if (syscalls_todo
&& (shm
->total_syscalls_done
>= syscalls_todo
)) {
355 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo
);
356 shm
->exit_reason
= EXIT_REACHED_COUNT
;
359 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
360 if (shm
->total_syscalls_done
% 1000 == 0)
363 for_each_pidslot(i
) {
364 if (shm
->child_syscall_count
[i
] > hiscore
)
365 hiscore
= shm
->child_syscall_count
[i
];
368 if (shm
->total_syscalls_done
> 1) {
369 if (shm
->total_syscalls_done
- lastcount
> 10000) {
370 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
371 shm
->total_syscalls_done
,
372 shm
->failures
, shm
->successes
,
374 lastcount
= shm
->total_syscalls_done
;
380 /* Only check taint if the mask allows it */
381 if (kernel_taint_mask
!= 0) {
382 ret
= check_tainted();
383 if (((ret
& kernel_taint_mask
) & (~kernel_taint_initial
)) != 0) {
384 gettimeofday(&shm
->taint_tv
, NULL
);
386 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret
, kernel_taint_initial
, shm
->seed
);
387 shm
->exit_reason
= EXIT_KERNEL_TAINTED
;
393 if (shm
->exit_reason
!= STILL_RUNNING
) {
394 /* Give children a chance to exit. */
397 /* Are there still children running ? */
398 if (pidmap_empty() == TRUE
)
399 watchdog_exit
= TRUE
;
401 output(0, "exit_reason=%d, but %d children still running.\n",
402 shm
->exit_reason
, shm
->running_childs
);
411 /* We don't want to ever exit before main is waiting for us. */
412 while (shm
->regenerating
== TRUE
)
418 void init_watchdog(void)
426 watchdog_pid
= getpid();
428 output(0, "[%d] Watchdog exiting\n", watchdog_pid
);
433 output(0, "Started watchdog process, PID is %d\n", watchdog_pid
);