12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
19 #include "params.h" // quiet_level
25 #include "trinity.h" //check_taint
// Largest per-child syscall count seen so far. The watchdog loop raises it
// whenever an entry of shm->child_syscall_count[] exceeds the stored value.
29 static unsigned long hiscore
= 0;
// check_shm_sanity: look over the bookkeeping held in shm for signs of corruption.
// Visible checks: pid slots that fail pid_is_valid(), and a jump in
// total_syscalls_done that is too large since the previous call.
// NOTE(review): this listing omits lines (loop header, braces, return statements).
// The watchdog loop compares the result against SHM_CORRUPT - confirm the exact
// return values against the full source.
31 static int check_shm_sanity(void)
// With no running children the visible code branches early; the branch body is
// not shown in this listing.
35 if (shm
->running_childs
== 0)
// EMPTY_PIDSLOT entries are tested for before validation (presumably skipped - confirm).
42 if (pid
== EMPTY_PIDSLOT
)
// A pid rejected by pid_is_valid() is recorded as EXIT_PID_OUT_OF_RANGE.
45 if (pid_is_valid(pid
) == FALSE
) {
46 shm
->exit_reason
= EXIT_PID_OUT_OF_RANGE
;
51 // FIXME: The '500000' is magic, and should be dynamically calculated.
52 // On startup, we should figure out how many getpid()'s per second we can do,
// More than 500000 syscalls since the previous check is logged and recorded as
// EXIT_SHM_CORRUPTION.
54 if (shm
->total_syscalls_done
- shm
->previous_count
> 500000) {
55 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
56 shm
->previous_count
, shm
->total_syscalls_done
);
57 shm
->exit_reason
= EXIT_SHM_CORRUPTION
;
// Remember the current total as the baseline for the next comparison.
59 shm
->previous_count
= shm
->total_syscalls_done
;
// reap_dead_kids: check child pids and reap the ones that no longer exist.
// Two counters are kept, alive and reaped; the reaped total is logged at the end.
// NOTE(review): the liveness probe, the loop header and the return statement are
// not visible in this listing. kill_all_kids() stores the result in a variable
// named alive - confirm that the alive count is what is returned.
64 static unsigned int reap_dead_kids(void)
67 unsigned int alive
= 0;
68 unsigned int reaped
= 0;
// EMPTY_PIDSLOT entries are tested for first (presumably skipped - confirm).
75 if (pid
== EMPTY_PIDSLOT
)
79 /* If it disappeared, reap it. */
82 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid
);
// Any other failure of the probe is reported together with errno.
86 output(0, "problem checking on pid %d (%d:%s)\n", pid
, errno
, strerror(errno
));
// The running-children count is tested after the log calls above; the branch
// body is not shown.
92 if (shm
->running_childs
== 0)
97 output(0, "Reaped %d dead children\n", reaped
);
// kill_all_kids: sets shm->spawn_no_more, then loops while shm->running_childs
// is above zero. Each pass calls reap_dead_kids() and walks the pid slots; per
// the original comments the survivors are sent SIGKILL (the kill call itself is
// not visible in this listing). Afterwards every pid slot is reset to EMPTY_PIDSLOT.
102 static void kill_all_kids(void)
106 shm
->spawn_no_more
= TRUE
;
108 /* Wait for all the children to exit. */
109 while (shm
->running_childs
> 0) {
112 /* Make sure there's no dead kids lying around.
113 * We need to do this in case the oom killer has been killing them,
114 * otherwise we end up stuck here with no child processes.
116 alive
= reap_dead_kids();
120 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
121 for_each_pidslot(i
) {
125 if (pid
== EMPTY_PIDSLOT
)
131 /* wait a second to give kids a chance to exit. */
// check_shm_sanity() may itself store a new exit_reason, which is what the
// FIXME below is about.
134 if (check_shm_sanity()) {
135 // FIXME: If we get here, we over-wrote the real exit_reason.
136 // We should have saved that, and handled appropriately.
141 /* Just to be sure, clear out the pid slots. */
142 for_each_pidslot(i
) {
143 shm
->pids
[i
] = EMPTY_PIDSLOT
;
// __check_main: probe the main process recorded in shm->mainpid using
// kill(pid, 0), which sends no signal and only reports whether the pid exists.
// On ESRCH the disappearance is logged and exit_reason becomes
// EXIT_MAIN_DISAPPEARED; the visible code also clears shm->regenerating.
// Other errno values are logged.
// NOTE(review): return statements are not visible in this listing.
147 static bool __check_main(void)
// A mainpid of zero is tested for before probing; the branch body is not shown.
151 if (shm
->mainpid
== 0)
154 ret
= kill(shm
->mainpid
, 0);
156 if (errno
== ESRCH
) {
157 output(0, "main pid %d has disappeared.\n", shm
->mainpid
);
158 shm
->exit_reason
= EXIT_MAIN_DISAPPEARED
;
161 /* if main crashed while regenerating, we'll hang the watchdog,
162 * because nothing will ever set it back to FALSE. So we do it ourselves.
164 shm
->regenerating
= FALSE
;
166 output(0, "problem checking on pid %d (%d:%s)\n", shm
->mainpid
, errno
, strerror(errno
));
// check_main_alive: wrapper around __check_main().
// When shm->exit_reason is no longer STILL_RUNNING it loops for as long as
// shm->mainpid is non-zero, calling __check_main() on each pass; otherwise a
// single __check_main() call is made.
// NOTE(review): the return statements and any delay inside the loop are not
// visible in this listing. The watchdog loop compares the result against FALSE.
173 static int check_main_alive(void)
177 /* If we're in the process of exiting, wait, and return without checking. */
178 if (shm
->exit_reason
!= STILL_RUNNING
) {
179 while (shm
->mainpid
!= 0) {
180 /* make sure main is still alive, to wait for kids. */
181 ret
= __check_main();
190 ret
= __check_main();
// check_if_fd: decide whether the first argument recorded for a child
// (shm->a1[child]) should be treated as a file descriptor.
// child: index used for the per-child arrays in shm.
// Visible steps: values above 1024 are tested for up front, the value is then
// compared against highest_logfile(), and finally the arg1type of the syscall
// table entry for the recorded syscall is compared with ARG_FD.
// NOTE(review): return statements are not visible in this listing;
// stuck_syscall_info() compares the result against TRUE.
194 /* if the first arg was an fd, find out which one it was. */
195 unsigned int check_if_fd(unsigned int child
)
197 unsigned int highest
;
201 /* shortcut, if it's out of range, it's not going to be valid. */
202 if (shm
->a1
[child
] > 1024)
205 highest
= highest_logfile();
206 if (shm
->a1
[child
] < highest
)
// The syscall number and the 32-bit flag are read together under syscall_lock.
209 lock(&shm
->syscall_lock
);
210 callno
= shm
->syscallno
[child
];
211 do32
= shm
->do32bit
[child
];
212 unlock(&shm
->syscall_lock
);
// Without biarch the single syscalls table is consulted; otherwise the 32-bit
// or 64-bit table is used (the test that picks between them is not visible).
214 if (biarch
== FALSE
) {
215 if (syscalls
[callno
].entry
->arg1type
== ARG_FD
)
222 if (syscalls_32bit
[callno
].entry
->arg1type
== ARG_FD
)
// NOTE(review): if max_nr_64bit_syscalls is the number of table entries, a
// callno equal to it would still pass this test - confirm whether >= is needed.
225 if (callno
> max_nr_64bit_syscalls
) {
226 output(0, "Weird, child:%d callno:%d (64bit max:%d)\n", child
, callno
, max_nr_64bit_syscalls
);
229 if (syscalls_64bit
[callno
].entry
->arg1type
== ARG_FD
)
// stuck_syscall_info: log which syscall a child appears to be stuck in.
// childno: index used for the per-child arrays in shm.
// The message includes the syscall number and name, a 32-bit marker when
// shm->do32bit[childno] is set, and a description of the first argument when
// check_if_fd() reports TRUE.
236 static void stuck_syscall_info(int childno
)
238 unsigned int callno
= shm
->syscallno
[childno
];
240 pid_t pid
= shm
->pids
[childno
];
// fdstr starts out as an empty string, so it adds nothing to the message
// unless it is filled in below.
242 memset(fdstr
, 0, sizeof(fdstr
));
// NOTE(review): sprintf is unbounded and the declaration of fdstr is not
// visible here - confirm the buffer is large enough. The %d conversion is
// paired with an (unsigned int) argument; %u would match the cast.
244 if (check_if_fd(childno
) == TRUE
)
245 sprintf(fdstr
, "(fd = %d)", (unsigned int) shm
->a1
[childno
]);
247 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
249 print_syscall_name(shm
->syscallno
[childno
], shm
->do32bit
[childno
]),
250 shm
->do32bit
[childno
] ? " (32bit)" : "",
// check_children: walk every pid slot and compare the timestamp stored for the
// child (shm->tv[i].tv_sec) with the current time.
// Visible handling: a stored time more than 3 seconds ahead of now is reset, a
// huge delta is reset, and per the original comment a child with no progress
// for 30 seconds is reported and sent SIGKILL, with the attempts counted in
// shm->kill_count[i].
// NOTE(review): the computation of now and diff and the threshold tests are not
// visible in this listing.
254 static void check_children(void)
261 for_each_pidslot(i
) {
// EMPTY_PIDSLOT entries are tested for first (presumably skipped - confirm).
266 if (pid
== EMPTY_PIDSLOT
)
269 old
= shm
->tv
[i
].tv_sec
;
274 gettimeofday(&tv
, NULL
);
277 /* if we wrapped, just reset it, we'll pick it up next time around. */
278 if (old
> (now
+ 3)) {
279 output(1, "child %u wrapped! old=%lu now=%lu\n", i
, old
, now
);
280 shm
->tv
[i
].tv_sec
= now
;
286 /* if we're way off, we're comparing garbage. Reset it. */
288 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i
, pid
, old
, now
, diff
);
289 shm
->tv
[i
].tv_sec
= now
;
293 /* After 30 seconds of no progress, send a kill signal. */
295 stuck_syscall_info(i
);
296 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
297 pid
, old
, now
, diff
);
// A kill_count above 1 selects the wording for a repeated SIGKILL.
303 if (shm
->kill_count
[i
] > 1) {
304 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
305 pid
, shm
->kill_count
[i
], diff
);
307 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
310 shm
->kill_count
[i
]++;
311 ret
= kill(pid
, SIGKILL
);
313 output(0, "couldn't kill pid %d [%s]\n", pid
, strerror(errno
));
315 sleep(1); // give child time to exit.
// watchdog: body of the watchdog process.
// It waits for shm->ready, names itself through prctl(PR_SET_NAME), then loops
// until watchdog_exit is set. Each pass runs check_shm_sanity() and
// check_main_alive() and, among the visible steps, enforces the syscalls_todo
// limit, updates hiscore, prints progress, checks for kernel taint and counts
// towards a periodic reseed. Before returning it waits for shm->regenerating to
// stop being TRUE.
// NOTE(review): many lines (braces, sleeps, branch bodies) are missing from this
// listing; the notes below describe only what is visible.
320 static void watchdog(void)
// 16 characters plus the terminating NUL fill the 17-byte array exactly.
322 static const char watchdogname
[17]="trinity-watchdog";
323 static unsigned long lastcount
= 0;
324 bool watchdog_exit
= FALSE
;
// Wait for shm->ready; exit_reason is also tested while waiting.
327 while (shm
->ready
== FALSE
) {
329 if (shm
->exit_reason
!= STILL_RUNNING
)
333 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid
);
// Set the process name and restore the default SIGSEGV disposition.
335 prctl(PR_SET_NAME
, (unsigned long) &watchdogname
);
336 (void)signal(SIGSEGV
, SIG_DFL
);
338 while (watchdog_exit
== FALSE
) {
340 if (check_shm_sanity() == SHM_CORRUPT
)
343 if (check_main_alive() == FALSE
)
346 if (shm
->regenerating
== FALSE
) {
// A non-zero syscalls_todo acts as a cap on total_syscalls_done; reaching it
// sets EXIT_REACHED_COUNT.
353 if (syscalls_todo
&& (shm
->total_syscalls_done
>= syscalls_todo
)) {
354 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo
);
355 shm
->exit_reason
= EXIT_REACHED_COUNT
;
358 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
359 if (shm
->total_syscalls_done
% 1000 == 0)
// Track the largest per-child syscall count in hiscore.
362 for_each_pidslot(i
) {
363 if (shm
->child_syscall_count
[i
] > hiscore
)
364 hiscore
= shm
->child_syscall_count
[i
];
// Print a progress line once the total has advanced by more than 10000 since
// lastcount, then move lastcount forward.
367 if (shm
->total_syscalls_done
> 1) {
368 if (shm
->total_syscalls_done
- lastcount
> 10000) {
369 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
370 shm
->total_syscalls_done
,
371 shm
->failures
, shm
->successes
,
373 lastcount
= shm
->total_syscalls_done
;
379 /* Only check taint if the mask allows it */
380 if (kernel_taint_mask
!= 0) {
381 ret
= check_tainted();
// Only bits selected by kernel_taint_mask and not already present in
// kernel_taint_initial count as a new taint.
382 if (((ret
& kernel_taint_mask
) & (~kernel_taint_initial
)) != 0) {
383 gettimeofday(&shm
->taint_tv
, NULL
);
385 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret
, kernel_taint_initial
, shm
->seed
);
386 shm
->exit_reason
= EXIT_KERNEL_TAINTED
;
// reseed_counter is bumped on each pass while no reseed is pending. At 300 a
// reseed is requested and the counter restarts. The original comment equates
// 300 with five minutes, which assumes roughly one pass per second (the pacing
// is not visible in this listing).
390 if (shm
->need_reseed
== FALSE
) {
391 shm
->reseed_counter
++;
392 /* If we haven't reseeded in five minutes, trigger one. */
393 if (shm
->reseed_counter
== 300) {
394 output(0, "Triggering periodic reseed.\n");
395 shm
->need_reseed
= TRUE
;
396 shm
->reseed_counter
= 0;
// Once an exit reason is set, watchdog_exit is raised only when pidmap_empty()
// reports TRUE; the message below reports children that are still running.
402 if (shm
->exit_reason
!= STILL_RUNNING
) {
403 /* Give children a chance to exit. */
406 /* Are there still children running ? */
407 if (pidmap_empty() == TRUE
)
408 watchdog_exit
= TRUE
;
410 output(0, "exit_reason=%d, but %d children still running.\n",
411 shm
->exit_reason
, shm
->running_childs
);
420 /* We don't want to ever exit before main is waiting for us. */
421 while (shm
->regenerating
== TRUE
)
// init_watchdog: entry point that sets up the watchdog process.
// Visible effects: watchdog_pid is set from getpid(), and two messages are
// logged, one when the watchdog exits and one announcing the started process.
// NOTE(review): process creation and the call into watchdog() are not visible
// in this listing - presumably a fork happens here; confirm against the full source.
427 void init_watchdog(void)
435 watchdog_pid
= getpid();
437 output(0, "[%d] Watchdog exiting\n", watchdog_pid
);
442 output(0, "Started watchdog process, PID is %d\n", watchdog_pid
);