12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
18 #include "params.h" // quiet_level
23 #include "trinity.h" //check_taint
27 static unsigned long hiscore
= 0;
29 static int check_shm_sanity(void)
34 if (shm
->running_childs
== 0)
39 if (pid
== EMPTY_PIDSLOT
)
42 if (pid_is_valid(pid
) == FALSE
) {
43 shm
->exit_reason
= EXIT_PID_OUT_OF_RANGE
;
48 // FIXME: The '500000' is magic, and should be dynamically calculated.
49 // On startup, we should figure out how many getpid()'s per second we can do,
51 if (shm
->total_syscalls_done
- shm
->previous_count
> 500000) {
52 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
53 shm
->previous_count
, shm
->total_syscalls_done
);
54 shm
->exit_reason
= EXIT_SHM_CORRUPTION
;
56 shm
->previous_count
= shm
->total_syscalls_done
;
61 static unsigned int reap_dead_kids(void)
64 unsigned int alive
= 0;
65 unsigned int reaped
= 0;
72 if (pid
== EMPTY_PIDSLOT
)
76 /* If it disappeared, reap it. */
79 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid
);
83 output(0, "problem checking on pid %d (%d:%s)\n", pid
, errno
, strerror(errno
));
89 if (shm
->running_childs
== 0)
94 output(0, "Reaped %d dead children\n", reaped
);
99 static void kill_all_kids(void)
103 shm
->spawn_no_more
= TRUE
;
105 /* Wait for all the children to exit. */
106 while (shm
->running_childs
> 0) {
109 /* Make sure there's no dead kids lying around.
110 * We need to do this in case the oom killer has been killing them,
111 * otherwise we end up stuck here with no child processes.
113 alive
= reap_dead_kids();
117 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
118 for_each_pidslot(i
) {
122 if (pid
== EMPTY_PIDSLOT
)
128 /* wait a second to give kids a chance to exit. */
131 if (check_shm_sanity()) {
132 // FIXME: If we get here, we over-wrote the real exit_reason.
133 // We should have saved that, and handled appropriately.
138 /* Just to be sure, clear out the pid slots. */
139 for_each_pidslot(i
) {
140 shm
->pids
[i
] = EMPTY_PIDSLOT
;
144 static bool __check_main(void)
148 if (shm
->mainpid
== 0)
151 ret
= kill(shm
->mainpid
, 0);
153 if (errno
== ESRCH
) {
154 output(0, "main pid %d has disappeared.\n", shm
->mainpid
);
155 shm
->exit_reason
= EXIT_MAIN_DISAPPEARED
;
158 /* if main crashed while regenerating, we'll hang the watchdog,
159 * because nothing will ever set it back to FALSE. So we do it ourselves.
161 shm
->regenerating
= FALSE
;
163 output(0, "problem checking on pid %d (%d:%s)\n", shm
->mainpid
, errno
, strerror(errno
));
170 static int check_main_alive(void)
174 /* If we're in the process of exiting, wait, and return without checking. */
175 if (shm
->exit_reason
!= STILL_RUNNING
) {
176 while (shm
->mainpid
!= 0) {
177 /* make sure main is still alive, to wait for kids. */
178 ret
= __check_main();
187 ret
= __check_main();
191 /* if the first arg was an fd, find out which one it was. */
192 unsigned int check_if_fd(unsigned int child
)
194 unsigned int highest
;
195 unsigned callno
= shm
->syscallno
[child
];
197 /* shortcut, if it's out of range, it's not going to be valid. */
198 if (shm
->a1
[child
] > 1024)
201 highest
= highest_logfile();
202 if (shm
->a1
[child
] < highest
)
205 if (biarch
== FALSE
) {
206 if (syscalls
[callno
].entry
->arg1type
== ARG_FD
)
212 if (shm
->do32bit
[child
] == TRUE
) {
213 if (syscalls_32bit
[callno
].entry
->arg1type
== ARG_FD
)
216 if (syscalls_64bit
[callno
].entry
->arg1type
== ARG_FD
)
223 static void stuck_syscall_info(int childno
)
225 unsigned int callno
= shm
->syscallno
[childno
];
227 pid_t pid
= shm
->pids
[childno
];
229 memset(fdstr
, 0, sizeof(fdstr
));
231 if (check_if_fd(childno
) == TRUE
)
232 sprintf(fdstr
, "(fd = %d)", (unsigned int) shm
->a1
[childno
]);
234 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
236 print_syscall_name(shm
->syscallno
[childno
], shm
->do32bit
[childno
]),
237 shm
->do32bit
[childno
] ? " (32bit)" : "",
241 static void check_children(void)
249 for_each_pidslot(i
) {
252 if (pid
== EMPTY_PIDSLOT
)
255 old
= shm
->tv
[i
].tv_sec
;
260 gettimeofday(&tv
, NULL
);
263 /* if we wrapped, just reset it, we'll pick it up next time around. */
264 if (old
> (now
+ 3)) {
265 output(1, "child %d wrapped! old=%ld now=%ld\n", i
, old
, now
);
266 shm
->tv
[i
].tv_sec
= now
;
272 /* if we're way off, we're comparing garbage. Reset it. */
274 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i
, pid
, old
, now
, diff
);
275 shm
->tv
[i
].tv_sec
= now
;
279 /* After 30 seconds of no progress, send a kill signal. */
281 stuck_syscall_info(i
);
282 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
283 pid
, old
, now
, diff
);
289 if (shm
->kill_count
[i
] > 1) {
290 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
291 pid
, shm
->kill_count
[i
], diff
);
293 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
296 shm
->kill_count
[i
]++;
297 ret
= kill(pid
, SIGKILL
);
299 output(0, "couldn't kill pid %d [%s]\n", pid
, strerror(errno
));
301 sleep(1); // give child time to exit.
306 static void watchdog(void)
308 static const char watchdogname
[17]="trinity-watchdog";
309 static unsigned long lastcount
= 0;
310 bool watchdog_exit
= FALSE
;
313 while (shm
->ready
== FALSE
) {
315 if (shm
->exit_reason
!= STILL_RUNNING
)
319 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid
);
321 prctl(PR_SET_NAME
, (unsigned long) &watchdogname
);
322 (void)signal(SIGSEGV
, SIG_DFL
);
324 while (watchdog_exit
== FALSE
) {
326 if (check_shm_sanity() == SHM_CORRUPT
)
329 if (check_main_alive() == FALSE
)
332 if (shm
->regenerating
== FALSE
) {
339 if (syscalls_todo
&& (shm
->total_syscalls_done
>= syscalls_todo
)) {
340 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo
);
341 shm
->exit_reason
= EXIT_REACHED_COUNT
;
344 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
345 if (shm
->total_syscalls_done
% 1000 == 0)
348 for_each_pidslot(i
) {
349 if (shm
->child_syscall_count
[i
] > hiscore
)
350 hiscore
= shm
->child_syscall_count
[i
];
353 if (shm
->total_syscalls_done
> 1) {
354 if (shm
->total_syscalls_done
- lastcount
> 10000) {
355 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
356 shm
->total_syscalls_done
,
357 shm
->failures
, shm
->successes
,
359 lastcount
= shm
->total_syscalls_done
;
365 /* Only check taint if it mask allows it */
366 if (kernel_taint_mask
!= 0) {
367 ret
= check_tainted();
368 if (((ret
& kernel_taint_mask
) & (~kernel_taint_initial
)) != 0) {
369 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret
, kernel_taint_initial
, shm
->seed
);
370 shm
->exit_reason
= EXIT_KERNEL_TAINTED
;
374 if (shm
->need_reseed
== FALSE
) {
375 shm
->reseed_counter
++;
376 /* If we haven't reseeded in five minutes, trigger one. */
377 if (shm
->reseed_counter
== 300) {
378 output(0, "Triggering periodic reseed.\n");
379 shm
->need_reseed
= TRUE
;
380 shm
->reseed_counter
= 0;
386 if (shm
->exit_reason
!= STILL_RUNNING
) {
387 /* Give children a chance to exit. */
390 /* Are there still children running ? */
391 if (pidmap_empty() == TRUE
)
392 watchdog_exit
= TRUE
;
394 output(0, "exit_reason=%d, but %d children still running.\n",
395 shm
->exit_reason
, shm
->running_childs
);
404 /* We don't want to ever exit before main is waiting for us. */
405 while (shm
->regenerating
== TRUE
)
411 void init_watchdog(void)
419 watchdog_pid
= getpid();
421 output(0, "[%d] Watchdog exiting\n", watchdog_pid
);
426 output(0, "Started watchdog process, PID is %d\n", watchdog_pid
);