12 #include <sys/ptrace.h>
14 #include "arch.h" // biarch
18 #include "params.h" // quiet_level
23 #include "trinity.h" //check_taint
27 static unsigned long hiscore
= 0;
29 static int check_shm_sanity(void)
34 if (shm
->running_childs
== 0)
39 if (pid
== EMPTY_PIDSLOT
)
42 if (pid_is_valid(pid
) == FALSE
) {
43 shm
->exit_reason
= EXIT_PID_OUT_OF_RANGE
;
48 // FIXME: The '500000' is magic, and should be dynamically calculated.
49 // On startup, we should figure out how many getpid()'s per second we can do,
51 if (shm
->total_syscalls_done
- shm
->previous_count
> 500000) {
52 output(0, "Execcount increased dramatically! (old:%ld new:%ld):\n",
53 shm
->previous_count
, shm
->total_syscalls_done
);
54 shm
->exit_reason
= EXIT_SHM_CORRUPTION
;
56 shm
->previous_count
= shm
->total_syscalls_done
;
61 static unsigned int reap_dead_kids(void)
64 unsigned int alive
= 0;
65 unsigned int reaped
= 0;
72 if (pid
== EMPTY_PIDSLOT
)
76 /* If it disappeared, reap it. */
79 output(0, "pid %d has disappeared (oom-killed maybe?). Reaping.\n", pid
);
83 output(0, "problem checking on pid %d (%d:%s)\n", pid
, errno
, strerror(errno
));
89 if (shm
->running_childs
== 0)
94 output(0, "Reaped %d dead children\n", reaped
);
99 static void kill_all_kids(void)
103 shm
->spawn_no_more
= TRUE
;
105 /* Wait for all the children to exit. */
106 while (shm
->running_childs
> 0) {
109 /* Make sure there's no dead kids lying around.
110 * We need to do this in case the oom killer has been killing them,
111 * otherwise we end up stuck here with no child processes.
113 alive
= reap_dead_kids();
117 /* Ok, some kids are still alive. 'help' them along with a SIGKILL */
118 for_each_pidslot(i
) {
122 if (pid
== EMPTY_PIDSLOT
)
128 /* wait a second to give kids a chance to exit. */
131 if (check_shm_sanity()) {
132 // FIXME: If we get here, we over-wrote the real exit_reason.
133 // We should have saved that, and handled appropriately.
138 /* Just to be sure, clear out the pid slots. */
139 for_each_pidslot(i
) {
140 shm
->pids
[i
] = EMPTY_PIDSLOT
;
144 static int check_main_alive(void)
148 /* If we're in the process of exiting, wait, and return without checking. */
149 if (shm
->exit_reason
!= STILL_RUNNING
) {
150 while (shm
->mainpid
!= 0) {
157 if (shm
->mainpid
== 0) {
158 output(0, "main pid was zero!\n");
159 shm
->exit_reason
= EXIT_MAIN_DISAPPEARED
;
163 ret
= kill(shm
->mainpid
, 0);
165 if (errno
== ESRCH
) {
166 output(0, "main pid %d has disappeared.\n", shm
->mainpid
);
167 shm
->exit_reason
= EXIT_MAIN_DISAPPEARED
;
169 /* if main crashed while regenerating, we'll hang the watchdog,
170 * because nothing will ever set it back to FALSE. So we do it ourselves.
172 shm
->regenerating
= FALSE
;
174 output(0, "problem checking on pid %d (%d:%s)\n", shm
->mainpid
, errno
, strerror(errno
));
181 /* if the first arg was an fd, find out which one it was. */
182 unsigned int check_if_fd(unsigned int child
)
184 unsigned int highest
;
185 unsigned callno
= shm
->syscallno
[child
];
187 /* shortcut, if it's out of range, it's not going to be valid. */
188 if (shm
->a1
[child
] > 1024)
191 highest
= highest_logfile();
192 if (shm
->a1
[child
] < highest
)
195 if (biarch
== FALSE
) {
196 if (syscalls
[callno
].entry
->arg1type
== ARG_FD
)
202 if (shm
->do32bit
[child
] == TRUE
) {
203 if (syscalls_32bit
[callno
].entry
->arg1type
== ARG_FD
)
206 if (syscalls_64bit
[callno
].entry
->arg1type
== ARG_FD
)
213 static void stuck_syscall_info(int childno
)
215 unsigned int callno
= shm
->syscallno
[childno
];
217 pid_t pid
= shm
->pids
[childno
];
219 memset(fdstr
, 0, sizeof(fdstr
));
221 if (check_if_fd(childno
) == TRUE
)
222 sprintf(fdstr
, "(fd = %d)", (unsigned int) shm
->a1
[childno
]);
224 output(0, "[%d] Stuck in syscall %d:%s%s%s.\n",
226 print_syscall_name(shm
->syscallno
[childno
], shm
->do32bit
[childno
]),
227 shm
->do32bit
[childno
] ? " (32bit)" : "",
231 static void check_children(void)
239 gettimeofday(&tv
, NULL
);
242 for_each_pidslot(i
) {
245 if (pid
== EMPTY_PIDSLOT
)
248 old
= shm
->tv
[i
].tv_sec
;
253 /* if we wrapped, just reset it, we'll pick it up next time around. */
254 if (old
> (now
+ 3)) {
255 output(1, "child %d wrapped! old=%ld now=%ld\n", i
, old
, now
);
256 shm
->tv
[i
].tv_sec
= now
;
262 /* if we're way off, we're comparing garbage. Reset it. */
264 output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d. Setting to now.\n", i
, pid
, old
, now
, diff
);
265 shm
->tv
[i
].tv_sec
= now
;
269 /* After 30 seconds of no progress, send a kill signal. */
271 stuck_syscall_info(i
);
272 output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
273 pid
, old
, now
, diff
);
279 if (shm
->kill_count
[i
] > 1) {
280 output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
281 pid
, shm
->kill_count
[i
], diff
);
283 output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
286 shm
->kill_count
[i
]++;
287 ret
= kill(pid
, SIGKILL
);
289 output(0, "couldn't kill pid %d [%s]\n", pid
, strerror(errno
));
291 sleep(1); // give child time to exit.
296 static void watchdog(void)
298 static const char watchdogname
[17]="trinity-watchdog";
299 static unsigned long lastcount
= 0;
300 bool watchdog_exit
= FALSE
;
303 while (shm
->ready
== FALSE
) {
305 if (shm
->exit_reason
!= STILL_RUNNING
)
309 output(0, "Watchdog is alive. (pid:%d)\n", watchdog_pid
);
311 prctl(PR_SET_NAME
, (unsigned long) &watchdogname
);
312 (void)signal(SIGSEGV
, SIG_DFL
);
314 while (watchdog_exit
== FALSE
) {
316 if (check_shm_sanity() == SHM_CORRUPT
)
319 if (check_main_alive() == FALSE
)
322 if (shm
->regenerating
== FALSE
) {
329 if (syscalls_todo
&& (shm
->total_syscalls_done
>= syscalls_todo
)) {
330 output(0, "Reached limit %d. Telling children to exit.\n", syscalls_todo
);
331 shm
->exit_reason
= EXIT_REACHED_COUNT
;
334 // Periodic log syncing. FIXME: This is kinda ugly, and mostly unnecessary.
335 if (shm
->total_syscalls_done
% 1000 == 0)
338 for_each_pidslot(i
) {
339 if (shm
->child_syscall_count
[i
] > hiscore
)
340 hiscore
= shm
->child_syscall_count
[i
];
343 if (shm
->total_syscalls_done
> 1) {
344 if (shm
->total_syscalls_done
- lastcount
> 10000) {
345 output(0, "%ld iterations. [F:%ld S:%ld HI:%ld]\n",
346 shm
->total_syscalls_done
,
347 shm
->failures
, shm
->successes
,
349 lastcount
= shm
->total_syscalls_done
;
355 /* Only check taint if it mask allows it */
356 if (kernel_taint_mask
!= 0) {
357 ret
= check_tainted();
358 if (((ret
& kernel_taint_mask
) & (~kernel_taint_initial
)) != 0) {
359 output(0, "kernel became tainted! (%d/%d) Last seed was %u\n", ret
, kernel_taint_initial
, shm
->seed
);
360 shm
->exit_reason
= EXIT_KERNEL_TAINTED
;
364 if (shm
->need_reseed
== FALSE
) {
365 shm
->reseed_counter
++;
366 /* If we haven't reseeded in five minutes, trigger one. */
367 if (shm
->reseed_counter
== 300) {
368 output(0, "Triggering periodic reseed.\n");
369 shm
->need_reseed
= TRUE
;
370 shm
->reseed_counter
= 0;
376 if (shm
->exit_reason
!= STILL_RUNNING
) {
377 /* Give children a chance to exit. */
380 /* Are there still children running ? */
381 if (pidmap_empty() == TRUE
)
382 watchdog_exit
= TRUE
;
384 output(0, "exit_reason=%d, but %d children still running.\n",
385 shm
->exit_reason
, shm
->running_childs
);
394 /* We don't want to ever exit before main is waiting for us. */
395 while (shm
->regenerating
== TRUE
)
401 void init_watchdog(void)
409 watchdog_pid
= getpid();
411 output(0, "[%d] Watchdog exiting\n", watchdog_pid
);
416 output(0, "Started watchdog process, PID is %d\n", watchdog_pid
);