2 fork on steroids to avoid SIGCHLD and waitpid
4 Copyright (C) Stefan Metzmacher 2010
5 Copyright (C) Ralph Boehme 2017
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "system/wait.h"
23 #include "system/filesys.h"
24 #include "system/network.h"
25 #include "lib/util/samba_util.h"
26 #include "lib/util/sys_rw.h"
27 #include "lib/util/tfork.h"
28 #include "lib/util/debug.h"
29 #include "lib/util/util_process.h"
41 * This is what the process hierarchy looks like:
62 #ifdef HAVE_VALGRIND_HELGRIND_H
63 #include <valgrind/helgrind.h>
65 #ifndef ANNOTATE_BENIGN_RACE_SIZED
66 #define ANNOTATE_BENIGN_RACE_SIZED(obj, size, description)
69 #define TFORK_ANNOTATE_BENIGN_RACE(obj) \
70 ANNOTATE_BENIGN_RACE_SIZED( \
71 (obj), sizeof(*(obj)), \
72 "no race, serialized by tfork_[un]install_sigchld_handler");
75 * The resulting (private) state per tfork_create() call, returned as an opaque
76 * handle to the caller.
80 * This is returned to the caller with tfork_event_fd()
85 * This is used in the caller by tfork_status() to read the worker exit
86 * status and to tell the waiter to exit by closing the fd.
95 * Internal per-thread state maintained while inside tfork.
105 * A global state that synchronizes access to handling SIGCHLD and waiting for
108 struct tfork_signal_state
{
113 pthread_mutex_t mutex
;
117 * pid of the waiter child. This points at waiter_pid in either struct
118 * tfork or struct tfork_state, depending on who called
119 * tfork_install_sigchld_handler().
121 * When tfork_install_sigchld_handler() is called the waiter_pid is
122 * still -1 and only set later after fork(), that's why this must be
123 * a pointer. The signal handler checks this.
127 struct sigaction oldact
;
131 static struct tfork_signal_state signal_state
;
134 static pthread_once_t tfork_global_is_initialized
= PTHREAD_ONCE_INIT
;
135 static pthread_key_t tfork_global_key
;
137 static struct tfork_state
*global_state
;
140 static void tfork_sigchld_handler(int signum
, siginfo_t
*si
, void *p
);
143 static void tfork_global_destructor(void *state
)
145 anonymous_shared_free(state
);
149 static int tfork_acquire_sighandling(void)
154 ret
= pthread_mutex_lock(&signal_state
.mutex
);
159 while (!signal_state
.available
) {
160 ret
= pthread_cond_wait(&signal_state
.cond
,
161 &signal_state
.mutex
);
167 signal_state
.available
= false;
169 ret
= pthread_mutex_unlock(&signal_state
.mutex
);
178 static int tfork_release_sighandling(void)
183 ret
= pthread_mutex_lock(&signal_state
.mutex
);
188 signal_state
.available
= true;
190 ret
= pthread_cond_signal(&signal_state
.cond
);
192 pthread_mutex_unlock(&signal_state
.mutex
);
196 ret
= pthread_mutex_unlock(&signal_state
.mutex
);
206 static void tfork_atfork_prepare(void)
210 ret
= pthread_mutex_lock(&signal_state
.mutex
);
214 static void tfork_atfork_parent(void)
218 ret
= pthread_mutex_unlock(&signal_state
.mutex
);
223 static void tfork_atfork_child(void)
228 ret
= pthread_mutex_unlock(&signal_state
.mutex
);
231 ret
= pthread_key_delete(tfork_global_key
);
234 ret
= pthread_key_create(&tfork_global_key
, tfork_global_destructor
);
238 * There's no data race on the cond variable from the signal state, we
239 * are writing here, but there are no readers yet. Some data race
240 * detection tools report a race, but the readers are in the parent
243 TFORK_ANNOTATE_BENIGN_RACE(&signal_state
.cond
);
246 * There's no way to destroy a condition variable if there are waiters,
247 * pthread_cond_destroy() will return EBUSY. Just zero out memory and
248 * then initialize again. This is not backed by POSIX but should be ok.
250 ZERO_STRUCT(signal_state
.cond
);
251 ret
= pthread_cond_init(&signal_state
.cond
, NULL
);
255 if (signal_state
.pid
!= NULL
) {
257 ret
= sigaction(SIGCHLD
, &signal_state
.oldact
, NULL
);
261 ret
= pthread_sigmask(SIG_SETMASK
, &signal_state
.oldset
, NULL
);
263 ret
= sigprocmask(SIG_SETMASK
, &signal_state
.oldset
, NULL
);
267 signal_state
.pid
= NULL
;
270 signal_state
.available
= true;
273 static void tfork_global_initialize(void)
278 pthread_atfork(tfork_atfork_prepare
,
282 ret
= pthread_key_create(&tfork_global_key
, tfork_global_destructor
);
285 ret
= pthread_mutex_init(&signal_state
.mutex
, NULL
);
288 ret
= pthread_cond_init(&signal_state
.cond
, NULL
);
292 * In a threaded process there's no data race on t->waiter_pid as
293 * we're serializing globally via tfork_acquire_sighandling() and
294 * tfork_release_sighandling().
296 TFORK_ANNOTATE_BENIGN_RACE(&signal_state
.pid
);
299 signal_state
.available
= true;
302 static struct tfork_state
*tfork_global_get(void)
304 struct tfork_state
*state
= NULL
;
310 state
= (struct tfork_state
*)pthread_getspecific(tfork_global_key
);
312 state
= global_state
;
318 state
= (struct tfork_state
*)anonymous_shared_allocate(
319 sizeof(struct tfork_state
));
325 ret
= pthread_setspecific(tfork_global_key
, state
);
327 anonymous_shared_free(state
);
334 static void tfork_global_free(void)
336 struct tfork_state
*state
= NULL
;
342 state
= (struct tfork_state
*)pthread_getspecific(tfork_global_key
);
344 state
= global_state
;
351 ret
= pthread_setspecific(tfork_global_key
, NULL
);
356 anonymous_shared_free(state
);
360 * Only one thread at a time is allowed to handle SIGCHLD signals
362 static int tfork_install_sigchld_handler(pid_t
*pid
)
365 struct sigaction act
;
368 ret
= tfork_acquire_sighandling();
373 assert(signal_state
.pid
== NULL
);
374 signal_state
.pid
= pid
;
376 act
= (struct sigaction
) {
377 .sa_sigaction
= tfork_sigchld_handler
,
378 .sa_flags
= SA_SIGINFO
,
381 ret
= sigaction(SIGCHLD
, &act
, &signal_state
.oldact
);
387 sigaddset(&set
, SIGCHLD
);
389 ret
= pthread_sigmask(SIG_UNBLOCK
, &set
, &signal_state
.oldset
);
391 ret
= sigprocmask(SIG_UNBLOCK
, &set
, &signal_state
.oldset
);
400 static int tfork_uninstall_sigchld_handler(void)
404 signal_state
.pid
= NULL
;
406 ret
= sigaction(SIGCHLD
, &signal_state
.oldact
, NULL
);
412 ret
= pthread_sigmask(SIG_SETMASK
, &signal_state
.oldset
, NULL
);
414 ret
= sigprocmask(SIG_SETMASK
, &signal_state
.oldset
, NULL
);
420 ret
= tfork_release_sighandling();
428 static void tfork_sigchld_handler(int signum
, siginfo_t
*si
, void *p
)
430 if ((signal_state
.pid
!= NULL
) &&
431 (*signal_state
.pid
!= -1) &&
432 (si
->si_pid
== *signal_state
.pid
))
438 * Not our child, forward to old handler
440 if (signal_state
.oldact
.sa_flags
& SA_SIGINFO
) {
441 signal_state
.oldact
.sa_sigaction(signum
, si
, p
);
445 if (signal_state
.oldact
.sa_handler
== SIG_IGN
) {
448 if (signal_state
.oldact
.sa_handler
== SIG_DFL
) {
451 signal_state
.oldact
.sa_handler(signum
);
454 static pid_t
tfork_start_waiter_and_worker(struct tfork_state
*state
,
459 int status_sp_caller_fd
= -1;
460 int status_sp_waiter_fd
= -1;
461 int event_pipe_caller_fd
= -1;
462 int event_pipe_waiter_fd
= -1;
463 int ready_pipe_caller_fd
= -1;
464 int ready_pipe_worker_fd
= -1;
480 ret
= socketpair(AF_UNIX
, SOCK_STREAM
, 0, p
);
484 set_close_on_exec(p
[0]);
485 set_close_on_exec(p
[1]);
486 status_sp_caller_fd
= p
[0];
487 status_sp_waiter_fd
= p
[1];
491 close(status_sp_caller_fd
);
492 close(status_sp_waiter_fd
);
495 set_close_on_exec(p
[0]);
496 set_close_on_exec(p
[1]);
497 event_pipe_caller_fd
= p
[0];
498 event_pipe_waiter_fd
= p
[1];
503 close(status_sp_caller_fd
);
504 close(status_sp_waiter_fd
);
505 close(event_pipe_caller_fd
);
506 close(event_pipe_waiter_fd
);
509 set_close_on_exec(p
[0]);
510 set_close_on_exec(p
[1]);
511 ready_pipe_worker_fd
= p
[0];
512 ready_pipe_caller_fd
= p
[1];
516 close(status_sp_caller_fd
);
517 close(status_sp_waiter_fd
);
518 close(event_pipe_caller_fd
);
519 close(event_pipe_waiter_fd
);
520 close(ready_pipe_caller_fd
);
521 close(ready_pipe_worker_fd
);
528 * In a threaded process there's no data race on
529 * state->waiter_pid as we're serializing globally via
530 * tfork_acquire_sighandling() and tfork_release_sighandling().
532 TFORK_ANNOTATE_BENIGN_RACE(&state
->waiter_pid
);
534 state
->waiter_pid
= pid
;
536 close(status_sp_waiter_fd
);
537 close(event_pipe_waiter_fd
);
538 close(ready_pipe_worker_fd
);
540 set_blocking(event_pipe_caller_fd
, false);
543 * Wait for the waiter to get ready.
545 nread
= sys_read(status_sp_caller_fd
, &c
, sizeof(char));
546 if (nread
!= sizeof(char)) {
551 * Notify the worker to start.
553 nwritten
= sys_write(ready_pipe_caller_fd
,
554 &(char){0}, sizeof(char));
555 if (nwritten
!= sizeof(char)) {
556 close(ready_pipe_caller_fd
);
559 close(ready_pipe_caller_fd
);
561 *_event_fd
= event_pipe_caller_fd
;
562 *_status_fd
= status_sp_caller_fd
;
568 /* cleanup sigchld_handler */
569 tfork_atfork_child();
573 * The "waiter" child.
575 process_set_title("tfork waiter", "tfork waiter process");
577 CatchSignal(SIGCHLD
, SIG_DFL
);
579 close(status_sp_caller_fd
);
580 close(event_pipe_caller_fd
);
581 close(ready_pipe_caller_fd
);
585 state
->waiter_errno
= errno
;
593 close(status_sp_waiter_fd
);
594 close(event_pipe_waiter_fd
);
597 * Wait for the caller to give us a go!
599 nread
= sys_read(ready_pipe_worker_fd
, &c
, sizeof(char));
600 if (nread
!= sizeof(char)) {
603 close(ready_pipe_worker_fd
);
607 state
->worker_pid
= pid
;
608 process_set_title("tfork(%d)", "tfork waiter process(%d)", pid
);
610 close(ready_pipe_worker_fd
);
613 * We're going to stay around until child2 exits, so let's close all fds
614 * other than the pipe fd we may have inherited from the caller.
616 * Dup event_pipe_waiter_fd and status_sp_waiter_fd onto fds 0 and 1 so we
617 * can then call closefrom(2).
619 if (event_pipe_waiter_fd
> 0) {
622 if (status_sp_waiter_fd
== 0) {
627 fd
= dup2(event_pipe_waiter_fd
, dup_fd
);
628 } while ((fd
== -1) && (errno
== EINTR
));
630 state
->waiter_errno
= errno
;
631 kill(state
->worker_pid
, SIGKILL
);
632 state
->worker_pid
= -1;
635 event_pipe_waiter_fd
= fd
;
638 if (status_sp_waiter_fd
> 1) {
640 fd
= dup2(status_sp_waiter_fd
, 1);
641 } while ((fd
== -1) && (errno
== EINTR
));
643 state
->waiter_errno
= errno
;
644 kill(state
->worker_pid
, SIGKILL
);
645 state
->worker_pid
= -1;
648 status_sp_waiter_fd
= fd
;
653 /* Tell the caller we're ready */
654 nwritten
= sys_write(status_sp_waiter_fd
, &(char){0}, sizeof(char));
655 if (nwritten
!= sizeof(char)) {
663 ret
= waitpid(pid
, &status
, 0);
664 } while ((ret
== -1) && (errno
== EINTR
));
671 * This writes the worker child exit status via our internal socketpair
672 * so the tfork_status() implementation can read it from its end.
674 nwritten
= sys_write(status_sp_waiter_fd
, &status
, sizeof(status
));
675 if (nwritten
== -1) {
676 if (errno
!= EPIPE
&& errno
!= ECONNRESET
) {
680 * The caller exited and didn't call tfork_status().
684 if (nwritten
!= sizeof(status
)) {
689 * This writes to the event_fd returned by tfork_event_fd() and notifies
690 * the caller that the worker child is done and he may now call
693 nwritten
= sys_write(event_pipe_waiter_fd
, &(char){0}, sizeof(char));
694 if (nwritten
!= sizeof(char)) {
699 * Wait for our parent (the process that called tfork_create()) to
700 * close() the socketpair fd in tfork_status().
702 * Again, the caller might have exited without calling tfork_status().
704 nread
= sys_read(status_sp_waiter_fd
, &c
, 1);
706 if (errno
== EPIPE
|| errno
== ECONNRESET
) {
718 static int tfork_create_reap_waiter(pid_t waiter_pid
)
723 if (waiter_pid
== -1) {
727 kill(waiter_pid
, SIGKILL
);
730 pid
= waitpid(waiter_pid
, &waiter_status
, 0);
731 } while ((pid
== -1) && (errno
== EINTR
));
732 assert(pid
== waiter_pid
);
737 struct tfork
*tfork_create(void)
739 struct tfork_state
*state
= NULL
;
740 struct tfork
*t
= NULL
;
747 ret
= pthread_once(&tfork_global_is_initialized
,
748 tfork_global_initialize
);
753 tfork_global_initialize();
756 state
= tfork_global_get();
760 *state
= (struct tfork_state
) {
762 .waiter_errno
= ECANCELED
,
766 t
= malloc(sizeof(struct tfork
));
772 *t
= (struct tfork
) {
779 ret
= tfork_install_sigchld_handler(&state
->waiter_pid
);
784 pid
= tfork_start_waiter_and_worker(state
,
799 * In a threaded process there's no data race on t->waiter_pid as
800 * we're serializing globally via tfork_acquire_sighandling() and
801 * tfork_release_sighandling().
803 TFORK_ANNOTATE_BENIGN_RACE(&t
->waiter_pid
);
806 t
->worker_pid
= state
->worker_pid
;
813 if (t
->status_fd
!= -1) {
816 if (t
->event_fd
!= -1) {
820 ret2
= tfork_create_reap_waiter(state
->waiter_pid
);
828 ret2
= tfork_uninstall_sigchld_handler();
839 pid_t
tfork_child_pid(const struct tfork
*t
)
841 return t
->worker_pid
;
844 int tfork_event_fd(struct tfork
*t
)
846 int fd
= t
->event_fd
;
848 assert(t
->event_fd
!= -1);
854 int tfork_status(struct tfork
**_t
, bool wait
)
856 struct tfork
*t
= *_t
;
868 set_blocking(t
->status_fd
, true);
870 nread
= sys_read(t
->status_fd
, &status
, sizeof(int));
872 set_blocking(t
->status_fd
, false);
874 nread
= read(t
->status_fd
, &status
, sizeof(int));
876 ((errno
== EAGAIN
) || (errno
== EWOULDBLOCK
) || errno
== EINTR
)) {
881 if (nread
!= sizeof(int)) {
885 ret
= tfork_install_sigchld_handler(&t
->waiter_pid
);
891 * This triggers process exit in the waiter.
892 * We write to the fd as well as closing it, as any tforked sibling
893 * processes will also have the writable end of this socket open.
898 nwritten
= sys_write(t
->status_fd
, &(char){0}, sizeof(char));
899 if (nwritten
!= sizeof(char)) {
907 pid
= waitpid(t
->waiter_pid
, &waiter_status
, 0);
908 } while ((pid
== -1) && (errno
== EINTR
));
909 assert(pid
== t
->waiter_pid
);
911 if (t
->event_fd
!= -1) {
920 ret
= tfork_uninstall_sigchld_handler();
926 int tfork_destroy(struct tfork
**_t
)
928 struct tfork
*t
= *_t
;
936 kill(t
->worker_pid
, SIGKILL
);
938 ret
= tfork_status(_t
, true);