4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013, Joyent, Inc. All rights reserved.
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/sysmacros.h>
33 #include <sys/signal.h>
35 #include <sys/policy.h>
37 #include <sys/systm.h>
38 #include <sys/cpuvar.h>
40 #include <sys/vnode.h>
42 #include <sys/errno.h>
45 #include <sys/cmn_err.h>
47 #include <sys/tuneable.h>
48 #include <sys/class.h>
50 #include <sys/session.h>
51 #include <sys/ucontext.h>
52 #include <sys/stack.h>
53 #include <sys/procfs.h>
54 #include <sys/prsystm.h>
55 #include <sys/vmsystm.h>
56 #include <sys/vtrace.h>
57 #include <sys/debug.h>
58 #include <sys/shm_impl.h>
59 #include <sys/door_data.h>
64 #include <sys/schedctl.h>
65 #include <sys/utrap.h>
67 #include <sys/resource.h>
68 #include <sys/cyclic.h>
71 #include <sys/contract_impl.h>
72 #include <sys/contract/process_impl.h>
74 #include <sys/dtrace.h>
78 #include <sys/class.h>
79 #include <sys/corectl.h>
80 #include <sys/brand.h>
83 static int64_t cfork(int, int, int);
84 static int getproc(proc_t
**, pid_t
, uint_t
);
85 #define GETPROC_USER 0x0
86 #define GETPROC_KERNEL 0x1
88 static void fork_fail(proc_t
*);
89 static void forklwp_fail(proc_t
*);
91 int fork_fail_pending
;
93 extern struct kmem_cache
*process_cache
;
96 * The vfork() system call trap is no longer invoked by libc.
97 * It is retained only for the benefit of applications running
98 * within a solaris10 branded zone. It should be eliminated
99 * when we no longer support solaris10 branded zones.
104 curthread
->t_post_sys
= 1; /* so vfwait() will be called */
105 return (cfork(1, 1, 0));
109 * forksys system call - forkx, forkallx, vforkx. This is the
110 * interface invoked by libc for fork1(), forkall(), and vfork()
113 forksys(int subcode
, int flags
)
117 return (cfork(0, 1, flags
)); /* forkx(flags) */
119 return (cfork(0, 0, flags
)); /* forkallx(flags) */
121 curthread
->t_post_sys
= 1; /* so vfwait() will be called */
122 return (cfork(1, 1, flags
)); /* vforkx(flags) */
124 return ((int64_t)set_errno(EINVAL
));
129 * Remove the associations of a child process from its parent and siblings.
132 disown_proc(proc_t
*pp
, proc_t
*cp
)
136 ASSERT(MUTEX_HELD(&pidlock
));
138 orphpp
= &pp
->p_orphan
;
139 while (*orphpp
!= cp
)
140 orphpp
= &(*orphpp
)->p_nextorph
;
141 *orphpp
= cp
->p_nextorph
;
143 if (pp
->p_child
== cp
)
144 pp
->p_child
= cp
->p_sibling
;
146 cp
->p_sibling
->p_psibling
= cp
->p_psibling
;
148 cp
->p_psibling
->p_sibling
= cp
->p_sibling
;
153 cfork(int isvfork
, int isfork1
, int flags
)
155 proc_t
*p
= ttoproc(curthread
);
165 rctl_alloc_gp_t
*dup_gp
;
172 * Allow only these two flags.
174 if ((flags
& ~(FORK_NOSIGCHLD
| FORK_WAITPID
)) != 0) {
176 atomic_inc_32(&curproc
->p_zone
->zone_ffmisc
);
181 * fork is not supported for the /proc agent lwp.
183 if (curthread
== p
->p_agenttp
) {
185 atomic_inc_32(&curproc
->p_zone
->zone_ffmisc
);
189 if ((error
= secpolicy_basic_fork(CRED())) != 0) {
190 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
195 * If the calling lwp is doing a fork1() then the
196 * other lwps in this process are not duplicated and
197 * don't need to be held where their kernel stacks can be
198 * cloned. If doing forkall(), the process is held with
199 * SHOLDFORK, so that the lwps are at a point where their
200 * stacks can be copied which is on entry or exit from
203 if (!holdlwps(isfork1
? SHOLDFORK1
: SHOLDFORK
)) {
206 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
212 * Ensure that the user stack is fully constructed
213 * before creating the child process structure.
215 (void) flush_user_windows_to_stack(NULL
);
218 mutex_enter(&p
->p_lock
);
220 * If this is vfork(), cancel any suspend request we might
221 * have gotten from some other thread via lwp_suspend().
222 * Otherwise we could end up with a deadlock on return
223 * from the vfork() in both the parent and the child.
226 curthread
->t_proc_flag
&= ~TP_HOLDLWP
;
228 * Prevent our resource set associations from being changed during fork.
230 pool_barrier_enter();
231 mutex_exit(&p
->p_lock
);
234 * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
236 if (getproc(&cp
, 0, GETPROC_USER
) < 0) {
237 mutex_enter(&p
->p_lock
);
240 mutex_exit(&p
->p_lock
);
245 TRACE_2(TR_FAC_PROC
, TR_PROC_FORK
, "proc_fork:cp %p p %p", cp
, p
);
248 * Assign an address space to child
252 * Clear any watched areas and remember the
253 * watched pages for restoring in vfwait().
256 if (avl_numnodes(&as
->a_wpage
) != 0) {
257 AS_LOCK_ENTER(as
, RW_WRITER
);
259 p
->p_wpage
= as
->a_wpage
;
260 avl_create(&as
->a_wpage
, wp_compare
,
261 sizeof (struct watched_page
),
262 offsetof(struct watched_page
, wp_link
));
266 cp
->p_flag
|= SVFORK
;
269 * Use the parent's shm segment list information for
270 * the child as it uses its address space till it execs.
272 cp
->p_segacct
= p
->p_segacct
;
275 * We need to hold P_PR_LOCK until the address space has
276 * been duplicated and we've had a chance to remove from the
277 * child any DTrace probes that were in the parent. Holding
278 * P_PR_LOCK prevents any new probes from being added and any
279 * extant probes from being removed.
281 mutex_enter(&p
->p_lock
);
283 p
->p_flag
|= SFORKING
;
284 mutex_exit(&p
->p_lock
);
286 error
= as_dup(p
->p_as
, cp
);
288 mutex_enter(&p
->p_lock
);
291 mutex_enter(&pidlock
);
293 mutex_enter(&cp
->p_lock
);
296 ASSERT(cp
->p_pool
->pool_ref
> 0);
297 atomic_dec_32(&cp
->p_pool
->pool_ref
);
298 mutex_exit(&cp
->p_lock
);
300 mutex_exit(&pidlock
);
303 mutex_enter(&p
->p_lock
);
304 p
->p_flag
&= ~SFORKING
;
307 mutex_exit(&p
->p_lock
);
309 * Preserve ENOMEM error condition but
310 * map all others to EAGAIN.
312 error
= (error
== ENOMEM
) ? ENOMEM
: EAGAIN
;
313 atomic_inc_32(&p
->p_zone
->zone_ffnomem
);
318 * Remove all DTrace tracepoints from the child process. We
319 * need to do this _before_ duplicating USDT providers since
320 * any associated probes may be immediately enabled.
322 if (p
->p_dtrace_count
> 0)
323 dtrace_fasttrap_fork(p
, cp
);
325 mutex_enter(&p
->p_lock
);
328 /* Duplicate parent's shared memory */
333 * Duplicate any helper actions and providers. The SFORKING flag
334 * we set above tells the code that enables USDT probes that
335 * sprlock() may fail because the child is being forked.
337 if (p
->p_dtrace_helpers
!= NULL
) {
338 ASSERT(dtrace_helpers_fork
!= NULL
);
339 (*dtrace_helpers_fork
)(p
, cp
);
342 mutex_enter(&p
->p_lock
);
343 p
->p_flag
&= ~SFORKING
;
344 mutex_exit(&p
->p_lock
);
348 * Duplicate parent's resource controls.
350 dup_set
= rctl_set_create();
352 dup_gp
= rctl_set_dup_prealloc(p
->p_rctls
);
353 mutex_enter(&p
->p_rctls
->rcs_lock
);
354 if (rctl_set_dup_ready(p
->p_rctls
, dup_gp
))
356 mutex_exit(&p
->p_rctls
->rcs_lock
);
357 rctl_prealloc_destroy(dup_gp
);
360 e
.rcep_t
= RCENTITY_PROCESS
;
361 cp
->p_rctls
= rctl_set_dup(p
->p_rctls
, p
, cp
, &e
, dup_set
, dup_gp
,
362 RCD_DUP
| RCD_CALLBACK
);
363 mutex_exit(&p
->p_rctls
->rcs_lock
);
365 rctl_prealloc_destroy(dup_gp
);
368 * Allocate the child's lwp directory and lwpid hash table.
373 cp
->p_lwpdir_sz
= p
->p_lwpdir_sz
;
374 cp
->p_lwpdir
= cp
->p_lwpfree
= ldp
=
375 kmem_zalloc(cp
->p_lwpdir_sz
* sizeof (lwpdir_t
), KM_SLEEP
);
376 for (i
= 1; i
< cp
->p_lwpdir_sz
; i
++, ldp
++)
377 ldp
->ld_next
= ldp
+ 1;
378 cp
->p_tidhash_sz
= (cp
->p_lwpdir_sz
+ 2) / 2;
380 kmem_zalloc(cp
->p_tidhash_sz
* sizeof (tidhash_t
), KM_SLEEP
);
383 * Duplicate parent's lwps.
384 * Mutual exclusion is not needed because the process is
385 * in the hold state and only the current lwp is running.
387 klgrpset_clear(cp
->p_lgrpset
);
389 clone
= forklwp(ttolwp(curthread
), cp
, curthread
->t_tid
);
393 * Inherit only the lwp_wait()able flag.
394 * Daemon threads should not call fork1(), but oh well...
396 lwptot(clone
)->t_proc_flag
|=
397 (curthread
->t_proc_flag
& TP_TWAIT
);
399 /* this is forkall(), no one can be in lwp_wait() */
400 ASSERT(p
->p_lwpwait
== 0 && p
->p_lwpdwait
== 0);
401 /* for each entry in the parent's lwp directory... */
402 for (i
= 0, ldp
= p
->p_lwpdir
; i
< p
->p_lwpdir_sz
; i
++, ldp
++) {
406 if ((lep
= ldp
->ld_entry
) == NULL
)
409 if ((t
= lep
->le_thread
) != NULL
) {
410 clwp
= forklwp(ttolwp(t
), cp
, t
->t_tid
);
415 * Inherit lwp_wait()able and daemon flags.
418 (t
->t_proc_flag
& (TP_TWAIT
|TP_DAEMON
));
420 * Keep track of the clone of curthread to
421 * post return values through lwp_setrval().
422 * Mark other threads for special treatment
423 * by lwp_rtt() / post_syscall().
428 ct
->t_flag
|= T_FORKALL
;
431 * Replicate zombie lwps in the child.
433 clep
= kmem_zalloc(sizeof (*clep
), KM_SLEEP
);
434 clep
->le_lwpid
= lep
->le_lwpid
;
435 clep
->le_start
= lep
->le_start
;
436 lwp_hash_in(cp
, clep
,
437 cp
->p_tidhash
, cp
->p_tidhash_sz
, 0);
443 * Put new process in the parent's process contract, or put it
444 * in a new one if there is an active process template. Send a
445 * fork event (if requested) to whatever contract the child is
446 * a member of. Fails if the parent has been SIGKILLed.
448 if (contract_process_fork(NULL
, cp
, p
, B_TRUE
) == NULL
) {
449 atomic_inc_32(&p
->p_zone
->zone_ffmisc
);
454 * No fork failures occur beyond this point.
457 cp
->p_lwpid
= p
->p_lwpid
;
459 cp
->p_lwpdaemon
= p
->p_lwpdaemon
;
460 cp
->p_zombcnt
= p
->p_zombcnt
;
462 * If the parent's lwp ids have wrapped around, so have the
465 cp
->p_flag
|= p
->p_flag
& SLWPWRAP
;
468 mutex_enter(&p
->p_lock
);
469 corectl_path_hold(cp
->p_corefile
= p
->p_corefile
);
470 corectl_content_hold(cp
->p_content
= p
->p_content
);
471 mutex_exit(&p
->p_lock
);
474 * Duplicate process context ops, if any.
483 * If the child process has been marked to stop on exit
484 * from this fork, arrange for all other lwps to stop in
485 * sympathy with the active lwp.
487 if (PTOU(cp
)->u_systrap
&&
488 prismember(&PTOU(cp
)->u_exitmask
, curthread
->t_sysnum
)) {
489 mutex_enter(&cp
->p_lock
);
492 t
->t_proc_flag
|= TP_PRSTOP
;
493 aston(t
); /* so TP_PRSTOP will be seen */
494 } while ((t
= t
->t_forw
) != cp
->p_tlist
);
495 mutex_exit(&cp
->p_lock
);
498 * If the parent process has been marked to stop on exit
499 * from this fork, and its asynchronous-stop flag has not
500 * been set, arrange for all other lwps to stop before
501 * they return back to user level.
503 if (!(p
->p_proc_flag
& P_PR_ASYNC
) && PTOU(p
)->u_systrap
&&
504 prismember(&PTOU(p
)->u_exitmask
, curthread
->t_sysnum
)) {
505 mutex_enter(&p
->p_lock
);
508 t
->t_proc_flag
|= TP_PRSTOP
;
509 aston(t
); /* so TP_PRSTOP will be seen */
510 } while ((t
= t
->t_forw
) != p
->p_tlist
);
511 mutex_exit(&p
->p_lock
);
514 if (PROC_IS_BRANDED(p
))
515 BROP(p
)->b_lwp_setrval(clone
, p
->p_pid
, 1);
517 lwp_setrval(clone
, p
->p_pid
, 1);
519 /* set return values for parent */
520 r
.r_val1
= (int)cp
->p_pid
;
524 * pool_barrier_exit() can now be called because the child process has:
525 * - all identifying features cloned or set (p_pid, p_task, p_pool)
526 * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
527 * - any other fields set which are used in resource set binding.
529 mutex_enter(&p
->p_lock
);
531 mutex_exit(&p
->p_lock
);
533 mutex_enter(&pidlock
);
534 mutex_enter(&cp
->p_lock
);
537 * Set flags telling the child what (not) to do on exit.
539 if (flags
& FORK_NOSIGCHLD
)
540 cp
->p_pidflag
|= CLDNOSIGCHLD
;
541 if (flags
& FORK_WAITPID
)
542 cp
->p_pidflag
|= CLDWAITPID
;
545 * Now that there are lwps and threads attached, add the new
546 * process to the process group.
548 pgjoin(cp
, p
->p_pgidp
);
551 * We are now done with all the lwps in the child process.
556 * Set the lwp_suspend()ed lwps running.
557 * They will suspend properly at syscall exit.
559 if (t
->t_proc_flag
& TP_HOLDLWP
)
562 /* set TS_CREATE to allow continuelwps() to work */
564 ASSERT(t
->t_state
== TS_STOPPED
&&
565 !(t
->t_schedflag
& (TS_CREATE
|TS_CSTART
)));
566 t
->t_schedflag
|= TS_CREATE
;
569 } while ((t
= t
->t_forw
) != cp
->p_tlist
);
570 mutex_exit(&cp
->p_lock
);
573 CPU_STATS_ADDQ(CPU
, sys
, sysvfork
, 1);
574 mutex_enter(&p
->p_lock
);
575 p
->p_flag
|= SVFWAIT
;
576 curthread
->t_flag
|= T_VFPARENT
;
577 DTRACE_PROC1(create
, proc_t
*, cp
);
578 cv_broadcast(&pr_pid_cv
[p
->p_slot
]); /* inform /proc */
579 mutex_exit(&p
->p_lock
);
581 * Grab child's p_lock before dropping pidlock to ensure
582 * the process will not disappear before we set it running.
584 mutex_enter(&cp
->p_lock
);
585 mutex_exit(&pidlock
);
588 mutex_exit(&cp
->p_lock
);
590 CPU_STATS_ADDQ(CPU
, sys
, sysfork
, 1);
591 DTRACE_PROC1(create
, proc_t
*, cp
);
593 * It is CL_FORKRET's job to drop pidlock.
594 * If we do it here, the process could be set running
595 * and disappear before CL_FORKRET() is called.
597 CL_FORKRET(curthread
, cp
->p_tlist
);
598 schedctl_set_cidpri(curthread
);
599 ASSERT(MUTEX_NOT_HELD(&pidlock
));
606 if (avl_numnodes(&p
->p_wpage
) != 0) {
607 /* restore watchpoints to parent */
609 AS_LOCK_ENTER(as
, RW_WRITER
);
610 as
->a_wpage
= p
->p_wpage
;
611 avl_create(&p
->p_wpage
, wp_compare
,
612 sizeof (struct watched_page
),
613 offsetof(struct watched_page
, wp_link
));
626 for (i
= 0, ldp
= cp
->p_lwpdir
; i
< cp
->p_lwpdir_sz
; i
++, ldp
++)
627 if ((lep
= ldp
->ld_entry
) != NULL
)
628 kmem_free(lep
, sizeof (*lep
));
629 kmem_free(cp
->p_lwpdir
,
630 cp
->p_lwpdir_sz
* sizeof (*cp
->p_lwpdir
));
633 cp
->p_lwpfree
= NULL
;
637 kmem_free(cp
->p_tidhash
,
638 cp
->p_tidhash_sz
* sizeof (*cp
->p_tidhash
));
639 cp
->p_tidhash
= NULL
;
640 cp
->p_tidhash_sz
= 0;
644 if (cp
->p_dtrace_helpers
!= NULL
) {
645 ASSERT(dtrace_helpers_cleanup
!= NULL
);
646 (*dtrace_helpers_cleanup
)(cp
);
648 rctl_set_free(cp
->p_rctls
);
649 mutex_enter(&pidlock
);
652 * Detach failed child from task.
654 mutex_enter(&cp
->p_lock
);
657 ASSERT(cp
->p_pool
->pool_ref
> 0);
658 atomic_dec_32(&cp
->p_pool
->pool_ref
);
659 mutex_exit(&cp
->p_lock
);
663 mutex_exit(&pidlock
);
667 mutex_enter(&p
->p_lock
);
670 mutex_exit(&p
->p_lock
);
673 return ((int64_t)set_errno(error
));
677 * Free allocated resources from getproc() if a fork failed.
680 fork_fail(proc_t
*cp
)
682 uf_info_t
*fip
= P_FINFO(cp
);
685 sigdelq(cp
, NULL
, 0);
687 mutex_enter(&pidlock
);
688 upcount_dec(crgetruid(cp
->p_cred
), crgetzoneid(cp
->p_cred
));
689 mutex_exit(&pidlock
);
692 * single threaded, so no locking needed here
696 kmem_free(fip
->fi_list
, fip
->fi_nfiles
* sizeof (uf_entry_t
));
698 VN_RELE(PTOU(curproc
)->u_cdir
);
699 if (PTOU(curproc
)->u_rdir
)
700 VN_RELE(PTOU(curproc
)->u_rdir
);
704 VN_RELE(cp
->p_execdir
);
705 if (PTOU(curproc
)->u_cwd
)
706 refstr_rele(PTOU(curproc
)->u_cwd
);
707 if (PROC_IS_BRANDED(cp
)) {
708 brand_clearbrand(cp
, B_TRUE
);
713 * Clean up the lwps already created for this child process.
714 * The fork failed while duplicating all the lwps of the parent
715 * and those lwps already created must be freed.
716 * This process is invisible to the rest of the system,
717 * so we don't need to hold p->p_lock to protect the list.
720 forklwp_fail(proc_t
*p
)
726 if (PROC_IS_BRANDED(p
))
729 while ((t
= p
->p_tlist
) != NULL
) {
731 * First remove the lwp from the process's p_tlist.
734 p
->p_tlist
= t
->t_forw
;
738 t
->t_forw
->t_back
= t
->t_back
;
739 t
->t_back
->t_forw
= t
->t_forw
;
742 mutex_enter(&p
->p_zone
->zone_nlwps_lock
);
744 tk
->tk_proj
->kpj_nlwps
--;
745 p
->p_zone
->zone_nlwps
--;
746 mutex_exit(&p
->p_zone
->zone_nlwps_lock
);
748 ASSERT(t
->t_schedctl
== NULL
);
751 BROP(p
)->b_freelwp(ttolwp(t
));
753 if (t
->t_door
!= NULL
) {
754 kmem_free(t
->t_door
, sizeof (door_data_t
));
757 lwp_ctmpl_clear(ttolwp(t
));
760 * Remove the thread from the all threads list.
761 * We need to hold pidlock for this.
763 mutex_enter(&pidlock
);
764 t
->t_next
->t_prev
= t
->t_prev
;
765 t
->t_prev
->t_next
= t
->t_next
;
766 CL_EXIT(t
); /* tell the scheduler that we're exiting */
767 cv_broadcast(&t
->t_joincv
); /* tell anyone in thread_join */
768 mutex_exit(&pidlock
);
771 * Let the lgroup load averages know that this thread isn't
772 * going to show up (i.e. un-do what was done on behalf of
773 * this thread by the earlier lgrp_move_thread()).
776 lgrp_move_thread(t
, NULL
, 1);
780 * The thread was created TS_STOPPED.
781 * We change it to TS_FREE to avoid an
782 * ASSERT() panic in thread_free().
784 t
->t_state
= TS_FREE
;
790 extern struct as kas
;
793 * fork a kernel process.
796 newproc(void (*pc
)(), caddr_t arg
, id_t cid
, int pri
, struct contract
**ct
,
802 cont_process_t
*ctp
= NULL
;
805 ASSERT(cid
!= sysdccid
);
806 ASSERT(cid
!= syscid
|| ct
== NULL
);
807 if (CLASS_KERNEL(cid
)) {
808 rctl_alloc_gp_t
*init_gp
;
809 rctl_set_t
*init_set
;
813 if (getproc(&p
, pid
, GETPROC_KERNEL
) < 0)
817 * Release the holds on p_exec and p_execdir; these
818 * were acquired in getproc().
820 if (p
->p_execdir
!= NULL
)
821 VN_RELE(p
->p_execdir
);
822 if (p
->p_exec
!= NULL
)
824 p
->p_flag
|= SNOWAIT
;
828 init_set
= rctl_set_create();
829 init_gp
= rctl_set_init_prealloc(RCENTITY_PROCESS
);
832 * kernel processes do not inherit /proc tracing flags.
834 sigemptyset(&p
->p_sigmask
);
835 premptyset(&p
->p_fltmask
);
838 premptyset(&(up
->u_entrymask
));
839 premptyset(&(up
->u_exitmask
));
840 mutex_enter(&p
->p_lock
);
842 e
.rcep_t
= RCENTITY_PROCESS
;
843 p
->p_rctls
= rctl_set_init(RCENTITY_PROCESS
, p
, &e
, init_set
,
845 mutex_exit(&p
->p_lock
);
847 rctl_prealloc_destroy(init_gp
);
849 t
= lwp_kernel_create(p
, pc
, arg
, TS_STOPPED
, pri
);
851 rctl_alloc_gp_t
*init_gp
, *default_gp
;
852 rctl_set_t
*init_set
;
856 if (getproc(&p
, pid
, GETPROC_USER
) < 0)
859 * init creates a new task, distinct from the task
860 * containing kernel "processes".
862 tk
= task_create(0, p
->p_zone
);
863 mutex_enter(&tk
->tk_zone
->zone_nlwps_lock
);
864 tk
->tk_proj
->kpj_ntasks
++;
866 mutex_exit(&tk
->tk_zone
->zone_nlwps_lock
);
868 default_gp
= rctl_rlimit_set_prealloc(RLIM_NLIMITS
);
869 init_gp
= rctl_set_init_prealloc(RCENTITY_PROCESS
);
870 init_set
= rctl_set_create();
872 mutex_enter(&pidlock
);
873 mutex_enter(&p
->p_lock
);
874 tk_old
= p
->p_task
; /* switch to new task */
878 mutex_exit(&pidlock
);
880 mutex_enter(&tk_old
->tk_zone
->zone_nlwps_lock
);
882 mutex_exit(&tk_old
->tk_zone
->zone_nlwps_lock
);
885 e
.rcep_t
= RCENTITY_PROCESS
;
886 p
->p_rctls
= rctl_set_init(RCENTITY_PROCESS
, p
, &e
, init_set
,
888 rctlproc_default_init(p
, default_gp
);
889 mutex_exit(&p
->p_lock
);
892 rctl_prealloc_destroy(default_gp
);
893 rctl_prealloc_destroy(init_gp
);
895 if ((lwp
= lwp_create(pc
, arg
, 0, p
, TS_STOPPED
, pri
,
896 &curthread
->t_hold
, cid
, 1)) == NULL
) {
900 mutex_enter(&pidlock
);
901 disown_proc(p
->p_parent
, p
);
903 mutex_enter(&p
->p_lock
);
906 ASSERT(p
->p_pool
->pool_ref
> 0);
907 atomic_add_32(&p
->p_pool
->pool_ref
, -1);
908 mutex_exit(&p
->p_lock
);
911 mutex_exit(&pidlock
);
917 ctp
= contract_process_fork(sys_process_tmpl
, p
, curproc
,
921 *ct
= &ctp
->conp_contract
;
924 ASSERT3U(t
->t_tid
, ==, 1);
926 mutex_enter(&pidlock
);
927 pgjoin(p
, p
->p_parent
->p_pgidp
);
929 mutex_enter(&p
->p_lock
);
930 t
->t_proc_flag
&= ~TP_HOLDLWP
;
932 mutex_exit(&p
->p_lock
);
933 mutex_exit(&pidlock
);
938 * create a child proc struct.
941 getproc(proc_t
**cpp
, pid_t pid
, uint_t flags
)
955 if (zone_status_get(curproc
->p_zone
) >= ZONE_IS_SHUTTING_DOWN
)
956 return (-1); /* no point in starting new processes */
958 pp
= (flags
& GETPROC_KERNEL
) ? &p0
: curproc
;
960 proj
= task
->tk_proj
;
963 mutex_enter(&pp
->p_lock
);
964 mutex_enter(&zone
->zone_nlwps_lock
);
965 if (proj
!= proj0p
) {
966 if (task
->tk_nprocs
>= task
->tk_nprocs_ctl
)
967 if (rctl_test(rc_task_nprocs
, task
->tk_rctls
,
968 pp
, 1, 0) & RCT_DENY
)
971 if (proj
->kpj_nprocs
>= proj
->kpj_nprocs_ctl
)
972 if (rctl_test(rc_project_nprocs
, proj
->kpj_rctls
,
973 pp
, 1, 0) & RCT_DENY
)
976 if (zone
->zone_nprocs
>= zone
->zone_nprocs_ctl
)
977 if (rctl_test(rc_zone_nprocs
, zone
->zone_rctls
,
978 pp
, 1, 0) & RCT_DENY
)
982 mutex_exit(&zone
->zone_nlwps_lock
);
983 mutex_exit(&pp
->p_lock
);
984 atomic_inc_32(&zone
->zone_ffcap
);
991 mutex_exit(&zone
->zone_nlwps_lock
);
992 mutex_exit(&pp
->p_lock
);
994 cp
= kmem_cache_alloc(process_cache
, KM_SLEEP
);
995 bzero(cp
, sizeof (proc_t
));
998 * Make proc entry for child process
1000 mutex_init(&cp
->p_splock
, NULL
, MUTEX_DEFAULT
, NULL
);
1001 mutex_init(&cp
->p_crlock
, NULL
, MUTEX_DEFAULT
, NULL
);
1002 mutex_init(&cp
->p_pflock
, NULL
, MUTEX_DEFAULT
, NULL
);
1004 mutex_init(&cp
->p_ldtlock
, NULL
, MUTEX_DEFAULT
, NULL
);
1006 mutex_init(&cp
->p_maplock
, NULL
, MUTEX_DEFAULT
, NULL
);
1008 cp
->p_mstart
= gethrtime();
1011 * p_zone must be set before we call pid_allocate since the process
1012 * will be visible after that and code such as prfind_zone will
1013 * look at the p_zone field.
1015 cp
->p_zone
= pp
->p_zone
;
1016 cp
->p_t1_lgrpid
= LGRP_NONE
;
1017 cp
->p_tr_lgrpid
= LGRP_NONE
;
1019 if ((newpid
= pid_allocate(cp
, pid
, PID_ALLOC_PROC
)) == -1) {
1020 if (nproc
== v
.v_proc
) {
1021 CPU_STATS_ADDQ(CPU
, sys
, procovf
, 1);
1022 cmn_err(CE_WARN
, "out of processes");
1027 mutex_enter(&pp
->p_lock
);
1028 cp
->p_exec
= pp
->p_exec
;
1029 cp
->p_execdir
= pp
->p_execdir
;
1030 mutex_exit(&pp
->p_lock
);
1033 VN_HOLD(cp
->p_exec
);
1035 * Each VOP_OPEN() must be paired with a corresponding
1036 * VOP_CLOSE(). In this case, the executable will be
1037 * closed for the child in either proc_exit() or gexec().
1039 if (VOP_OPEN(&cp
->p_exec
, FREAD
, CRED(), NULL
) != 0) {
1040 VN_RELE(cp
->p_exec
);
1041 cp
->p_exec
= NULLVP
;
1042 cp
->p_execdir
= NULLVP
;
1047 VN_HOLD(cp
->p_execdir
);
1050 * If not privileged make sure that this user hasn't exceeded
1051 * v.v_maxup processes, and that users collectively haven't
1052 * exceeded v.v_maxupttl processes.
1054 mutex_enter(&pidlock
);
1055 ASSERT(nproc
< v
.v_proc
); /* otherwise how'd we get our pid? */
1057 ruid
= crgetruid(cr
);
1058 zoneid
= crgetzoneid(cr
);
1059 if (nproc
>= v
.v_maxup
&& /* short-circuit; usually false */
1060 (nproc
>= v
.v_maxupttl
||
1061 upcount_get(ruid
, zoneid
) >= v
.v_maxup
) &&
1062 secpolicy_newproc(cr
) != 0) {
1063 mutex_exit(&pidlock
);
1064 zcmn_err(zoneid
, CE_NOTE
,
1065 "out of per-user processes for uid %d", ruid
);
1070 * Everything is cool, put the new proc on the active process list.
1071 * It is already on the pid list and in /proc.
1072 * Increment the per uid process count (upcount).
1075 upcount_inc(ruid
, zoneid
);
1077 cp
->p_next
= practive
;
1078 practive
->p_prev
= cp
;
1081 cp
->p_ignore
= pp
->p_ignore
;
1082 cp
->p_siginfo
= pp
->p_siginfo
;
1083 cp
->p_flag
= pp
->p_flag
& (SJCTL
|SNOWAIT
|SNOCD
);
1084 cp
->p_sessp
= pp
->p_sessp
;
1086 cp
->p_brand
= pp
->p_brand
;
1087 if (PROC_IS_BRANDED(pp
))
1088 BROP(pp
)->b_copy_procdata(cp
, pp
);
1089 cp
->p_bssbase
= pp
->p_bssbase
;
1090 cp
->p_brkbase
= pp
->p_brkbase
;
1091 cp
->p_brksize
= pp
->p_brksize
;
1092 cp
->p_brkpageszc
= pp
->p_brkpageszc
;
1093 cp
->p_stksize
= pp
->p_stksize
;
1094 cp
->p_stkpageszc
= pp
->p_stkpageszc
;
1095 cp
->p_stkprot
= pp
->p_stkprot
;
1096 cp
->p_datprot
= pp
->p_datprot
;
1097 cp
->p_usrstack
= pp
->p_usrstack
;
1098 cp
->p_model
= pp
->p_model
;
1099 cp
->p_ppid
= pp
->p_pid
;
1100 cp
->p_ancpid
= pp
->p_pid
;
1101 cp
->p_portcnt
= pp
->p_portcnt
;
1103 * Security flags are preserved on fork; the inherited copy comes into
1106 cp
->p_secflags
= pp
->p_secflags
;
1109 * Initialize watchpoint structures
1111 avl_create(&cp
->p_warea
, wa_compare
, sizeof (struct watched_area
),
1112 offsetof(struct watched_area
, wa_link
));
1115 * Initialize immediate resource control values.
1117 cp
->p_stk_ctl
= pp
->p_stk_ctl
;
1118 cp
->p_fsz_ctl
= pp
->p_fsz_ctl
;
1119 cp
->p_vmem_ctl
= pp
->p_vmem_ctl
;
1120 cp
->p_fno_ctl
= pp
->p_fno_ctl
;
1123 * Link up to parent-child-sibling chain. No need to lock
1124 * in general since only a call to freeproc() (done by the
1125 * same parent as newproc()) diddles with the child chain.
1127 cp
->p_sibling
= pp
->p_child
;
1129 pp
->p_child
->p_psibling
= cp
;
1134 cp
->p_child_ns
= NULL
;
1135 cp
->p_sibling_ns
= NULL
;
1137 cp
->p_nextorph
= pp
->p_orphan
;
1138 cp
->p_nextofkin
= pp
;
1142 * Inherit profiling state; do not inherit REALPROF profiling state.
1144 cp
->p_prof
= pp
->p_prof
;
1145 cp
->p_rprof_cyclic
= CYCLIC_NONE
;
1148 * Inherit pool pointer from the parent. Kernel processes are
1149 * always bound to the default pool.
1151 mutex_enter(&pp
->p_lock
);
1152 if (flags
& GETPROC_KERNEL
) {
1153 cp
->p_pool
= pool_default
;
1156 cp
->p_pool
= pp
->p_pool
;
1158 atomic_inc_32(&cp
->p_pool
->pool_ref
);
1159 mutex_exit(&pp
->p_lock
);
1162 * Add the child process to the current task. Kernel processes
1163 * are always attached to task0.
1165 mutex_enter(&cp
->p_lock
);
1166 if (flags
& GETPROC_KERNEL
)
1167 task_attach(task0p
, cp
);
1169 task_attach(pp
->p_task
, cp
);
1170 mutex_exit(&cp
->p_lock
);
1171 mutex_exit(&pidlock
);
1173 avl_create(&cp
->p_ct_held
, contract_compar
, sizeof (contract_t
),
1174 offsetof(contract_t
, ct_ctlist
));
1177 * Duplicate any audit information kept in the process table
1179 if (audit_active
) /* copy audit data to cp */
1182 crhold(cp
->p_cred
= cr
);
1185 * Bump up the counts on the file structures pointed at by the
1186 * parent's file table since the child will point at them too.
1188 fcnt_add(P_FINFO(pp
), 1);
1190 if (PTOU(pp
)->u_cdir
) {
1191 VN_HOLD(PTOU(pp
)->u_cdir
);
1195 * We must be at or before vfs_mountroot(); it will take care of
1196 * assigning our current directory.
1199 if (PTOU(pp
)->u_rdir
)
1200 VN_HOLD(PTOU(pp
)->u_rdir
);
1201 if (PTOU(pp
)->u_cwd
)
1202 refstr_hold(PTOU(pp
)->u_cwd
);
1205 * copy the parent's uarea.
1208 bcopy(PTOU(pp
), uarea
, sizeof (*uarea
));
1209 flist_fork(P_FINFO(pp
), P_FINFO(cp
));
1211 gethrestime(&uarea
->u_start
);
1212 uarea
->u_ticks
= ddi_get_lbolt();
1213 uarea
->u_mem
= rm_asrss(pp
->p_as
);
1214 uarea
->u_acflag
= AFORK
;
1217 * If inherit-on-fork, copy /proc tracing flags to child.
1219 if ((pp
->p_proc_flag
& P_PR_FORK
) != 0) {
1220 cp
->p_proc_flag
|= pp
->p_proc_flag
& (P_PR_TRACE
|P_PR_FORK
);
1221 cp
->p_sigmask
= pp
->p_sigmask
;
1222 cp
->p_fltmask
= pp
->p_fltmask
;
1224 sigemptyset(&cp
->p_sigmask
);
1225 premptyset(&cp
->p_fltmask
);
1226 uarea
->u_systrap
= 0;
1227 premptyset(&uarea
->u_entrymask
);
1228 premptyset(&uarea
->u_exitmask
);
1231 * If microstate accounting is being inherited, mark child
1233 if ((pp
->p_flag
& SMSFORK
) != 0)
1234 cp
->p_flag
|= pp
->p_flag
& (SMSFORK
|SMSACCT
);
1237 * Inherit fixalignment flag from the parent
1239 cp
->p_fixalignment
= pp
->p_fixalignment
;
1245 ASSERT(MUTEX_NOT_HELD(&pidlock
));
1247 mutex_destroy(&cp
->p_crlock
);
1248 mutex_destroy(&cp
->p_pflock
);
1250 mutex_destroy(&cp
->p_ldtlock
);
1253 proc_entry_free(cp
->p_pidp
);
1254 (void) pid_rele(cp
->p_pidp
);
1256 kmem_cache_free(process_cache
, cp
);
1258 mutex_enter(&zone
->zone_nlwps_lock
);
1261 zone
->zone_nprocs
--;
1262 mutex_exit(&zone
->zone_nlwps_lock
);
1263 atomic_inc_32(&zone
->zone_ffnoproc
);
1267 * We most likely got into this situation because some process is
1268 * forking out of control. As punishment, put it to sleep for a
1269 * bit so it can't eat the machine alive. Sleep interval is chosen
1270 * to allow no more than one fork failure per cpu per clock tick
1271 * on average (yes, I just made this up). This has two desirable
1272 * properties: (1) it sets a constant limit on the fork failure
1273 * rate, and (2) the busier the system is, the harsher the penalty
1274 * for abusing it becomes.
1276 INCR_COUNT(&fork_fail_pending
, &pidlock
);
1277 delay(fork_fail_pending
/ ncpus
+ 1);
1278 DECR_COUNT(&fork_fail_pending
, &pidlock
);
1280 return (-1); /* out of memory or proc slots */
1284 * Release virtual memory.
1285 * In the case of vfork(), the child was given exclusive access to its
1286 * parent's address space. The parent is waiting in vfwait() for the
1287 * child to release its exclusive claim via relvm().
1292 proc_t
*p
= curproc
;
1294 ASSERT((unsigned)p
->p_lwpcnt
<= 1);
1296 prrelvm(); /* inform /proc */
1298 if (p
->p_flag
& SVFORK
) {
1299 proc_t
*pp
= p
->p_parent
;
1301 * The child process is either exec'ing or exit'ing.
1302 * The child is now separated from the parent's address
1303 * space. The parent process is made dispatchable.
1305 * This is a delicate locking maneuver, involving
1306 * both the parent's p_lock and the child's p_lock.
1307 * As soon as the SVFORK flag is turned off, the
1308 * parent is free to run, but it must not run until
1309 * we wake it up using its p_cv because it might
1310 * exit and we would be referencing invalid memory.
1311 * Therefore, we hold the parent with its p_lock
1312 * while protecting our p_flags with our own p_lock.
1315 mutex_enter(&p
->p_lock
); /* grab child's lock first */
1316 prbarrier(p
); /* make sure /proc is blocked out */
1317 mutex_enter(&pp
->p_lock
);
1320 * Check if parent is locked by /proc.
1322 if (pp
->p_proc_flag
& P_PR_LOCK
) {
1324 * Delay until /proc is done with the parent.
1325 * We must drop our (the child's) p->p_lock, wait
1326 * via prbarrier() on the parent, then start over.
1328 mutex_exit(&p
->p_lock
);
1330 mutex_exit(&pp
->p_lock
);
1333 p
->p_flag
&= ~SVFORK
;
1338 * notify hat of change in thread's address space
1340 hat_thread_exit(curthread
);
1344 * child sizes are copied back to parent because
1345 * child may have grown.
1347 pp
->p_brkbase
= p
->p_brkbase
;
1348 pp
->p_brksize
= p
->p_brksize
;
1349 pp
->p_stksize
= p
->p_stksize
;
1352 * Copy back the shm accounting information
1353 * to the parent process.
1355 pp
->p_segacct
= p
->p_segacct
;
1356 p
->p_segacct
= NULL
;
1359 * The parent is no longer waiting for the vfork()d child.
1360 * Restore the parent's watched pages, if any. This is
1361 * safe because we know the parent is not locked by /proc
1363 pp
->p_flag
&= ~SVFWAIT
;
1364 if (avl_numnodes(&pp
->p_wpage
) != 0) {
1365 pp
->p_as
->a_wpage
= pp
->p_wpage
;
1366 avl_create(&pp
->p_wpage
, wp_compare
,
1367 sizeof (struct watched_page
),
1368 offsetof(struct watched_page
, wp_link
));
1370 cv_signal(&pp
->p_cv
);
1371 mutex_exit(&pp
->p_lock
);
1372 mutex_exit(&p
->p_lock
);
1374 if (p
->p_as
!= &kas
) {
1381 * We grab p_lock for the benefit of /proc
1384 mutex_enter(&p
->p_lock
);
1385 prbarrier(p
); /* make sure /proc is blocked out */
1388 mutex_exit(&p
->p_lock
);
1391 * notify hat of change in thread's address space
1393 hat_thread_exit(curthread
);
1397 p
->p_tr_lgrpid
= LGRP_NONE
;
1403 * Wait for child to exec or exit.
1404 * Called by parent of vfork'ed process.
1405 * See important comments in relvm(), above.
1411 proc_t
*pp
= ttoproc(curthread
);
1415 * Wait for child to exec or exit.
1418 mutex_enter(&pidlock
);
1420 if (cp
== NULL
|| cp
->p_parent
!= pp
) {
1422 * Child has exit()ed.
1424 mutex_exit(&pidlock
);
1428 * Grab the child's p_lock before releasing pidlock.
1429 * Otherwise, the child could exit and we would be
1430 * referencing invalid memory.
1432 mutex_enter(&cp
->p_lock
);
1433 mutex_exit(&pidlock
);
1434 if (!(cp
->p_flag
& SVFORK
)) {
1436 * Child has exec()ed or is exit()ing.
1438 mutex_exit(&cp
->p_lock
);
1441 mutex_enter(&pp
->p_lock
);
1442 mutex_exit(&cp
->p_lock
);
1444 * We might be woken up spuriously from the cv_wait().
1445 * We have to do the whole operation over again to be
1446 * sure the child's SVFORK flag really is turned off.
1447 * We cannot make reference to the child because it can
1448 * exit before we return and we would be referencing
1451 * Because this is potentially a very long-term wait,
1452 * we call cv_wait_sig() (for its jobcontrol and /proc
1453 * side-effects) unless there is a current signal, in
1454 * which case we use cv_wait() because we cannot return
1455 * from this function until the child has released the
1456 * address space. Calling cv_wait_sig() with a current
1457 * signal would lead to an indefinite loop here because
1458 * cv_wait_sig() returns immediately in this case.
1461 cv_wait(&pp
->p_cv
, &pp
->p_lock
);
1463 signalled
= !cv_wait_sig(&pp
->p_cv
, &pp
->p_lock
);
1464 mutex_exit(&pp
->p_lock
);
1467 /* restore watchpoints to parent */
1468 if (pr_watch_active(pp
)) {
1469 struct as
*as
= pp
->p_as
;
1470 AS_LOCK_ENTER(as
, RW_WRITER
);
1475 mutex_enter(&pp
->p_lock
);
1476 prbarrier(pp
); /* barrier against /proc locking */
1478 mutex_exit(&pp
->p_lock
);