/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/fs_subr.h>
58 * Page handling structures. This is set up as a list of per-page
59 * control structures (sc_page_ctl), with p->p_pagep pointing to
60 * the first. The per-page structures point to the actual pages
61 * and contain pointers to the user address for each mapped page.
63 * All data is protected by p->p_sc_lock. Since this lock is
64 * held while waiting for memory, schedctl_shared_alloc() should
65 * not be called while holding p_lock.
68 typedef struct sc_page_ctl
{
69 struct sc_page_ctl
*spc_next
;
70 sc_shared_t
*spc_base
; /* base of kernel page */
71 sc_shared_t
*spc_end
; /* end of usable space */
72 ulong_t
*spc_map
; /* bitmap of allocated space on page */
73 size_t spc_space
; /* amount of space on page */
74 caddr_t spc_uaddr
; /* user-level address of the page */
75 struct anon_map
*spc_amp
; /* anonymous memory structure */
78 static size_t sc_pagesize
; /* size of usable space on page */
79 static size_t sc_bitmap_len
; /* # of bits in allocation bitmap */
80 static size_t sc_bitmap_words
; /* # of words in allocation bitmap */
83 static void schedctl_save(sc_shared_t
*);
84 static void schedctl_restore(sc_shared_t
*);
85 static void schedctl_fork(kthread_t
*, kthread_t
*);
87 /* Functions for handling shared pages */
88 static int schedctl_shared_alloc(sc_shared_t
**, uintptr_t *);
89 static sc_page_ctl_t
*schedctl_page_lookup(sc_shared_t
*);
90 static int schedctl_map(struct anon_map
*, caddr_t
*, caddr_t
);
91 static int schedctl_getpage(struct anon_map
**, caddr_t
*);
92 static void schedctl_freepage(struct anon_map
*, caddr_t
);
95 * System call interface to scheduler activations.
96 * This always operates on the current lwp.
101 kthread_t
*t
= curthread
;
106 if (t
->t_schedctl
== NULL
) {
108 * Allocate and initialize the shared structure.
110 if ((error
= schedctl_shared_alloc(&ssp
, &uaddr
)) != 0)
111 return ((caddr_t
)(uintptr_t)set_errno(error
));
112 bzero(ssp
, sizeof (*ssp
));
114 installctx(t
, ssp
, schedctl_save
, schedctl_restore
,
115 schedctl_fork
, NULL
, NULL
, NULL
);
117 thread_lock(t
); /* protect against ts_tick and ts_update */
119 t
->t_sc_uaddr
= uaddr
;
120 ssp
->sc_cid
= t
->t_cid
;
121 ssp
->sc_cpri
= t
->t_cpri
;
122 ssp
->sc_priority
= DISP_PRIO(t
);
126 return ((caddr_t
)t
->t_sc_uaddr
);
131 * Clean up scheduler activations state associated with an exiting
132 * (or execing) lwp. t is always the current thread.
135 schedctl_lwp_cleanup(kthread_t
*t
)
137 sc_shared_t
*ssp
= t
->t_schedctl
;
138 proc_t
*p
= ttoproc(t
);
139 sc_page_ctl_t
*pagep
;
142 ASSERT(MUTEX_NOT_HELD(&p
->p_lock
));
144 thread_lock(t
); /* protect against ts_tick and ts_update */
145 t
->t_schedctl
= NULL
;
150 * Remove the context op to avoid the final call to
151 * schedctl_save when switching away from this lwp.
153 (void) removectx(t
, ssp
, schedctl_save
, schedctl_restore
,
154 schedctl_fork
, NULL
, NULL
, NULL
);
157 * Do not unmap the shared page until the process exits.
158 * User-level library code relies on this for adaptive mutex locking.
160 mutex_enter(&p
->p_sc_lock
);
161 ssp
->sc_state
= SC_FREE
;
162 pagep
= schedctl_page_lookup(ssp
);
163 index
= (index_t
)(ssp
- pagep
->spc_base
);
164 BT_CLEAR(pagep
->spc_map
, index
);
165 pagep
->spc_space
+= sizeof (sc_shared_t
);
166 mutex_exit(&p
->p_sc_lock
);
171 * Cleanup the list of schedctl shared pages for the process.
172 * Called from exec() and exit() system calls.
175 schedctl_proc_cleanup(void)
178 sc_page_ctl_t
*pagep
;
181 ASSERT(p
->p_lwpcnt
== 1); /* we are single-threaded now */
182 ASSERT(curthread
->t_schedctl
== NULL
);
185 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
189 while (pagep
!= NULL
) {
190 ASSERT(pagep
->spc_space
== sc_pagesize
);
191 next
= pagep
->spc_next
;
193 * Unmap the user space and free the mapping structure.
195 (void) as_unmap(p
->p_as
, pagep
->spc_uaddr
, PAGESIZE
);
196 schedctl_freepage(pagep
->spc_amp
, (caddr_t
)(pagep
->spc_base
));
197 kmem_free(pagep
->spc_map
, sizeof (ulong_t
) * sc_bitmap_words
);
198 kmem_free(pagep
, sizeof (sc_page_ctl_t
));
205 * Called by resume just before switching away from the current thread.
206 * Save new thread state.
209 schedctl_save(sc_shared_t
*ssp
)
211 ssp
->sc_state
= curthread
->t_state
;
216 * Called by resume after switching to the current thread.
217 * Save new thread state and CPU.
220 schedctl_restore(sc_shared_t
*ssp
)
222 ssp
->sc_state
= SC_ONPROC
;
223 ssp
->sc_cpu
= CPU
->cpu_id
;
228 * On fork, remove inherited mappings from the child's address space.
229 * The child's threads must call schedctl() to get new shared mappings.
232 schedctl_fork(kthread_t
*pt
, kthread_t
*ct
)
234 proc_t
*pp
= ttoproc(pt
);
235 proc_t
*cp
= ttoproc(ct
);
236 sc_page_ctl_t
*pagep
;
238 ASSERT(ct
->t_schedctl
== NULL
);
241 * Do this only once, whether we are doing fork1() or forkall().
242 * Don't do it at all if the child process is a child of vfork()
243 * because a child of vfork() borrows the parent's address space.
245 if (pt
!= curthread
|| (cp
->p_flag
& SVFORK
))
248 mutex_enter(&pp
->p_sc_lock
);
249 for (pagep
= pp
->p_pagep
; pagep
!= NULL
; pagep
= pagep
->spc_next
)
250 (void) as_unmap(cp
->p_as
, pagep
->spc_uaddr
, PAGESIZE
);
251 mutex_exit(&pp
->p_sc_lock
);
256 * Returns non-zero if the specified thread shouldn't be preempted at this time.
257 * Called by ts_preempt(), ts_tick(), and ts_update().
260 schedctl_get_nopreempt(kthread_t
*t
)
262 ASSERT(THREAD_LOCK_HELD(t
));
263 return (t
->t_schedctl
->sc_preemptctl
.sc_nopreempt
);
268 * Sets the value of the nopreempt field for the specified thread.
269 * Called by ts_preempt() to clear the field on preemption.
272 schedctl_set_nopreempt(kthread_t
*t
, short val
)
274 ASSERT(THREAD_LOCK_HELD(t
));
275 t
->t_schedctl
->sc_preemptctl
.sc_nopreempt
= val
;
280 * Sets the value of the yield field for the specified thread.
281 * Called by ts_preempt() and ts_tick() to set the field, and
282 * ts_yield() to clear it.
283 * The kernel never looks at this field so we don't need a
284 * schedctl_get_yield() function.
287 schedctl_set_yield(kthread_t
*t
, short val
)
289 ASSERT(THREAD_LOCK_HELD(t
));
290 t
->t_schedctl
->sc_preemptctl
.sc_yield
= val
;
295 * Sets the values of the cid and priority fields for the specified thread.
296 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
297 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
300 schedctl_set_cidpri(kthread_t
*t
)
302 sc_shared_t
*tdp
= t
->t_schedctl
;
305 tdp
->sc_cid
= t
->t_cid
;
306 tdp
->sc_cpri
= t
->t_cpri
;
307 tdp
->sc_priority
= DISP_PRIO(t
);
313 * Returns non-zero if the specified thread has requested that all
314 * signals be blocked. Called by signal-related code that tests
315 * the signal mask of a thread that may not be the current thread
316 * and where the process's p_lock cannot be acquired.
319 schedctl_sigblock(kthread_t
*t
)
321 sc_shared_t
*tdp
= t
->t_schedctl
;
324 return (tdp
->sc_sigblock
);
330 * If the sc_sigblock field is set for the specified thread, set
331 * its signal mask to block all maskable signals, then clear the
332 * sc_sigblock field. This finishes what user-level code requested
333 * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
334 * Called from signal-related code either by the current thread for
335 * itself or by a thread that holds the process's p_lock (/proc code).
338 schedctl_finish_sigblock(kthread_t
*t
)
340 sc_shared_t
*tdp
= t
->t_schedctl
;
342 ASSERT(t
== curthread
|| MUTEX_HELD(&ttoproc(t
)->p_lock
));
344 if (tdp
!= NULL
&& tdp
->sc_sigblock
) {
345 t
->t_hold
.__sigbits
[0] = FILLSET0
& ~CANTMASK0
;
346 t
->t_hold
.__sigbits
[1] = FILLSET1
& ~CANTMASK1
;
347 t
->t_hold
.__sigbits
[2] = FILLSET2
& ~CANTMASK2
;
348 tdp
->sc_sigblock
= 0;
354 * Return non-zero if the current thread has declared that it has
355 * a cancellation pending and that cancellation is not disabled.
356 * If SIGCANCEL is blocked, we must be going over the wire in an
357 * NFS transaction (sigintr() was called); return zero in this case.
360 schedctl_cancel_pending(void)
362 sc_shared_t
*tdp
= curthread
->t_schedctl
;
365 (tdp
->sc_flgs
& SC_CANCEL_FLG
) &&
367 !sigismember(&curthread
->t_hold
, SIGCANCEL
))
374 * Inform libc that the kernel returned EINTR from some system call
375 * due to there being a cancellation pending (SC_CANCEL_FLG set or
376 * we received an SI_LWP SIGCANCEL while in a system call), rather
377 * than because of some other signal. User-level code can try to
378 * recover from receiving other signals, but it can't recover from
382 schedctl_cancel_eintr(void)
384 sc_shared_t
*tdp
= curthread
->t_schedctl
;
387 tdp
->sc_flgs
|= SC_EINTR_FLG
;
392 * Return non-zero if the current thread has declared that
393 * it is calling into the kernel to park, else return zero.
396 schedctl_is_park(void)
398 sc_shared_t
*tdp
= curthread
->t_schedctl
;
401 return ((tdp
->sc_flgs
& SC_PARK_FLG
) != 0);
403 * If we're here and there is no shared memory (how could
404 * that happen?) then just assume we really are here to park.
411 * Declare thread is parking.
413 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
414 * in order to declare that the thread is calling into the kernel to park.
416 * This interface exists ONLY to support older versions of libthread which
417 * are not aware of the SC_PARK_FLG flag.
419 * Older versions of libthread which are not aware of the SC_PARK_FLG flag
420 * need to be modified or emulated to call lwpsys_park(4, ...) instead of
421 * lwpsys_park(0, ...). This will invoke schedctl_set_park() before
422 * lwp_park() to declare that the thread is parking.
425 schedctl_set_park(void)
427 sc_shared_t
*tdp
= curthread
->t_schedctl
;
429 tdp
->sc_flgs
|= SC_PARK_FLG
;
434 * Clear the parking flag on return from parking in the kernel.
437 schedctl_unpark(void)
439 sc_shared_t
*tdp
= curthread
->t_schedctl
;
442 tdp
->sc_flgs
&= ~SC_PARK_FLG
;
447 * Page handling code.
454 * Amount of page that can hold sc_shared_t structures. If
455 * sizeof (sc_shared_t) is a power of 2, this should just be
458 sc_pagesize
= PAGESIZE
- (PAGESIZE
% sizeof (sc_shared_t
));
461 * Allocation bitmap is one bit per struct on a page.
463 sc_bitmap_len
= sc_pagesize
/ sizeof (sc_shared_t
);
464 sc_bitmap_words
= howmany(sc_bitmap_len
, BT_NBIPUL
);
469 schedctl_shared_alloc(sc_shared_t
**kaddrp
, uintptr_t *uaddrp
)
472 sc_page_ctl_t
*pagep
;
478 ASSERT(MUTEX_NOT_HELD(&p
->p_lock
));
479 mutex_enter(&p
->p_sc_lock
);
482 * Try to find space for the new data in existing pages
483 * within the process's list of shared pages.
485 for (pagep
= p
->p_pagep
; pagep
!= NULL
; pagep
= pagep
->spc_next
)
486 if (pagep
->spc_space
!= 0)
490 base
= pagep
->spc_uaddr
;
492 struct anon_map
*amp
;
496 * No room, need to allocate a new page. Also set up
497 * a mapping to the kernel address space for the new
498 * page and lock it in memory.
500 if ((error
= schedctl_getpage(&
, &kaddr
)) != 0) {
501 mutex_exit(&p
->p_sc_lock
);
504 if ((error
= schedctl_map(amp
, &base
, kaddr
)) != 0) {
505 schedctl_freepage(amp
, kaddr
);
506 mutex_exit(&p
->p_sc_lock
);
511 * Allocate and initialize the page control structure.
513 pagep
= kmem_alloc(sizeof (sc_page_ctl_t
), KM_SLEEP
);
514 pagep
->spc_amp
= amp
;
515 pagep
->spc_base
= (sc_shared_t
*)kaddr
;
516 pagep
->spc_end
= (sc_shared_t
*)(kaddr
+ sc_pagesize
);
517 pagep
->spc_uaddr
= base
;
519 pagep
->spc_map
= kmem_zalloc(sizeof (ulong_t
) * sc_bitmap_words
,
521 pagep
->spc_space
= sc_pagesize
;
523 pagep
->spc_next
= p
->p_pagep
;
528 * Got a page, now allocate space for the data. There should
529 * be space unless something's wrong.
531 ASSERT(pagep
!= NULL
&& pagep
->spc_space
>= sizeof (sc_shared_t
));
532 index
= bt_availbit(pagep
->spc_map
, sc_bitmap_len
);
536 * Get location with pointer arithmetic. spc_base is of type
537 * sc_shared_t *. Mark as allocated.
539 ssp
= pagep
->spc_base
+ index
;
540 BT_SET(pagep
->spc_map
, index
);
541 pagep
->spc_space
-= sizeof (sc_shared_t
);
543 mutex_exit(&p
->p_sc_lock
);
546 * Return kernel and user addresses.
549 *uaddrp
= (uintptr_t)base
+ ((uintptr_t)ssp
& PAGEOFFSET
);
555 * Find the page control structure corresponding to a kernel address.
557 static sc_page_ctl_t
*
558 schedctl_page_lookup(sc_shared_t
*ssp
)
561 sc_page_ctl_t
*pagep
;
563 ASSERT(MUTEX_HELD(&p
->p_sc_lock
));
564 for (pagep
= p
->p_pagep
; pagep
!= NULL
; pagep
= pagep
->spc_next
) {
565 if (ssp
>= pagep
->spc_base
&& ssp
< pagep
->spc_end
)
568 return (NULL
); /* This "can't happen". Should we panic? */
573 * This function is called when a page needs to be mapped into a
574 * process's address space. Allocate the user address space and
575 * set up the mapping to the page. Assumes the page has already
576 * been allocated and locked in memory via schedctl_getpage.
579 schedctl_map(struct anon_map
*amp
, caddr_t
*uaddrp
, caddr_t kaddr
)
582 struct as
*as
= curproc
->p_as
;
583 struct segvn_crargs vn_a
;
587 /* pass address of kernel mapping as offset to avoid VAC conflicts */
588 map_addr(&addr
, PAGESIZE
, (offset_t
)(uintptr_t)kaddr
, 1, 0);
595 * Use segvn to set up the mapping to the page.
600 vn_a
.type
= MAP_SHARED
;
601 vn_a
.prot
= vn_a
.maxprot
= PROT_ALL
;
605 vn_a
.lgrp_mem_policy_flags
= 0;
606 error
= as_map(as
, addr
, PAGESIZE
, segvn_create
, &vn_a
);
618 * Allocate a new page from anonymous memory. Also, create a kernel
619 * mapping to the page and lock the page in memory.
622 schedctl_getpage(struct anon_map
**newamp
, caddr_t
*newaddr
)
624 struct anon_map
*amp
;
628 * Set up anonymous memory struct. No swap reservation is
629 * needed since the page will be locked into memory.
631 amp
= anonmap_alloc(PAGESIZE
, 0, ANON_SLEEP
);
636 kaddr
= segkp_get_withanonmap(segkp
, PAGESIZE
,
637 KPD_NO_ANON
| KPD_LOCKED
| KPD_ZERO
, amp
);
645 * The page is left SE_SHARED locked so that it won't be
646 * paged out or relocated (KPD_LOCKED above).
656 * Take the necessary steps to allow a page to be released.
657 * This is called when the process is doing exit() or exec().
658 * There should be no accesses to the page after this.
659 * The kernel mapping of the page is released and the page is unlocked.
662 schedctl_freepage(struct anon_map
*amp
, caddr_t kaddr
)
665 * Release the lock on the page and remove the kernel mapping.
667 ANON_LOCK_ENTER(&
->a_rwlock
, RW_WRITER
);
668 segkp_release(segkp
, kaddr
);
671 * Decrement the refcnt so the anon_map structure will be freed.
673 if (--amp
->refcnt
== 0) {
675 * The current process no longer has the page mapped, so
676 * we have to free everything rather than letting as_free
680 anon_free(amp
->ahp
, 0, PAGESIZE
);
681 ANON_LOCK_EXIT(&
->a_rwlock
);
684 ANON_LOCK_EXIT(&
->a_rwlock
);