/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>
/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */
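/*
 * Illustrative sketch (not part of the original source): a caller that
 * needs all four locks would acquire them in the order given above, as
 * the ASSERTs in cpupart_bind_thread() below effectively require:
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);
 *	...
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */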
/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp
 * facility provides a temporary space to hold lpl information during
 * system bootstrap.
 */
cpupart_t		cp_default;
cpupart_t		*cp_list_head;
static cpupartid_t	cp_id_next;
uint_t			cp_numparts;
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t		cp_max_numparts;
/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
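/*
 * Example (illustrative only): the two macros are inverses of each other
 * over the visible ID space, and map the "no pset" sentinel to the
 * default partition:
 *
 *	PSTOCP(PS_NONE) == CP_DEFAULT
 *	CPTOPS(CP_DEFAULT) == PS_NONE
 *	CPTOPS(PSTOCP(psid)) == psid		(for psid != PS_NONE)
 */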
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}
/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}
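/*
 * Illustrative caller pattern (not part of the original source): both
 * lookup routines assume cpu_lock is held across the lookup and across
 * any subsequent use of the returned partition:
 *
 *	mutex_enter(&cpu_lock);
 *	pp = cpupart_find(psid);
 *	if (pp != NULL)
 *		... use pp while cpu_lock is still held ...
 *	mutex_exit(&cpu_lock);
 */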
static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}
static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}
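/*
 * Observability note (illustrative, not part of the original source):
 * the kstats created above surface per-pset load data to userland and
 * can typically be read with kstat(1M), e.g. "kstat -m unix -n pset";
 * the exact invocation may vary by release.
 */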
/*
 * Initialize the cpupart's lgrp partitions (lpls)
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration)...hence the "oversizing"
		 * of the rset array.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}
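/*
 * Illustrative sketch (not part of the original source): because each
 * lpl_rset is oversized by one and zero-filled above, a consumer can
 * walk it without knowing the count, stopping at the NULL terminator:
 *
 *	struct lgrp_ld **ldp;
 *	for (ldp = lpl->lpl_rset; *ldp != NULL; ldp++)
 *		... visit *ldp ...
 */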
/*
 * Teardown the cpupart's lgrp partitions
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}
/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates the CPU0 cpu_lpl pointer to
	 *	 point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}
/*
 * Move a CPU into a new partition.
 */
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set.  Note that no threads should be bound to this
		 * CPU since cpupart_move_thread() will refuse to move bound
		 * threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}
	/*
	 * If the forced flag is set, unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);
again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}
	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}
	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);
	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	cpu_lpl = cp->cpu_lpl;

	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
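	/*
	 * Illustrative note (not part of the original source): the
	 * cpu_next_part/cpu_prev_part links maintained above form a
	 * circular doubly-linked ring rooted at cp_cpulist, so a full
	 * traversal of the new partition looks like:
	 *
	 *	cpu_t *c = newpp->cp_cpulist;
	 *	do {
	 *		... visit c ...
	 *		c = c->cpu_next_part;
	 *	} while (c != newpp->cp_cpulist);
	 */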
	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */
		for (p = practive; p != NULL; p = p->p_next) {
			t = p->p_tlist;
			if (t == NULL)
				continue;
			lgrp_diff_lpl = 0;

			do {
				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different
				 * lpl.
				 */
				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */
				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */
			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}
		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */
		t = curthread;
		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);
		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}
/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}
/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp,
			    &tp->t_cpupart->cp_lgrploads[tp->t_lpl->lpl_lgrpid],
			    1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}
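/*
 * Illustrative caller sketch (not part of the original source): callers
 * such as cpupart_bind_thread() pre-allocate the FSS buffers and take
 * the locks required by the ASSERTs above before moving a thread:
 *
 *	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 *	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&ttoproc(tp)->p_lock);
 *	error = cpupart_move_thread(tp, newpp, 0, projbuf, zonebuf);
 *	... drop the locks and fss_freebuf() the unused buffers ...
 */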
/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL)
			return (EINVAL);
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}
/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
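/*
 * Illustrative sketch (not part of the original source): a minimal
 * create/populate sequence as a pset syscall might drive it, assuming
 * pool_lock() is already held:
 *
 *	psetid_t psid;
 *	if ((error = cpupart_create(&psid)) == 0) {
 *		mutex_enter(&cpu_lock);
 *		error = cpupart_attach_cpu(psid, cp, 0);
 *		mutex_exit(&cpu_lock);
 *	}
 */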
/*
 * Move threads from the specified partition to cp_default.  If
 * `unbind_all' is set, move all threads, otherwise move only soft-bound
 * threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;
	int err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Can only unbind threads which have revocable binding
			 * unless force unbinding requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}
/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}
/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}
/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}
/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}
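/*
 * Illustrative two-call pattern (not part of the original source):
 * query the size first, then fetch the list:
 *
 *	uint_t ncpus;
 *	(void) cpupart_get_cpus(&psid, NULL, &ncpus);
 *	cpulist = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, cpulist, &ncpus);
 *
 * Since cpu_lock is dropped between the calls, the count may change;
 * *numcpus reports the real number even when the list is truncated.
 */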
/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}
/*
 * Return the high-precision load averages for the specified processor set.
 */
int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}
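/*
 * Worked example (illustrative, not part of the original source):
 * cp_hp_avenrun holds load averages as 16-bit binary fractions; the
 * shift by (16 - FSHIFT) rescales them to the FSHIFT-bit fixed-point
 * format userland expects.  E.g. with FSHIFT == 8, an average of 1.5
 * stored as 0x18000 (1.5 * 2^16) is returned as 0x180 (1.5 * 2^8).
 */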
uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}
int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * The PSET_NOESCAPE attribute for the default cpu partition is
	 * always set.
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}
int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}