/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/group.h>
#include <sys/bitset.h>
#include <sys/cpu_pm.h>
/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child, and sibling hierarchy relationships.
 * A parent PG always contains a superset of its children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup... but for root homed threads,
 * it is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
 */
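/*
 * For illustration only, a hypothetical two-socket topology (the CPU counts
 * are made up) might yield a hierarchy such as:
 *
 *          chip0 (8 CPUs)              chip1 (8 CPUs)     <- top level PGs
 *            |                           |
 *          cache0 (8 CPUs)             cache1 (8 CPUs)
 *          /       \                   /       \
 *     cores 0-3   cores 4-7       cores 8-11  cores 12-15 <- leaf PGs
 *
 * A rough sketch of the top-down traversal the dispatcher performs over such
 * a hierarchy (illustrative pseudocode only; least_loaded() is a hypothetical
 * helper, not part of this file):
 *
 *	pg = least_loaded(top_level_pgs);
 *	while (pg->cmt_children != NULL)
 *		pg = least_loaded(pg->cmt_children);
 *	place the thread on an idle CPU within pg
 */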
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */
/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;
/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;
/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
				    cpu_pg_t *);
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};
/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    curthread);
}
/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}
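/*
 * Example (illustrative only): a caller deciding whether a thread can be
 * moved without losing cache warmth might test:
 *
 *	if (pg_cmt_can_migrate(cp, other_cp))
 *		... treat the migration as free of CMT penalty ...
 *
 * where "other_cp" is a hypothetical candidate cpu_t * chosen by the caller.
 * Both CPUs are assumed to have valid cpu_physid data, as above.
 */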
/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc()
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}
/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY | CMT_BALANCE);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}
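/*
 * The policy returned above is a bitmask, so callers test individual
 * optimizations with a bitwise AND. For example (illustrative):
 *
 *	pg_cmt_policy_t p = pg_cmt_policy(hw);
 *	if (p & (CMT_BALANCE | CMT_COALESCE))
 *		... the PG participates in load balancing / coalescence ...
 *	if (p & CMT_AFFINITY)
 *		... the PG is also an affinity target ...
 *
 * This mirrors the tests made in pg_cmt_cpu_init() and pg_cmt_cpu_active()
 * below.
 */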
/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}
/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there aren't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
	}
}
/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL, NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * present in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage.
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references the CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}
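/*
 * For illustration, consider a hypothetical lineage where a power domain PG
 * "pwr" is the only child of a chip PG "chip", and both span the same CPUs.
 * Promoting "pwr" with cmt_hier_promote(pwr, NULL) effectively swaps the two
 * levels:
 *
 *	before:  ... -> chip -> pwr -> (children of pwr)
 *	after:   ... -> pwr -> chip -> (children previously under pwr)
 *
 * The parent/child/sibling groups, the per CPU cmt_pgs orderings, and
 * cmt_lineage are all updated above so that the dispatcher's top-down
 * traversal sees the new ordering.
 */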
/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) >
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		int	children;
		int	err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}
}
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs.
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}
/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}
/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition.
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}
/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * and bitset.
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}
/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}
/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int	i, j, inc, sz;
	int	start, end;
	pg_t	*tmp;
	pg_t	**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			tmp = h[i];
			j = i;
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j -= inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find various contiguous sets of elements,
		 * in the array, with the same number of cpus
		 */
		end = start + 1;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}
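/*
 * Worked example (hypothetical CPU counts): given a lineage of
 * { chip (8 CPUs), core (2 CPUs), cache (8 CPUs) }, the first pass orders the
 * entries ascending by size, e.g. { core, chip, cache }, and the second pass
 * then reorders the equally sized "chip" and "cache" entries according to
 * pg_cmt_hier_rank(), so that the more important relationship ends up nearer
 * the top of the lineage.
 */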
/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}
/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}
/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled == 1)
		return (-1);

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}
int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled == 1)
		return (-1);

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}
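/*
 * Example (illustrative): the CPU power management code can switch the
 * dispatcher in and out of power aware dispatching for active power domains
 * roughly as follows, holding cpu_lock as required above:
 *
 *	mutex_enter(&cpu_lock);
 *	if (cmt_pad_enable(PGHW_POW_ACTIVE) != 0)
 *		... fall back to non power-aware dispatching ...
 *	mutex_exit(&cpu_lock);
 *
 * and symmetrically with cmt_pad_disable(PGHW_POW_ACTIVE) when the policy is
 * turned off.
 */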
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_inc_32(&cmt_pg->cmt_utilization);
	} else if (new == cp->cpu_idle_thread) {
		atomic_dec_32(&cmt_pg->cmt_utilization);
	}
}
/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
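/*
 * For example, cmt_ev_thread_swtch_pwr() below uses
 *
 *	if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
 *		... notify CPUPM that the domain went idle ...
 *	}
 *
 * to avoid telling the power manager that a domain went idle when the
 * outgoing thread is merely queued on (and about to run on) another CPU that
 * is still active in the same PG.
 */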
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_inc_32_nv(&cmt->cmt_utilization);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_dec_32_nv(&cmt->cmt_utilization);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}
/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	}

	if (policy & CMT_BALANCE)
		return ("Load Balancing");
	else if (policy & CMT_COALESCE)
		return ("Load Coalescence");

	return ("None");
}
/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Inform pghw layer that this PG is pruned.
	 */
	pghw_cmt_fini((pghw_t *)pg_bad);

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL, NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG's hardware sharing relationship.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r != -1);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r != -1);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}
/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL, NULL);

	cpu = cpu_list;
	do {
		group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}
/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of that PG's sharing relationship type) from the
 * CMT hierarchy. Further, future instances of that sharing relationship type
 * won't be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPU's PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that it will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/* FALLTHROUGH */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}

	return (cmt_lineage_status);
}
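/*
 * For illustration, a hypothetical non-concentric lineage: if a cache PG
 * contains CPUs {0,1,2,3} while the next (same sized) PG in the lineage, a
 * BIOS defined power domain, contains {0,1,4,5}, then CPUs 2 and 3 appear in
 * the smaller PG but not in the larger one. The walk above flags this as
 * CMT_LINEAGE_NON_CONCENTRIC, and since power domains are
 * PG_CMT_HW_SUSPECT(), the offending grouping is pruned via pg_cmt_prune()
 * and the status becomes CMT_LINEAGE_REPAIRED.
 */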