/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other. Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency. Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine. There is always at least a root lgroup in
 * the system. It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency. A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root). In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine. Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread. At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
 * with the lowest load average. Binding to a processor or processor set will
 * change the home lgroup for a thread. The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup. Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible. If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
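/*
 * The sketch below is purely illustrative and not part of the original
 * source: assuming only the lgrp_parent pointer described above, it shows
 * how a consumer could step from a leaf lgroup toward the root to widen a
 * resource search to the next level of locality. The helper name is
 * hypothetical.
 */
#if 0	/* illustrative sketch only; not compiled */
static lgrp_t *
lgrp_next_farther(lgrp_t *lgrp)
{
	/* Each ancestor lgroup contains the next nearest resources. */
	if (lgrp->lgrp_parent != NULL)
		return (lgrp->lgrp_parent);
	return (lgrp);		/* already at the root */
}
#endif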
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX];	/* table of all initialized lgrp_t structs */
				/* indexed by lgrp_id */
int	nlgrps;			/* number of lgroups in machine */
int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in the lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from
 * parallel modifications by lgrp_kstat_mutex. This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
/*
 * Declare kstat names statically for enums as defined in the header file.
 */
static void	lgrp_kstat_init(void);
static int	lgrp_kstat_extract(kstat_t *, int);
static void	lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named	lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t			lgrp_kstat_mutex;
/*
 * max number of lgroups supported by the platform
 */
int	nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t		*lgrp_root = NULL;
/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs a default lpl should use lpl_bootstrap, which is a pointer
 * to the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system, but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define	LPL_BOOTSTRAP_SIZE 2
static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t		*lpl_bootstrap;
static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
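/*
 * Illustrative sketch (not part of this file): how an allocator-side check
 * might use LGRP_CPU_HAS_NO_LGRP() to fall back to the root lgroup for a CPU
 * that is still on the bootstrap lpl. The helper name and the fallback choice
 * are hypothetical; lgrp_mem_choose() is the real consumer of this macro.
 */
#if 0	/* sketch only; not compiled */
static lgrp_t *
lgrp_for_alloc_sketch(struct cpu *cp)
{
	if (LGRP_CPU_HAS_NO_LGRP(cp))
		return (lgrp_root);	/* CPU has no home lgroup yet */
	return (lgrp_table[cp->cpu_lpl->lpl_lgrpid]);
}
#endif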
/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory. Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t	lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
	((lgrp_loadavg_max_effect) / (ncpu))
uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
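/*
 * Illustrative note (not from this file): with the default value above, the
 * per-thread load contribution shrinks as the lgroup gains CPUs. For example,
 * in a 4-CPU lgroup each thread contributes LGRP_LOADAVG_THREAD_MAX / 4.
 */
#if 0	/* sketch only; not compiled */
	uint32_t contrib = LGRP_LOADAVG_MAX_EFFECT(4);
#endif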
/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory. Default is 8MB (2 ISM pages).
 */
size_t	lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int	lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy. For most platforms,
 * next touch is sufficient, but some platforms may wish to override it.
 */
lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
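/*
 * Illustrative sketch (not from this file): how the size thresholds above
 * could drive the choice between the default policy and random placement.
 * The helper is hypothetical; the actual decision is made by the lgroup
 * memory allocation policy code elsewhere in the kernel.
 */
#if 0	/* sketch only; not compiled */
static lgrp_mem_policy_t
lgrp_policy_for_size_sketch(size_t size, int is_shared)
{
	size_t	thresh;

	thresh = is_shared ? lgrp_shm_random_thresh : lgrp_privm_random_thresh;

	/* Large mappings are spread randomly across lgroups. */
	if (size >= thresh)
		return (LGRP_MEM_POLICY_RANDOM);
	return (lgrp_mem_default_policy);
}
#endif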
/*
 * lgroup CPU event handlers
 */
static void	lgrp_cpu_init(struct cpu *);
static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void	lgrp_part_del_cpu(struct cpu *);

/*
 * lgroup framework initialization
 */
static void	lgrp_main_init(void);
static void	lgrp_main_mp_init(void);
static void	lgrp_root_init(void);
static void	lgrp_setup(void);
static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void	lpl_clear(lpl_t *);
static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
static void	lpl_rset_add(lpl_t *, lpl_t *);
static void	lpl_rset_del(lpl_t *, lpl_t *);
static int	lpl_rset_contains(lpl_t *, lpl_t *);
static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void	lpl_child_update(lpl_t *, struct cpupart *);
static int	lpl_pick(lpl_t *, lpl_t *);
static void	lpl_verify_wrapper(struct cpupart *);
/*
 * defines for lpl topology verifier return codes
 */
#define	LPL_TOPO_CORRECT		0
#define	LPL_TOPO_PART_HAS_NO_LPL	-1
#define	LPL_TOPO_CPUS_NOT_EMPTY		-2
#define	LPL_TOPO_LGRP_MISMATCH		-3
#define	LPL_TOPO_MISSING_PARENT		-4
#define	LPL_TOPO_PARENT_MISMATCH	-5
#define	LPL_TOPO_BAD_CPUCNT		-6
#define	LPL_TOPO_RSET_MISMATCH		-7
#define	LPL_TOPO_LPL_ORPHANED		-8
#define	LPL_TOPO_LPL_BAD_NCPU		-9
#define	LPL_TOPO_RSET_MSSNG_LF		-10
#define	LPL_TOPO_CPU_HAS_BAD_LPL	-11
#define	LPL_TOPO_NONLEAF_HAS_CPUS	-12
#define	LPL_TOPO_LGRP_NOT_LEAF		-13
#define	LPL_TOPO_BAD_RSETCNT		-14
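/*
 * Illustrative sketch (not from this file): the typical way these return
 * codes are consumed. lgrp_config() below does essentially this after each
 * CPU or partition reconfiguration event.
 */
#if 0	/* sketch only; not compiled */
static void
check_topology_sketch(cpupart_t *cpupart)
{
	int rc = lpl_topo_verify(cpupart);

	if (rc != LPL_TOPO_CORRECT)
		panic("lpl_topo_verify failed: %d", rc);
}
#endif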
/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
	/*
	 * System must have more than 2 lgroups to enable lgroup optimizations
	 *
	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
	 * with one child lgroup containing all the resources. A 2 lgroup
	 * system with a root lgroup directly containing CPUs or memory might
	 * need lgroup optimizations with its child lgroup, but there
	 * isn't such a machine for now....
	 */
	/*
	 * Create the "root" lgroup
	 */
	lgrp_root->lgrp_cpu = NULL;
	lgrp_root->lgrp_mnodes = 0;
	lgrp_root->lgrp_nmnodes = 0;
	hand = lgrp_plat_root_hand();
	lgrp_root->lgrp_plathand = hand;

	lgrp_root->lgrp_id = id;
	lgrp_root->lgrp_cpucnt = 0;
	lgrp_root->lgrp_childcnt = 0;
	klgrpset_clear(lgrp_root->lgrp_children);
	klgrpset_clear(lgrp_root->lgrp_leaves);
	lgrp_root->lgrp_parent = NULL;
	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

	for (i = 0; i < LGRP_RSRC_COUNT; i++)
		klgrpset_clear(lgrp_root->lgrp_set[i]);

	lgrp_root->lgrp_kstat = NULL;

	lgrp_table[id] = lgrp_root;
	/*
	 * Setup initial lpl list for CPU0 and initial t0 home.
	 * The only lpl space we have so far is lpl_bootstrap. It is used for
	 * all topology operations until cp_default is initialized at which
	 * point t0.t_lpl will be updated.
	 */
	lpl_bootstrap = lpl_bootstrap_list;
	t0.t_lpl = lpl_bootstrap;
	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
	lpl_bootstrap_list[1].lpl_lgrpid = 1;

	/*
	 * Set up the bootstrap rset.
	 * Since the bootstrap topology has just the root and a leaf,
	 * the rset contains just the leaf, and both lpls can use the same rset.
	 */
	lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
	lpl_bootstrap_list[0].lpl_rset_sz = 1;
	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;

	lpl_bootstrap_list[1].lpl_rset_sz = 1;
	lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;

	cp_default.cp_lgrploads = lpl_bootstrap;
360 * Initialize the lgroup framework and allow the platform to do the same
362 * This happens in stages during boot and is all funnelled through this routine
363 * (see definition of lgrp_init_stages_t to see what happens at each stage and
367 lgrp_init(lgrp_init_stages_t stage
)
370 * Initialize the platform
372 lgrp_plat_init(stage
);
375 case LGRP_INIT_STAGE1
:
377 * Set max number of lgroups supported on this platform which
378 * must be less than the max number of lgroups supported by the
379 * common lgroup framework (eg. NLGRPS_MAX is max elements in
380 * lgrp_table[], etc.)
382 nlgrpsmax
= lgrp_plat_max_lgrps();
383 ASSERT(nlgrpsmax
<= NLGRPS_MAX
);
386 case LGRP_INIT_STAGE2
:
390 case LGRP_INIT_STAGE4
:
394 case LGRP_INIT_STAGE5
:
404 * Create the root and cpu0's lgroup, and set t0's home.
410 * Setup the root lgroup
415 * Add cpu0 to an lgroup
417 lgrp_config(LGRP_CONFIG_CPU_ADD
, (uintptr_t)CPU
, 0);
418 lgrp_config(LGRP_CONFIG_CPU_ONLINE
, (uintptr_t)CPU
, 0);
422 * true when lgrp initialization has been completed.
424 int lgrp_initialized
= 0;
427 * True when lgrp topology is constructed.
429 int lgrp_topo_initialized
= 0;
432 * Init routine called after startup(), /etc/system has been processed,
433 * and cpu0 has been added to an lgroup.
441 extern void pg_cpu0_reinit();
444 * Enforce a valid lgrp_mem_default_policy
446 if ((lgrp_mem_default_policy
<= LGRP_MEM_POLICY_DEFAULT
) ||
447 (lgrp_mem_default_policy
>= LGRP_NUM_MEM_POLICIES
) ||
448 (lgrp_mem_default_policy
== LGRP_MEM_POLICY_NEXT_SEG
))
449 lgrp_mem_default_policy
= LGRP_MEM_POLICY_NEXT
;
	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here. If mpo should be disabled, move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups. This effectively
	 * provides a UMA lgroup topology.
	 */
462 lgrpid
= cp
->cpu_lpl
->lpl_lgrpid
;
463 if (lgrp_table
[lgrpid
]->lgrp_plathand
!=
464 lgrp_plat_cpu_to_hand(cp
->cpu_id
)) {
465 lgrp_part_del_cpu(cp
);
466 lgrp_cpu_fini(cp
, lgrpid
);
469 lgrp_part_add_cpu(cp
, cp
->cpu_lpl
->lpl_lgrpid
);
471 ASSERT(cp
->cpu_lpl
->lpl_lgrpid
== LGRP_ROOTID
);
474 * Notify the PG subsystem that the CPU's lgrp
475 * association has changed
480 * Destroy all lgroups except for root
482 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
483 if (LGRP_EXISTS(lgrp_table
[i
]) &&
484 lgrp_table
[i
] != lgrp_root
)
485 lgrp_destroy(lgrp_table
[i
]);
489 * Fix up root to point at itself for leaves and resources
490 * and not have any children
492 lgrp_root
->lgrp_childcnt
= 0;
493 klgrpset_clear(lgrp_root
->lgrp_children
);
494 klgrpset_clear(lgrp_root
->lgrp_leaves
);
495 klgrpset_add(lgrp_root
->lgrp_leaves
, LGRP_ROOTID
);
496 klgrpset_clear(lgrp_root
->lgrp_set
[LGRP_RSRC_MEM
]);
497 klgrpset_add(lgrp_root
->lgrp_set
[LGRP_RSRC_MEM
], LGRP_ROOTID
);
501 * Initialize kstats framework.
	 * cpu0 is finally where it should be, so create its lgroup's kstats
507 mutex_enter(&cpu_lock
);
508 lgrp_kstat_create(cp
);
509 mutex_exit(&cpu_lock
);
511 lgrp_initialized
= 1;
515 * Finish lgrp initialization after all CPUS are brought on-line.
516 * This routine is called after start_other_cpus().
519 lgrp_main_mp_init(void)
524 * Update lgroup topology (if necessary)
526 klgrpset_clear(changed
);
527 (void) lgrp_topo_update(lgrp_table
, lgrp_alloc_max
+ 1, &changed
);
528 lgrp_topo_initialized
= 1;
532 * Change latency of lgroup with specified lgroup platform handle (if one is
533 * given) or change all lgroups with old latency to new latency
536 lgrp_latency_change(lgrp_handle_t hand
, u_longlong_t oldtime
,
537 u_longlong_t newtime
)
542 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
543 lgrp
= lgrp_table
[i
];
545 if (!LGRP_EXISTS(lgrp
))
548 if ((hand
== LGRP_NULL_HANDLE
&&
549 lgrp
->lgrp_latency
== oldtime
) ||
550 (hand
!= LGRP_NULL_HANDLE
&& lgrp
->lgrp_plathand
== hand
))
551 lgrp
->lgrp_latency
= (int)newtime
;
556 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
559 lgrp_config(lgrp_config_flag_t event
, uintptr_t resource
, uintptr_t where
)
568 * The following (re)configuration events are common code
569 * initiated. lgrp_plat_config() is called here to inform the
570 * platform of the reconfiguration event.
572 case LGRP_CONFIG_CPU_ADD
:
573 cp
= (cpu_t
*)resource
;
576 * Initialize the new CPU's lgrp related next/prev
577 * links, and give it a bootstrap lpl so that it can
578 * survive should it need to enter the dispatcher.
580 cp
->cpu_next_lpl
= cp
;
581 cp
->cpu_prev_lpl
= cp
;
582 cp
->cpu_next_lgrp
= cp
;
583 cp
->cpu_prev_lgrp
= cp
;
584 cp
->cpu_lpl
= lpl_bootstrap
;
586 lgrp_plat_config(event
, resource
);
587 atomic_inc_32(&lgrp_gen
);
590 case LGRP_CONFIG_CPU_DEL
:
591 lgrp_plat_config(event
, resource
);
592 atomic_inc_32(&lgrp_gen
);
595 case LGRP_CONFIG_CPU_ONLINE
:
596 cp
= (cpu_t
*)resource
;
598 lgrp_part_add_cpu(cp
, cp
->cpu_lpl
->lpl_lgrpid
);
599 rc
= lpl_topo_verify(cp
->cpu_part
);
600 if (rc
!= LPL_TOPO_CORRECT
) {
601 panic("lpl_topo_verify failed: %d", rc
);
603 lgrp_plat_config(event
, resource
);
604 atomic_inc_32(&lgrp_gen
);
607 case LGRP_CONFIG_CPU_OFFLINE
:
608 cp
= (cpu_t
*)resource
;
609 id
= cp
->cpu_lpl
->lpl_lgrpid
;
610 lgrp_part_del_cpu(cp
);
611 lgrp_cpu_fini(cp
, id
);
612 rc
= lpl_topo_verify(cp
->cpu_part
);
613 if (rc
!= LPL_TOPO_CORRECT
) {
614 panic("lpl_topo_verify failed: %d", rc
);
616 lgrp_plat_config(event
, resource
);
617 atomic_inc_32(&lgrp_gen
);
620 case LGRP_CONFIG_CPUPART_ADD
:
621 cp
= (cpu_t
*)resource
;
622 lgrp_part_add_cpu((cpu_t
*)resource
, (lgrp_id_t
)where
);
623 rc
= lpl_topo_verify(cp
->cpu_part
);
624 if (rc
!= LPL_TOPO_CORRECT
) {
625 panic("lpl_topo_verify failed: %d", rc
);
627 lgrp_plat_config(event
, resource
);
630 case LGRP_CONFIG_CPUPART_DEL
:
631 cp
= (cpu_t
*)resource
;
632 lgrp_part_del_cpu((cpu_t
*)resource
);
633 rc
= lpl_topo_verify(cp
->cpu_part
);
634 if (rc
!= LPL_TOPO_CORRECT
) {
635 panic("lpl_topo_verify failed: %d", rc
);
637 lgrp_plat_config(event
, resource
);
641 * The following events are initiated by the memnode
644 case LGRP_CONFIG_MEM_ADD
:
645 lgrp_mem_init((int)resource
, where
, B_FALSE
);
646 atomic_inc_32(&lgrp_gen
);
649 case LGRP_CONFIG_MEM_DEL
:
650 lgrp_mem_fini((int)resource
, where
, B_FALSE
);
651 atomic_inc_32(&lgrp_gen
);
654 case LGRP_CONFIG_MEM_RENAME
: {
655 lgrp_config_mem_rename_t
*ren_arg
=
656 (lgrp_config_mem_rename_t
*)where
;
658 lgrp_mem_rename((int)resource
,
659 ren_arg
->lmem_rename_from
,
660 ren_arg
->lmem_rename_to
);
661 atomic_inc_32(&lgrp_gen
);
665 case LGRP_CONFIG_GEN_UPDATE
:
666 atomic_inc_32(&lgrp_gen
);
669 case LGRP_CONFIG_FLATTEN
:
671 lgrp_topo_levels
= (int)resource
;
673 (void) lgrp_topo_flatten(resource
,
674 lgrp_table
, lgrp_alloc_max
, &changed
);
678 * Update any lgroups with old latency to new latency
680 case LGRP_CONFIG_LAT_CHANGE_ALL
:
681 lgrp_latency_change(LGRP_NULL_HANDLE
, (u_longlong_t
)resource
,
682 (u_longlong_t
)where
);
686 * Update lgroup with specified lgroup platform handle to have
689 case LGRP_CONFIG_LAT_CHANGE
:
690 lgrp_latency_change((lgrp_handle_t
)resource
, 0,
691 (u_longlong_t
)where
);
694 case LGRP_CONFIG_NOP
:
704 * Called to add lgrp info into cpu structure from cpu_add_unit;
705 * do not assume cpu is in cpu[] yet!
707 * CPUs are brought online with all other CPUs paused so we can't
708 * allocate memory or we could deadlock the system, so we rely on
709 * the platform to statically allocate as much space as we need
710 * for the lgrp structs and stats.
713 lgrp_cpu_init(struct cpu
*cp
)
724 * This is the first time through if the resource set
725 * for the root lgroup is empty. After cpu0 has been
726 * initially added to an lgroup, the root's CPU resource
727 * set can never be empty, since the system's last CPU
728 * cannot be offlined.
730 if (klgrpset_isempty(lgrp_root
->lgrp_set
[LGRP_RSRC_CPU
])) {
732 * First time through.
737 * If cpu0 needs to move lgroups, we may come
738 * through here again, at which time cpu_lock won't
739 * be held, and lgrp_initialized will be false.
741 ASSERT(MUTEX_HELD(&cpu_lock
) || !lgrp_initialized
);
742 ASSERT(cp
->cpu_part
!= NULL
);
746 hand
= lgrp_plat_cpu_to_hand(cp
->cpu_id
);
747 my_lgrp
= lgrp_hand_to_lgrp(hand
);
749 if (my_lgrp
== NULL
) {
751 * Create new lgrp and add it to lgroup topology
753 my_lgrp
= lgrp_create();
754 my_lgrp
->lgrp_plathand
= hand
;
755 my_lgrp
->lgrp_latency
= lgrp_plat_latency(hand
, hand
);
756 lgrpid
= my_lgrp
->lgrp_id
;
757 klgrpset_add(my_lgrp
->lgrp_leaves
, lgrpid
);
758 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
761 klgrpset_clear(changed
);
762 count
+= lgrp_leaf_add(my_lgrp
, lgrp_table
, lgrp_alloc_max
+ 1,
765 * May have added new intermediate lgroups, so need to add
766 * resources other than CPUs which are added below
768 (void) lgrp_mnode_update(changed
, NULL
);
769 } else if (my_lgrp
->lgrp_latency
== 0 && lgrp_plat_latency(hand
, hand
)
772 * Leaf lgroup was created, but latency wasn't available
773 * then. So, set latency for it and fill in rest of lgroup
774 * topology now that we know how far it is from other leaf
777 lgrpid
= my_lgrp
->lgrp_id
;
778 klgrpset_clear(changed
);
779 if (!klgrpset_ismember(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
],
781 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
782 count
= lgrp_leaf_add(my_lgrp
, lgrp_table
, lgrp_alloc_max
+ 1,
786 * May have added new intermediate lgroups, so need to add
787 * resources other than CPUs which are added below
789 (void) lgrp_mnode_update(changed
, NULL
);
790 } else if (!klgrpset_ismember(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
],
795 * Update existing lgroup and lgroups containing it with CPU
798 lgrpid
= my_lgrp
->lgrp_id
;
799 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
800 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
803 lgrp
= lgrp_table
[i
];
804 if (!LGRP_EXISTS(lgrp
) ||
805 !lgrp_rsets_member(lgrp
->lgrp_set
, lgrpid
))
808 klgrpset_add(lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
812 lgrpid
= my_lgrp
->lgrp_id
;
813 cp
->cpu_lpl
= &cp
->cpu_part
->cp_lgrploads
[lgrpid
];
	 * For multi-lgroup systems, need to setup the lpl for CPU0 or CPU0 will
	 * end up in the lpl for lgroup 0 whether it is supposed to be in there
	 * or not since none of the lgroup IDs in the lpls have been set yet.
820 if (first_cpu
&& nlgrpsmax
> 1 && lgrpid
!= cp
->cpu_lpl
->lpl_lgrpid
)
821 cp
->cpu_lpl
->lpl_lgrpid
= lgrpid
;
824 * link the CPU into the lgrp's CPU list
826 if (my_lgrp
->lgrp_cpucnt
== 0) {
827 my_lgrp
->lgrp_cpu
= cp
;
828 cp
->cpu_next_lgrp
= cp
->cpu_prev_lgrp
= cp
;
830 cptr
= my_lgrp
->lgrp_cpu
;
831 cp
->cpu_next_lgrp
= cptr
;
832 cp
->cpu_prev_lgrp
= cptr
->cpu_prev_lgrp
;
833 cptr
->cpu_prev_lgrp
->cpu_next_lgrp
= cp
;
834 cptr
->cpu_prev_lgrp
= cp
;
836 my_lgrp
->lgrp_cpucnt
++;
846 ASSERT(!lgrp_initialized
|| MUTEX_HELD(&cpu_lock
));
849 * Find an open slot in the lgroup table and recycle unused lgroup
853 if (lgrp_alloc_hint
== -1)
855 * Allocate from end when hint not set yet because no lgroups
856 * have been deleted yet
861 * Start looking for next open slot from hint and leave hint
864 for (i
= lgrp_alloc_hint
; i
< nlgrpsmax
; i
++) {
865 my_lgrp
= lgrp_table
[i
];
866 if (!LGRP_EXISTS(my_lgrp
)) {
872 lgrp_alloc_hint
= lgrpid
;
876 * Keep track of max lgroup ID allocated so far to cut down on searches
878 if (lgrpid
> lgrp_alloc_max
)
879 lgrp_alloc_max
= lgrpid
;
882 * Need to allocate new lgroup if next open slot didn't have one
886 my_lgrp
= lgrp_plat_alloc(lgrpid
);
888 if (nlgrps
> nlgrpsmax
|| my_lgrp
== NULL
)
889 panic("Too many lgrps for platform (%d)", nlgrps
);
891 my_lgrp
->lgrp_id
= lgrpid
;
892 my_lgrp
->lgrp_latency
= 0;
893 my_lgrp
->lgrp_plathand
= LGRP_NULL_HANDLE
;
894 my_lgrp
->lgrp_parent
= NULL
;
895 my_lgrp
->lgrp_childcnt
= 0;
896 my_lgrp
->lgrp_mnodes
= (mnodeset_t
)0;
897 my_lgrp
->lgrp_nmnodes
= 0;
898 klgrpset_clear(my_lgrp
->lgrp_children
);
899 klgrpset_clear(my_lgrp
->lgrp_leaves
);
900 for (i
= 0; i
< LGRP_RSRC_COUNT
; i
++)
901 klgrpset_clear(my_lgrp
->lgrp_set
[i
]);
903 my_lgrp
->lgrp_cpu
= NULL
;
904 my_lgrp
->lgrp_cpucnt
= 0;
906 if (my_lgrp
->lgrp_kstat
!= NULL
)
907 lgrp_kstat_reset(lgrpid
);
909 lgrp_table
[my_lgrp
->lgrp_id
] = my_lgrp
;
915 lgrp_destroy(lgrp_t
*lgrp
)
920 * Unless this lgroup is being destroyed on behalf of
921 * the boot CPU, cpu_lock must be held
923 ASSERT(!lgrp_initialized
|| MUTEX_HELD(&cpu_lock
));
926 cmn_err(CE_PANIC
, "Can't destroy only lgroup!");
928 if (!LGRP_EXISTS(lgrp
))
932 * Set hint to lgroup being deleted and try to keep lower numbered
933 * hints to facilitate finding empty slots
935 if (lgrp_alloc_hint
== -1 || lgrp
->lgrp_id
< lgrp_alloc_hint
)
936 lgrp_alloc_hint
= lgrp
->lgrp_id
;
939 * Mark this lgroup to be recycled by setting its lgroup ID to
940 * LGRP_NONE and clear relevant fields
942 lgrp
->lgrp_id
= LGRP_NONE
;
943 lgrp
->lgrp_latency
= 0;
944 lgrp
->lgrp_plathand
= LGRP_NULL_HANDLE
;
945 lgrp
->lgrp_parent
= NULL
;
946 lgrp
->lgrp_childcnt
= 0;
948 klgrpset_clear(lgrp
->lgrp_children
);
949 klgrpset_clear(lgrp
->lgrp_leaves
);
950 for (i
= 0; i
< LGRP_RSRC_COUNT
; i
++)
951 klgrpset_clear(lgrp
->lgrp_set
[i
]);
953 lgrp
->lgrp_mnodes
= (mnodeset_t
)0;
954 lgrp
->lgrp_nmnodes
= 0;
956 lgrp
->lgrp_cpu
= NULL
;
957 lgrp
->lgrp_cpucnt
= 0;
 * Initialize kstat data. Called from lgrp initialization code.
966 lgrp_kstat_init(void)
970 mutex_init(&lgrp_kstat_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
972 for (stat
= 0; stat
< LGRP_NUM_STATS
; stat
++)
973 kstat_named_init(&lgrp_kstat_data
[stat
],
974 lgrp_kstat_names
[stat
], KSTAT_DATA_INT64
);
978 * initialize an lgrp's kstats if needed
979 * called with cpu_lock held but not with cpus paused.
980 * we don't tear these down now because we don't know about
981 * memory leaving the lgrp yet...
985 lgrp_kstat_create(cpu_t
*cp
)
991 ASSERT(MUTEX_HELD(&cpu_lock
));
993 lgrpid
= cp
->cpu_lpl
->lpl_lgrpid
;
994 my_lgrp
= lgrp_table
[lgrpid
];
996 if (my_lgrp
->lgrp_kstat
!= NULL
)
997 return; /* already initialized */
999 lgrp_kstat
= kstat_create("lgrp", lgrpid
, NULL
, "misc",
1000 KSTAT_TYPE_NAMED
, LGRP_NUM_STATS
,
1001 KSTAT_FLAG_VIRTUAL
| KSTAT_FLAG_WRITABLE
);
1003 if (lgrp_kstat
!= NULL
) {
1004 lgrp_kstat
->ks_lock
= &lgrp_kstat_mutex
;
1005 lgrp_kstat
->ks_private
= my_lgrp
;
1006 lgrp_kstat
->ks_data
= &lgrp_kstat_data
;
1007 lgrp_kstat
->ks_update
= lgrp_kstat_extract
;
1008 my_lgrp
->lgrp_kstat
= lgrp_kstat
;
1009 kstat_install(lgrp_kstat
);
1014 * this will do something when we manage to remove now unused lgrps
1019 lgrp_kstat_destroy(cpu_t
*cp
)
1021 ASSERT(MUTEX_HELD(&cpu_lock
));
1025 * Called when a CPU is off-lined.
1028 lgrp_cpu_fini(struct cpu
*cp
, lgrp_id_t lgrpid
)
1034 ASSERT(MUTEX_HELD(&cpu_lock
) || !lgrp_initialized
);
1036 prev
= cp
->cpu_prev_lgrp
;
1037 next
= cp
->cpu_next_lgrp
;
1039 prev
->cpu_next_lgrp
= next
;
1040 next
->cpu_prev_lgrp
= prev
;
1043 * just because I'm paranoid doesn't mean...
1046 cp
->cpu_next_lgrp
= cp
->cpu_prev_lgrp
= NULL
;
1048 my_lgrp
= lgrp_table
[lgrpid
];
1049 my_lgrp
->lgrp_cpucnt
--;
1052 * Removing last CPU in lgroup, so update lgroup topology
1054 if (my_lgrp
->lgrp_cpucnt
== 0) {
1059 my_lgrp
->lgrp_cpu
= NULL
;
1062 * Remove this lgroup from its lgroup CPU resources and remove
1063 * lgroup from lgroup topology if it doesn't have any more
1064 * resources in it now
1066 klgrpset_del(my_lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
1067 if (lgrp_rsets_empty(my_lgrp
->lgrp_set
)) {
1069 klgrpset_clear(changed
);
1070 count
+= lgrp_leaf_delete(my_lgrp
, lgrp_table
,
1071 lgrp_alloc_max
+ 1, &changed
);
1076 * This lgroup isn't empty, so just remove it from CPU
1077 * resources of any lgroups that contain it as such
1079 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1082 lgrp
= lgrp_table
[i
];
1083 if (!LGRP_EXISTS(lgrp
) ||
1084 !klgrpset_ismember(lgrp
->lgrp_set
[LGRP_RSRC_CPU
],
1088 klgrpset_del(lgrp
->lgrp_set
[LGRP_RSRC_CPU
], lgrpid
);
1093 if (my_lgrp
->lgrp_cpu
== cp
)
1094 my_lgrp
->lgrp_cpu
= next
;
1099 * Update memory nodes in target lgroups and return ones that get changed
1102 lgrp_mnode_update(klgrpset_t target
, klgrpset_t
*changed
)
1112 klgrpset_clear(*changed
);
1114 if (klgrpset_isempty(target
))
1118 * Find each lgroup in target lgroups
1120 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1122 * Skip any lgroups that don't exist or aren't in target group
1124 lgrp
= lgrp_table
[i
];
1125 if (!klgrpset_ismember(target
, i
) || !LGRP_EXISTS(lgrp
)) {
1130 * Initialize memnodes for intermediate lgroups to 0
1131 * and update them from scratch since they may have completely
1134 if (lgrp
->lgrp_childcnt
&& lgrp
!= lgrp_root
) {
1135 lgrp
->lgrp_mnodes
= (mnodeset_t
)0;
1136 lgrp
->lgrp_nmnodes
= 0;
	 * Update memory nodes of target lgroup with memory nodes
1141 * from each lgroup in its lgroup memory resource set
1143 for (j
= 0; j
<= lgrp_alloc_max
; j
++) {
1147 * Skip any lgroups that don't exist or aren't in
1148 * memory resources of target lgroup
1150 lgrp_rsrc
= lgrp_table
[j
];
1151 if (!LGRP_EXISTS(lgrp_rsrc
) ||
1152 !klgrpset_ismember(lgrp
->lgrp_set
[LGRP_RSRC_MEM
],
1157 * Update target lgroup's memnodes to include memnodes
1160 for (k
= 0; k
< sizeof (mnodeset_t
) * NBBY
; k
++) {
1161 mnodeset_t mnode_mask
;
1163 mnode_mask
= (mnodeset_t
)1 << k
;
1164 if ((lgrp_rsrc
->lgrp_mnodes
& mnode_mask
) &&
1165 !(lgrp
->lgrp_mnodes
& mnode_mask
)) {
1166 lgrp
->lgrp_mnodes
|= mnode_mask
;
1167 lgrp
->lgrp_nmnodes
++;
1172 klgrpset_add(*changed
, lgrp
->lgrp_id
);
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * operation.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * accordingly.
 */
1215 lgrp_mem_rename(int mnode
, lgrp_handle_t from
, lgrp_handle_t to
)
1218 * Remove the memory from the source node and add it to the destination
1221 lgrp_mem_fini(mnode
, from
, B_TRUE
);
1222 lgrp_mem_init(mnode
, to
, B_TRUE
);
/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
1240 lgrp_mem_init(int mnode
, lgrp_handle_t hand
, boolean_t is_copy_rename
)
1247 mnodeset_t mnodes_mask
= ((mnodeset_t
)1 << mnode
);
1248 boolean_t drop_lock
= B_FALSE
;
1249 boolean_t need_synch
= B_FALSE
;
1252 * Grab CPU lock (if we haven't already)
1254 if (!MUTEX_HELD(&cpu_lock
)) {
1255 mutex_enter(&cpu_lock
);
1260 * This routine may be called from a context where we already
1261 * hold cpu_lock, and have already paused cpus.
1264 need_synch
= B_TRUE
;
1267 * Check if this mnode is already configured and return immediately if
1270 * NOTE: in special case of copy-rename of the only remaining mnode,
1271 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1272 * recognize this case and continue as usual, but skip the update to
1273 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
1274 * in topology, temporarily introduced by lgrp_mem_fini().
1276 if (! (is_copy_rename
&& (lgrp_root
->lgrp_mnodes
== mnodes_mask
)) &&
1277 lgrp_root
->lgrp_mnodes
& mnodes_mask
) {
1279 mutex_exit(&cpu_lock
);
1284 * Update lgroup topology with new memory resources, keeping track of
1285 * which lgroups change
1288 klgrpset_clear(changed
);
1289 my_lgrp
= lgrp_hand_to_lgrp(hand
);
1290 if (my_lgrp
== NULL
) {
1292 my_lgrp
= lgrp_create();
1293 lgrpid
= my_lgrp
->lgrp_id
;
1294 my_lgrp
->lgrp_plathand
= hand
;
1295 my_lgrp
->lgrp_latency
= lgrp_plat_latency(hand
, hand
);
1296 klgrpset_add(my_lgrp
->lgrp_leaves
, lgrpid
);
1297 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
], lgrpid
);
1300 pause_cpus(NULL
, NULL
);
1301 count
= lgrp_leaf_add(my_lgrp
, lgrp_table
, lgrp_alloc_max
+ 1,
1305 } else if (my_lgrp
->lgrp_latency
== 0 && lgrp_plat_latency(hand
, hand
)
1308 * Leaf lgroup was created, but latency wasn't available
1309 * then. So, set latency for it and fill in rest of lgroup
1310 * topology now that we know how far it is from other leaf
1313 klgrpset_clear(changed
);
1314 lgrpid
= my_lgrp
->lgrp_id
;
1315 if (!klgrpset_ismember(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
],
1317 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
], lgrpid
);
1319 pause_cpus(NULL
, NULL
);
1320 count
= lgrp_leaf_add(my_lgrp
, lgrp_table
, lgrp_alloc_max
+ 1,
1324 } else if (!klgrpset_ismember(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
],
1325 my_lgrp
->lgrp_id
)) {
1327 * Add new lgroup memory resource to existing lgroup
1329 lgrpid
= my_lgrp
->lgrp_id
;
1330 klgrpset_add(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
], lgrpid
);
1331 klgrpset_add(changed
, lgrpid
);
1333 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1336 lgrp
= lgrp_table
[i
];
1337 if (!LGRP_EXISTS(lgrp
) ||
1338 !lgrp_rsets_member(lgrp
->lgrp_set
, lgrpid
))
1341 klgrpset_add(lgrp
->lgrp_set
[LGRP_RSRC_MEM
], lgrpid
);
1342 klgrpset_add(changed
, lgrp
->lgrp_id
);
1348 * Add memory node to lgroup and remove lgroup from ones that need
1351 if (!(my_lgrp
->lgrp_mnodes
& mnodes_mask
)) {
1352 my_lgrp
->lgrp_mnodes
|= mnodes_mask
;
1353 my_lgrp
->lgrp_nmnodes
++;
1355 klgrpset_del(changed
, lgrpid
);
1358 * Update memory node information for all lgroups that changed and
1359 * contain new memory node as a resource
1362 (void) lgrp_mnode_update(changed
, NULL
);
1365 mutex_exit(&cpu_lock
);
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains the given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
1384 lgrp_mem_fini(int mnode
, lgrp_handle_t hand
, boolean_t is_copy_rename
)
1391 mnodeset_t mnodes_mask
;
1392 boolean_t drop_lock
= B_FALSE
;
1393 boolean_t need_synch
= B_FALSE
;
1396 * Grab CPU lock (if we haven't already)
1398 if (!MUTEX_HELD(&cpu_lock
)) {
1399 mutex_enter(&cpu_lock
);
1404 * This routine may be called from a context where we already
1405 * hold cpu_lock and have already paused cpus.
1408 need_synch
= B_TRUE
;
1410 my_lgrp
= lgrp_hand_to_lgrp(hand
);
1413 * The lgrp *must* be pre-existing
1415 ASSERT(my_lgrp
!= NULL
);
1418 * Delete memory node from lgroups which contain it
1420 mnodes_mask
= ((mnodeset_t
)1 << mnode
);
1421 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1422 lgrp_t
*lgrp
= lgrp_table
[i
];
1424 * Skip any non-existent lgroups and any lgroups that don't
1425 * contain leaf lgroup of memory as a memory resource
1427 if (!LGRP_EXISTS(lgrp
) ||
1428 !(lgrp
->lgrp_mnodes
& mnodes_mask
))
1432 * Avoid removing the last mnode from the root in the DR
1433 * copy-rename case. See lgrp_mem_rename() for details.
1435 if (is_copy_rename
&&
1436 (lgrp
== lgrp_root
) && (lgrp
->lgrp_mnodes
== mnodes_mask
))
1440 * Remove memory node from lgroup.
1442 lgrp
->lgrp_mnodes
&= ~mnodes_mask
;
1443 lgrp
->lgrp_nmnodes
--;
1444 ASSERT(lgrp
->lgrp_nmnodes
>= 0);
1446 ASSERT(lgrp_root
->lgrp_nmnodes
> 0);
1449 * Don't need to update lgroup topology if this lgroup still has memory.
1451 * In the special case of DR copy-rename with the only mnode being
1452 * removed, the lgrp_mnodes for the root is always non-zero, but we
1453 * still need to update the lgroup topology.
1455 if ((my_lgrp
->lgrp_nmnodes
> 0) &&
1456 !(is_copy_rename
&& (my_lgrp
== lgrp_root
) &&
1457 (my_lgrp
->lgrp_mnodes
== mnodes_mask
))) {
1459 mutex_exit(&cpu_lock
);
1464 * This lgroup does not contain any memory now
1466 klgrpset_clear(my_lgrp
->lgrp_set
[LGRP_RSRC_MEM
]);
1469 * Remove this lgroup from lgroup topology if it does not contain any
1472 lgrpid
= my_lgrp
->lgrp_id
;
1474 klgrpset_clear(changed
);
1475 if (lgrp_rsets_empty(my_lgrp
->lgrp_set
)) {
1477 * Delete lgroup when no more resources
1480 pause_cpus(NULL
, NULL
);
1481 count
= lgrp_leaf_delete(my_lgrp
, lgrp_table
,
1482 lgrp_alloc_max
+ 1, &changed
);
1488 * Remove lgroup from memory resources of any lgroups that
1489 * contain it as such
1491 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1494 lgrp
= lgrp_table
[i
];
1495 if (!LGRP_EXISTS(lgrp
) ||
1496 !klgrpset_ismember(lgrp
->lgrp_set
[LGRP_RSRC_MEM
],
1500 klgrpset_del(lgrp
->lgrp_set
[LGRP_RSRC_MEM
], lgrpid
);
1504 mutex_exit(&cpu_lock
);
1508 * Return lgroup with given platform handle
1511 lgrp_hand_to_lgrp(lgrp_handle_t hand
)
1516 if (hand
== LGRP_NULL_HANDLE
)
1519 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1520 lgrp
= lgrp_table
[i
];
1521 if (LGRP_EXISTS(lgrp
) && lgrp
->lgrp_plathand
== hand
)
/*
 * Return the home lgroup of the current thread.
 * We must do this with kernel preemption disabled, since we don't want our
 * thread to be re-homed while we're poking around with its lpl, and the lpl
 * should never be NULL.
 *
 * NOTE: Can't guarantee that the lgroup will be valid once kernel preemption
 * is enabled because of DR. Callers can disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
1539 lgrp_home_lgrp(void)
1546 lpl
= curthread
->t_lpl
;
1547 ASSERT(lpl
!= NULL
);
1548 ASSERT(lpl
->lpl_lgrpid
>= 0 && lpl
->lpl_lgrpid
<= lgrp_alloc_max
);
1549 ASSERT(LGRP_EXISTS(lgrp_table
[lpl
->lpl_lgrpid
]));
1550 lgrp
= lgrp_table
[lpl
->lpl_lgrpid
];
1558 * Return ID of home lgroup for given thread
1559 * (See comments for lgrp_home_lgrp() for special care and handling
1563 lgrp_home_id(kthread_t
*t
)
1570 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1571 * cannot since the HAT layer can call into this routine to
1572 * determine the locality for its data structures in the context
1579 ASSERT(lpl
!= NULL
);
1580 ASSERT(lpl
->lpl_lgrpid
>= 0 && lpl
->lpl_lgrpid
<= lgrp_alloc_max
);
1581 lgrp
= lpl
->lpl_lgrpid
;
1589 * Return lgroup containing the physical memory for the given page frame number
1592 lgrp_pfn_to_lgrp(pfn_t pfn
)
1598 hand
= lgrp_plat_pfn_to_hand(pfn
);
1599 if (hand
!= LGRP_NULL_HANDLE
)
1600 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1601 lgrp
= lgrp_table
[i
];
1602 if (LGRP_EXISTS(lgrp
) && lgrp
->lgrp_plathand
== hand
)
1609 * Return lgroup containing the physical memory for the given page frame number
1612 lgrp_phys_to_lgrp(u_longlong_t physaddr
)
1619 pfn
= btop(physaddr
);
1620 hand
= lgrp_plat_pfn_to_hand(pfn
);
1621 if (hand
!= LGRP_NULL_HANDLE
)
1622 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1623 lgrp
= lgrp_table
[i
];
1624 if (LGRP_EXISTS(lgrp
) && lgrp
->lgrp_plathand
== hand
)
/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take the precautions necessary to prevent
 * "cpu", and its lpl from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
1638 lgrp_cpu_to_lgrp(cpu_t
*cpu
)
1640 return (cpu
->cpu_lpl
->lpl_lgrp
);
1644 * Return the sum of the partition loads in an lgrp divided by
1645 * the number of CPUs in the lgrp. This is our best approximation
1646 * of an 'lgroup load average' for a useful per-lgroup kstat.
1649 lgrp_sum_loadavgs(lgrp_t
*lgrp
)
1655 mutex_enter(&cpu_lock
);
1657 cpu
= lgrp
->lgrp_cpu
;
1658 ncpu
= lgrp
->lgrp_cpucnt
;
1660 if (cpu
== NULL
|| ncpu
== 0) {
1661 mutex_exit(&cpu_lock
);
1666 loads
+= cpu
->cpu_lpl
->lpl_loadavg
;
1667 cpu
= cpu
->cpu_next_lgrp
;
1668 } while (cpu
!= lgrp
->lgrp_cpu
);
1670 mutex_exit(&cpu_lock
);
1672 return (loads
/ ncpu
);
1676 lgrp_stat_add(lgrp_id_t lgrpid
, lgrp_stat_t stat
, int64_t val
)
1678 struct lgrp_stats
*pstats
;
1681 * Verify that the caller isn't trying to add to
1682 * a statistic for an lgroup that has gone away
1684 if (lgrpid
< 0 || lgrpid
> lgrp_alloc_max
)
1687 pstats
= &lgrp_stats
[lgrpid
];
1688 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats
, stat
), val
);
1692 lgrp_stat_read(lgrp_id_t lgrpid
, lgrp_stat_t stat
)
1695 struct lgrp_stats
*pstats
;
1697 if (lgrpid
< 0 || lgrpid
> lgrp_alloc_max
)
1698 return ((int64_t)0);
1700 pstats
= &lgrp_stats
[lgrpid
];
1701 LGRP_STAT_READ(pstats
, stat
, val
);
1706 * Reset all kstats for lgrp specified by its lgrpid.
1709 lgrp_kstat_reset(lgrp_id_t lgrpid
)
1713 if (lgrpid
< 0 || lgrpid
> lgrp_alloc_max
)
1716 for (stat
= 0; stat
< LGRP_NUM_COUNTER_STATS
; stat
++) {
1717 LGRP_STAT_RESET(&lgrp_stats
[lgrpid
], stat
);
1722 * Collect all per-lgrp statistics for the lgrp associated with this
1723 * kstat, and store them in the ks_data array.
1725 * The superuser can reset all the running counter statistics for an
1726 * lgrp by writing to any of the lgrp's stats.
1729 lgrp_kstat_extract(kstat_t
*ksp
, int rw
)
1732 struct kstat_named
*ksd
;
1736 lgrp
= (lgrp_t
*)ksp
->ks_private
;
1738 ksd
= (struct kstat_named
*)ksp
->ks_data
;
1739 ASSERT(ksd
== (struct kstat_named
*)&lgrp_kstat_data
);
1741 lgrpid
= lgrp
->lgrp_id
;
1743 if (lgrpid
== LGRP_NONE
) {
1745 * Return all zeroes as stats for freed lgrp.
1747 for (stat
= 0; stat
< LGRP_NUM_COUNTER_STATS
; stat
++) {
1748 ksd
[stat
].value
.i64
= 0;
1750 ksd
[stat
+ LGRP_NUM_CPUS
].value
.i64
= 0;
1751 ksd
[stat
+ LGRP_NUM_PG_INSTALL
].value
.i64
= 0;
1752 ksd
[stat
+ LGRP_NUM_PG_AVAIL
].value
.i64
= 0;
1753 ksd
[stat
+ LGRP_NUM_PG_FREE
].value
.i64
= 0;
1754 ksd
[stat
+ LGRP_LOADAVG
].value
.i64
= 0;
1755 } else if (rw
!= KSTAT_WRITE
) {
1757 * Handle counter stats
1759 for (stat
= 0; stat
< LGRP_NUM_COUNTER_STATS
; stat
++) {
1760 ksd
[stat
].value
.i64
= lgrp_stat_read(lgrpid
, stat
);
1764 * Handle kernel data snapshot stats
1766 ksd
[stat
+ LGRP_NUM_CPUS
].value
.i64
= lgrp
->lgrp_cpucnt
;
1767 ksd
[stat
+ LGRP_NUM_PG_INSTALL
].value
.i64
=
1768 lgrp_mem_size(lgrpid
, LGRP_MEM_SIZE_INSTALL
);
1769 ksd
[stat
+ LGRP_NUM_PG_AVAIL
].value
.i64
=
1770 lgrp_mem_size(lgrpid
, LGRP_MEM_SIZE_AVAIL
);
1771 ksd
[stat
+ LGRP_NUM_PG_FREE
].value
.i64
=
1772 lgrp_mem_size(lgrpid
, LGRP_MEM_SIZE_FREE
);
1773 ksd
[stat
+ LGRP_LOADAVG
].value
.i64
= lgrp_sum_loadavgs(lgrp
);
1774 ksd
[stat
+ LGRP_LOADAVG_SCALE
].value
.i64
=
1775 lgrp_loadavg_max_effect
;
1777 lgrp_kstat_reset(lgrpid
);
1784 lgrp_query_cpu(processorid_t id
, lgrp_id_t
*lp
)
1788 mutex_enter(&cpu_lock
);
1790 if ((cp
= cpu_get(id
)) == NULL
) {
1791 mutex_exit(&cpu_lock
);
1795 if (cpu_is_offline(cp
) || cpu_is_poweredoff(cp
)) {
1796 mutex_exit(&cpu_lock
);
1800 ASSERT(cp
->cpu_lpl
!= NULL
);
1802 *lp
= cp
->cpu_lpl
->lpl_lgrpid
;
1804 mutex_exit(&cpu_lock
);
1810 lgrp_query_load(processorid_t id
, lgrp_load_t
*lp
)
1814 mutex_enter(&cpu_lock
);
1816 if ((cp
= cpu_get(id
)) == NULL
) {
1817 mutex_exit(&cpu_lock
);
1821 ASSERT(cp
->cpu_lpl
!= NULL
);
1823 *lp
= cp
->cpu_lpl
->lpl_loadavg
;
1825 mutex_exit(&cpu_lock
);
1831 * Add a resource named by lpl_leaf to rset of lpl_target
1833 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1834 * resource. It is adjusted here, as this is presently the only place that we
1835 * can be certain a resource addition has succeeded.
1837 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1838 * list in order until it reaches a NULL. (This list is required to be NULL
1839 * terminated, too). This is done so that we can mark start pos + 1, so that
1840 * each lpl is traversed sequentially, but in a different order. We hope this
1841 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
1845 lpl_rset_add(lpl_t
*lpl_target
, lpl_t
*lpl_leaf
)
1850 /* return if leaf is already present */
1851 for (i
= 0; i
< lpl_target
->lpl_nrset
; i
++) {
1852 if (lpl_target
->lpl_rset
[i
] == lpl_leaf
) {
1856 if (lpl_target
->lpl_rset
[i
]->lpl_lgrpid
>
1857 lpl_leaf
->lpl_lgrpid
) {
1862 /* insert leaf, update counts */
1864 i
= lpl_target
->lpl_nrset
++;
1867 * Start at the end of the rset array and work backwards towards the
1868 * slot into which the new lpl will be inserted. This effectively
1869 * preserves the current ordering by scooting everybody over one entry,
1870 * and placing the new entry into the space created.
1872 while (i
-- > entry_slot
) {
1873 lpl_target
->lpl_rset
[i
+ 1] = lpl_target
->lpl_rset
[i
];
1874 lpl_target
->lpl_id2rset
[lpl_target
->lpl_rset
[i
]->lpl_lgrpid
] =
1878 lpl_target
->lpl_rset
[entry_slot
] = lpl_leaf
;
1879 lpl_target
->lpl_id2rset
[lpl_leaf
->lpl_lgrpid
] = entry_slot
;
1881 lpl_target
->lpl_ncpu
+= lpl_leaf
->lpl_ncpu
;
1885 * Update each of lpl_parent's children with a reference to their parent.
1886 * The lgrp topology is used as the reference since it is fully
1887 * consistent and correct at this point.
1888 * This should be called after any potential change in lpl_parent's
1892 lpl_child_update(lpl_t
*lpl_parent
, struct cpupart
*cp
)
1894 klgrpset_t children
;
1897 children
= lgrp_table
[lpl_parent
->lpl_lgrpid
]->lgrp_children
;
1898 if (klgrpset_isempty(children
))
1899 return; /* nothing to do */
1901 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1902 if (klgrpset_ismember(children
, i
)) {
1904 * (Re)set the parent. It may be incorrect if
1905 * lpl_parent is new in the topology.
1907 cp
->cp_lgrploads
[i
].lpl_parent
= lpl_parent
;
1913 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1915 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1916 * resource. The values are adjusted here, as this is the only place that we can
1917 * be certain a resource was successfully deleted.
1920 lpl_rset_del(lpl_t
*lpl_target
, lpl_t
*lpl_leaf
)
1925 if (lpl_target
->lpl_nrset
== 0)
1928 /* find leaf in intermediate node */
1929 for (i
= 0; i
< lpl_target
->lpl_nrset
; i
++) {
1930 if (lpl_target
->lpl_rset
[i
] == lpl_leaf
)
1934 /* return if leaf not found */
1935 if (lpl_target
->lpl_rset
[i
] != lpl_leaf
)
1938 /* prune leaf, compress array */
1939 lpl_target
->lpl_rset
[lpl_target
->lpl_nrset
--] = NULL
;
1940 lpl_target
->lpl_id2rset
[lpl_leaf
->lpl_lgrpid
] = -1;
1941 lpl_target
->lpl_ncpu
--;
1943 lpl_target
->lpl_rset
[i
] = lpl_target
->lpl_rset
[i
+ 1];
1945 * Update the lgrp id <=> rset mapping
1947 if ((leaf
= lpl_target
->lpl_rset
[i
]) != NULL
) {
1948 lpl_target
->lpl_id2rset
[leaf
->lpl_lgrpid
] = i
;
1950 } while (i
++ < lpl_target
->lpl_nrset
);
1954 * Check to see if the resource set of the target lpl contains the
1955 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
1959 lpl_rset_contains(lpl_t
*lpl_target
, lpl_t
*lpl_leaf
)
1963 for (i
= 0; i
< lpl_target
->lpl_nrset
; i
++) {
1964 if (lpl_target
->lpl_rset
[i
] == lpl_leaf
)
1972 * Called when we change cpu lpl membership. This increments or decrements the
1973 * per-cpu counter in every lpl in which our leaf appears.
1976 lpl_cpu_adjcnt(lpl_act_t act
, cpu_t
*cp
)
1985 ASSERT(act
== LPL_DECREMENT
|| act
== LPL_INCREMENT
);
1987 cpupart
= cp
->cpu_part
;
1988 lpl_leaf
= cp
->cpu_lpl
;
1989 lgrp_leaf
= lgrp_table
[lpl_leaf
->lpl_lgrpid
];
1991 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
1992 lgrp_cur
= lgrp_table
[i
];
1995 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
1996 * for the cpu in question, or if the current lgrp and leaf
1997 * don't share the same resources.
2000 if (!LGRP_EXISTS(lgrp_cur
) || (lgrp_cur
== lgrp_leaf
) ||
2001 !klgrpset_intersects(lgrp_leaf
->lgrp_set
[LGRP_RSRC_CPU
],
2002 lgrp_cur
->lgrp_set
[LGRP_RSRC_CPU
]))
2006 lpl_cur
= &cpupart
->cp_lgrploads
[lgrp_cur
->lgrp_id
];
2008 if (lpl_cur
->lpl_nrset
> 0) {
2009 if (act
== LPL_INCREMENT
) {
2010 lpl_cur
->lpl_ncpu
++;
2011 } else if (act
== LPL_DECREMENT
) {
2012 lpl_cur
->lpl_ncpu
--;
2019 * Initialize lpl with given resources and specified lgrp
2022 lpl_init(lpl_t
*lpl
, lpl_t
*lpl_leaf
, lgrp_t
*lgrp
)
2024 lpl
->lpl_lgrpid
= lgrp
->lgrp_id
;
2025 lpl
->lpl_loadavg
= 0;
2026 if (lpl
== lpl_leaf
)
2029 lpl
->lpl_ncpu
= lpl_leaf
->lpl_ncpu
;
2031 lpl
->lpl_rset
[0] = lpl_leaf
;
2032 lpl
->lpl_id2rset
[lpl_leaf
->lpl_lgrpid
] = 0;
2033 lpl
->lpl_lgrp
= lgrp
;
2034 lpl
->lpl_parent
= NULL
; /* set by lpl_leaf_insert() */
2035 lpl
->lpl_cpus
= NULL
; /* set by lgrp_part_add_cpu() */
2039 * Clear an unused lpl
2042 lpl_clear(lpl_t
*lpl
)
2045 * Clear out all fields in the lpl except:
2046 * lpl_lgrpid - to facilitate debugging
2047 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2049 * Note that the lpl's rset and id2rset mapping are cleared as well.
2051 lpl
->lpl_loadavg
= 0;
2053 lpl
->lpl_lgrp
= NULL
;
2054 lpl
->lpl_parent
= NULL
;
2055 lpl
->lpl_cpus
= NULL
;
2057 lpl
->lpl_homed_time
= 0;
2058 bzero(lpl
->lpl_rset
, sizeof (lpl
->lpl_rset
[0]) * lpl
->lpl_rset_sz
);
2059 bzero(lpl
->lpl_id2rset
,
2060 sizeof (lpl
->lpl_id2rset
[0]) * lpl
->lpl_rset_sz
);
/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system. The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist. This function returns 0 if the topology is
 * correct, and a non-zero error code for non-debug kernels if incorrect.
 * Asserts are spread throughout the code to aid in debugging on a DEBUG
 * kernel.
 */
2074 lpl_topo_verify(cpupart_t
*cpupart
)
2086 /* topology can't be incorrect if it doesn't exist */
2087 if (!lgrp_topo_initialized
|| !lgrp_initialized
)
2088 return (LPL_TOPO_CORRECT
);
2090 ASSERT(cpupart
!= NULL
);
2092 for (i
= 0; i
<= lgrp_alloc_max
; i
++) {
2093 lgrp
= lgrp_table
[i
];
2095 /* make sure lpls are allocated */
2096 ASSERT(cpupart
->cp_lgrploads
);
2097 if (!cpupart
->cp_lgrploads
)
2098 return (LPL_TOPO_PART_HAS_NO_LPL
);
2100 lpl
= &cpupart
->cp_lgrploads
[i
];
2101 /* make sure our index is good */
2102 ASSERT(i
< cpupart
->cp_nlgrploads
);
2104 /* if lgroup doesn't exist, make sure lpl is empty */
2105 if (!LGRP_EXISTS(lgrp
)) {
2106 ASSERT(lpl
->lpl_ncpu
== 0);
2107 if (lpl
->lpl_ncpu
> 0) {
2108 return (LPL_TOPO_CPUS_NOT_EMPTY
);
2114 /* verify that lgroup and lpl are identically numbered */
2115 ASSERT(lgrp
->lgrp_id
== lpl
->lpl_lgrpid
);
		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}

			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl; /* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}

		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent);
			ASSERT(lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/* only leaf lgroups keep a cpucnt, only check leaves */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list.  This only exists in
			 * leaves, but they should always match.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/* check next cpu */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t intersect;
				lgrp_t *lgrp_cand;
				lpl_t *lpl_cand;

				lgrp_cand = lgrp_table[j];
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand, lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}

		} else { /* non-leaf specific checks */

			/*
			 * Non-leaf lpls should have lpl_cpus == NULL
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf resources
			 * is equal to the total ncpu in the intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}

		/*
		 * Check the rset of the lpl in question.  Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set.  (Which would be resources somehow not
		 * accounted for.)
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained within the partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; j < lpl->lpl_nrset; j++) {
			if (lpl->lpl_rset[j] == NULL)
				break;
		}

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}
	}

	return (LPL_TOPO_CORRECT);
}
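
/*
 * Added illustration (not from the original source): a debug-time caller
 * could act on the verifier's return code roughly as follows; the consumer
 * shown here is only a sketch, the real callers live elsewhere.
 *
 *	int ret = lpl_topo_verify(cpupart);
 *	if (ret != LPL_TOPO_CORRECT)
 *		cmn_err(CE_PANIC, "lpl_topo_verify failed: %d", ret);
 */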

/*
 * Flatten lpl topology to given number of levels.  This is presently only
 * implemented for a flatten to 2 levels, which will prune out the intermediates
 * and home the leaf lpls to the root lpl.
 */
int
lpl_topo_flatten(int levels)
{
	int		i;
	uint_t		sum;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_root;
	cpupart_t	*cp;

	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted.  (And
				 * whose parent will soon be deleted).  Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}
		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	return (levels);
}
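
/*
 * Added illustration (not from the original source): flattening to two
 * levels turns a topology such as
 *
 *	root -> intermediate -> {leaf0, leaf1}
 *
 * into
 *
 *	root -> {leaf0, leaf1}
 *
 * with the intermediate lpls pruned and the leaves re-parented to the
 * root lpl.
 */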

/*
 * Insert a lpl into the resource hierarchy and create any additional lpls that
 * are necessary to represent the varying states of locality for the cpu
 * resources newly added to the partition.
 *
 * This routine is clever enough that it can correctly add resources from the
 * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
 * those for which the lpl is a leaf as opposed to simply a named equally local
 * resource).  The one special case that needs additional processing is when a
 * new intermediate lpl is introduced.  Since the main loop only traverses
 * looking to add the leaf resource where it does not yet exist, additional work
 * is necessary to add other leaf resources that may need to exist in the newly
 * created intermediate.  This is performed by the second inner loop, and is
 * only done when the check for more than one overlapping resource succeeds.
 */
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	lgrp_id_t	parent_id;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't insert if the lgrp isn't there, if the leaf isn't
		 * contained within the current lgrp, or if the current lgrp has
		 * no leaves in this partition
		 */
		if (!LGRP_EXISTS(lgrp_cur) ||
		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
		if (lgrp_cur->lgrp_parent != NULL) {
			/* if lgrp has a parent, assign it properly */
			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
			lpl_parent = &cpupart->cp_lgrploads[parent_id];
		} else {
			/* if not, make sure parent ptr gets set to null */
			lpl_parent = NULL;
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * Almost all leaf state was initialized elsewhere.  The
			 * only thing left to do is to set the parent.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * If so, figure out what lpls have resources that
			 * intersect this one, and add them.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_ismember(rset_intersect,
				    lgrp_cand->lgrp_id))
					continue;
				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
				lpl_rset_add(lpl_cur, lpl_cand);
			}
		}
		/*
		 * This lpl's rset has changed.  Update the hint in its
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}
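
/*
 * Added illustration (not from the original source): when leaf L joins a
 * partition, L ends up in its own rset, in the rset of every intermediate
 * lpl whose lgroup spans L, and in the root lpl's rset.  If a new
 * intermediate had to be created, the second inner loop above also pulls
 * any other leaves that the intermediate spans into its rset.
 */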

/*
 * remove a lpl from the hierarchy of resources, clearing its state when
 * finished.  If the lpls at the intermediate levels of the hierarchy have no
 * remaining resources, or no longer name a leaf resource in the cpu-partition,
 * delete them as well.
 */
void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	klgrpset_t	leaf_intersect;	/* intersection of leaves */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't attempt to remove from lgrps that aren't there, that
		 * don't contain our leaf, or from the leaf itself. (We do that
		 * later)
		 */
		if (!LGRP_EXISTS(lgrp_cur))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    (lpl_cur == lpl_leaf)) {
			continue;
		}

		/*
		 * This is a slightly sleazy simplification in that we have
		 * already marked the cp_lgrpset as no longer containing the
		 * leaf we've deleted.  Any lpls that pass the above checks
		 * based upon lgrp membership but not necessarily cpu-part
		 * membership also get cleared by the checks below.  Currently
		 * this is harmless, as the lpls should be empty anyway.
		 *
		 * In particular, we want to preserve lpls that have additional
		 * leaf resources, even though we don't yet have a processor
		 * architecture that represents resources this way.
		 */
		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset);

		lpl_rset_del(lpl_cur, lpl_leaf);
		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
			lpl_clear(lpl_cur);
		} else {
			/*
			 * Update this lpl's children
			 */
			lpl_child_update(lpl_cur, cpupart);
		}
	}
	lpl_clear(lpl_leaf);
}

/*
 * add a cpu to a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
 *
 * 1. A lpl structure that contains resources already in the hierarchy tree.
 * In this case, all of the associated lpl relationships have been defined, and
 * all that is necessary is that we link the new cpu into the per-lpl list of
 * cpus, and increment the ncpu count of all places where this cpu resource will
 * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
 * pushing is accomplished by this routine.
 *
 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
 * not exist yet.  In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
 * resources, if they should exist.  The leaf structure is initialized by this
 * routine, as is the cpu-partition state for the lgrp membership.  This routine
 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 * and builds all of the "ancestral" state necessary to identify resources at
 * differing levels of locality.
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lpl_t		*lpl_leaf;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	cpupart = cp->cpu_part;
	lgrp_leaf = lgrp_table[lgrpid];

	/* don't add non-existent lgrp */
	ASSERT(LGRP_EXISTS(lgrp_leaf));
	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
	cp->cpu_lpl = lpl_leaf;

	/* only leaf lpls contain cpus */
	if (lpl_leaf->lpl_ncpu++ == 0) {
		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
		lpl_leaf_insert(lpl_leaf, cpupart);
	} else {
		/*
		 * the lpl should already exist in the parent, so just update
		 * the count of available CPUs
		 */
		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
	}

	/* link cpu into list of cpus in lpl */
	if (lpl_leaf->lpl_cpus) {
		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
	} else {
		/*
		 * We increment ncpu immediately after we create a new leaf
		 * lpl, so assert that ncpu == 1 for the case where we don't
		 * have any cpu pointers yet.
		 */
		ASSERT(lpl_leaf->lpl_ncpu == 1);
		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
	}
}
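
/*
 * Added illustration (not from the original source): lpl_cpus heads a
 * circular, doubly-linked list threaded through cpu_next_lpl/cpu_prev_lpl.
 * After adding cpus A and B to an empty leaf, lpl_cpus points at A,
 * A->cpu_next_lpl == B, B->cpu_next_lpl == A, and the prev pointers mirror
 * this, so the list wraps around.
 */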

/*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf.  (Another cpu still exists at this level of locality).  In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the per-cpu lpl list.
 *
 * 2. Removal of the resource results in the lpl containing no resources.  (It's
 * empty)  In this case, all of what has occurred for the first step must take
 * place; however, additionally we must remove the lpl structure itself, prune
 * out any stranded lpls that do not directly name a leaf resource, and mark the
 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted.  Cpu-partition changes are handled by this
 * method, but the lpl_leaf_remove function deals with the details of pruning
 * out the empty lpl and any of its orphaned direct ancestors.
 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
	lpl_t		*lpl;
	lpl_t		*leaf_lpl;
	lgrp_t		*lgrp_leaf;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	lpl = leaf_lpl = cp->cpu_lpl;
	lgrp_leaf = leaf_lpl->lpl_lgrp;

	/* don't delete a leaf that isn't there */
	ASSERT(LGRP_EXISTS(lgrp_leaf));

	/* no double-deletes */
	ASSERT(lpl->lpl_ncpu);
	if (--lpl->lpl_ncpu == 0) {
		/*
		 * This was the last cpu in this lgroup for this partition,
		 * clear its bit in the partition's lgroup bitmask
		 */
		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

		/* eliminate remaining lpl link pointers in cpu, lpl */
		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
	} else {

		/* unlink cpu from lists of cpus in lpl */
		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
		if (lpl->lpl_cpus == cp) {
			lpl->lpl_cpus = cp->cpu_next_lpl;
		}

		/*
		 * Update the cpu count in the lpls associated with parent
		 * lgroups.
		 */
		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
	}
	/* clear cpu's lpl ptr when we're all done */
	cp->cpu_lpl = NULL;
}
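
/*
 * Added illustration (not from the original source): removing cpu B from
 * the two-cpu example given above simply splices it out of the circular
 * list (A's next and prev pointers refer back to A) and decrements
 * lpl_ncpu up the hierarchy; only removal of the last cpu tears the leaf
 * lpl itself down via lpl_leaf_remove().
 */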

/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request).  Since the clock thread can not be preempted (since it
 * runs at highest priority), we know that cpu partitions can not change
 * (since doing so would require either the repartition requester or the
 * cpu_pause thread to run on this cpu), so we can update the cpu's load
 * without grabbing cpu_lock.
 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t		ncpu;
	int64_t		old, new, f;

	/*
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	for (;;) {

		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu.  We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			do {
				old = new = lpl->lpl_loadavg;
				q = (old  >> 16) << 7;
				r = (old  & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}
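
/*
 * Added worked example (not from the original source), tracing one aging
 * update above for a previously idle lpl: with old == 0, both q and r are
 * 0, so a single tick adds ((nrcpus * f) >> 7).  For nrcpus == 1 and
 * ncpu == 1 (f == expval[1] == 3196) that first step raises lpl_loadavg
 * by 3196 >> 7 == 24; subsequent ticks add smaller corrections as q and r
 * grow, so the load decays toward an equilibrium rather than jumping to it.
 */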

/*
 * Initialize lpl topology in the target based on topology currently present in
 * lpl_bootstrap.
 *
 * lpl_topo_bootstrap() is only called once from cpupart_initialize_default() to
 * initialize cp_default's list of lpls. Up to this point all topology
 * operations were performed using lpl_bootstrap. Now cp_default has its own
 * list of lpls and all subsequent lpl operations should use it instead of
 * lpl_bootstrap. The `target' points to the list of lpls in cp_default and
 * `size' is the size of this list.
 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
 *
 * 1) Copies all fields from lpl_bootstrap to the target.
 *
 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 *
 * 3) Updates lpl_parent pointers to point to the lpls in the target list
 *    instead of lpl_bootstrap.
 *
 * 4) Updates pointers in the resource list of the target to point to the lpls
 *    in the target list instead of lpl_bootstrap.
 *
 * After lpl_topo_bootstrap() completes, target contains the same information
 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
 * this, so it is bzeroed.
 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
	lpl_t	*lpl = lpl_bootstrap;
	lpl_t	*target_lpl = target;
	lpl_t	**rset;
	int	*id2rset;
	int	sz;
	int	howmany;
	int	id;
	int	i;

	/*
	 * The only target that should be passed here is cp_default lpl list.
	 */
	ASSERT(target == cp_default.cp_lgrploads);
	ASSERT(size == cp_default.cp_nlgrploads);
	ASSERT(!lgrp_topo_initialized);
	ASSERT(ncpus == 1);

	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
		/*
		 * Copy all fields from lpl, except for the rset,
		 * lgrp id <=> rset mapping storage,
		 * and amount of storage
		 */
		rset = target_lpl->lpl_rset;
		id2rset = target_lpl->lpl_id2rset;
		sz = target_lpl->lpl_rset_sz;

		*target_lpl = *lpl;

		target_lpl->lpl_rset_sz = sz;
		target_lpl->lpl_rset = rset;
		target_lpl->lpl_id2rset = id2rset;

		/*
		 * Substitute CPU0 lpl pointer with one relative to target.
		 */
		if (lpl->lpl_cpus == CPU) {
			ASSERT(CPU->cpu_lpl == lpl);
			CPU->cpu_lpl = target_lpl;
		}

		/*
		 * Substitute parent information with parent relative to target.
		 */
		if (lpl->lpl_parent != NULL)
			target_lpl->lpl_parent = (lpl_t *)
			    (((uintptr_t)lpl->lpl_parent -
			    (uintptr_t)lpl_bootstrap) +
			    (uintptr_t)target);

		/*
		 * Walk over resource set substituting pointers relative to
		 * lpl_bootstrap's rset to pointers relative to target's.
		 */
		ASSERT(lpl->lpl_nrset <= 1);

		for (id = 0; id < lpl->lpl_nrset; id++) {
			if (lpl->lpl_rset[id] != NULL) {
				target_lpl->lpl_rset[id] = (lpl_t *)
				    (((uintptr_t)lpl->lpl_rset[id] -
				    (uintptr_t)lpl_bootstrap) +
				    (uintptr_t)target);
			}
			target_lpl->lpl_id2rset[id] =
			    lpl->lpl_id2rset[id];
		}
	}

	/*
	 * Clean up the bootstrap lpls since we have switched over to the
	 * actual lpl array in the default cpu partition.
	 *
	 * We still need to keep one empty lpl around for newly starting
	 * slave CPUs to reference should they need to make it through the
	 * dispatcher prior to their lgrp/lpl initialization.
	 *
	 * The lpl related dispatcher code has been designed to work properly
	 * (and without extra checks) for this special case of a zero'ed
	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
	 * with lgrpid 0 and an empty resource set. Iteration over the rset
	 * array by the dispatcher is also NULL terminated for this reason.
	 *
	 * This provides the desired behaviour for an uninitialized CPU.
	 * It shouldn't see any other CPU to either dispatch to or steal
	 * from until it is properly initialized.
	 */
	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));

	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
}
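
/*
 * Added worked example (not from the original source) of the pointer
 * relocation above, using made-up addresses: if lpl_bootstrap sits at
 * 0x1000, target at 0x8000, and lpl->lpl_parent points at 0x10c0, then the
 * rebased parent is (0x10c0 - 0x1000) + 0x8000 == 0x80c0, i.e. the same
 * offset into the target array.
 */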

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
#define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
	((lgrp_loadavg_tolerance) / ncpu)

/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup
 */
pgcnt_t	lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
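
/*
 * Added worked example (not from the original source): with the defaults
 * above and a 4-cpu lgroup, LGRP_EXPAND_PROC_THRESH(4) == 62250 / 4 ==
 * 15562 and LGRP_EXPAND_PROC_DIFF(4) == 60000 / 4 == 15000.  lgrp_choose()
 * below will spread a process onto a new lgroup only when its best current
 * lgroup's load exceeds the first figure and beats the remote candidate by
 * more than the second.
 */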

/*
 * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
 * be bound to a CPU or processor set.
 *
 * Arguments:
 *	t		The thread
 *	cpupart		The partition the thread belongs to.
 *
 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
 *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
 *	 partitions changing out from under us and assumes that given thread is
 *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
 *	 disabled, so don't grab any locks because we should never block under
 *	 those conditions.
 */
lpl_t *
lgrp_choose(kthread_t *t, cpupart_t *cpupart)
{
	lgrp_load_t	bestload, bestrload;
	int		lgrpid_offset, lgrp_count;
	lgrp_id_t	lgrpid, lgrpid_start;
	lpl_t		*lpl, *bestlpl, *bestrlpl;
	klgrpset_t	lgrpset;
	proc_t		*p;

	ASSERT(t != NULL);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));
	ASSERT(cpupart != NULL);

	p = ttoproc(t);

	/* A process should always be in an active partition */
	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));

	bestlpl = bestrlpl = NULL;
	bestload = bestrload = LGRP_LOADAVG_MAX;
	lgrpset = cpupart->cp_lgrpset;

	switch (lgrp_choose_policy) {
	case LGRP_CHOOSE_RR:
		lgrpid = cpupart->cp_lgrp_hint;
		do {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;
		} while (!klgrpset_ismember(lgrpset, lgrpid));

		break;
	default:
	case LGRP_CHOOSE_TIME:
	case LGRP_CHOOSE_RANDOM:
		klgrpset_nlgrps(lgrpset, lgrp_count);
		lgrpid_offset =
		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
		for (lgrpid = 0; ; lgrpid++) {
			if (klgrpset_ismember(lgrpset, lgrpid)) {
				if (--lgrpid_offset == 0)
					break;
			}
		}
		break;
	}

	lgrpid_start = lgrpid;

	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
	    lgrp_id_t, cpupart->cp_lgrp_hint);

	/*
	 * Use lgroup affinities (if any) to choose best lgroup
	 *
	 * NOTE: Assumes that thread is protected from going away and its
	 *	 lgroup affinities won't change (ie. p_lock, or
	 *	 thread_lock() being held and/or CPUs paused)
	 */
	if (t->t_lgrp_affinity) {
		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
		if (lpl != NULL)
			return (lpl);
	}

	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));

	do {
		pgcnt_t	npgs;

		/*
		 * Skip any lgroups outside of thread's pset
		 */
		if (!klgrpset_ismember(lgrpset, lgrpid)) {
			if (++lgrpid > lgrp_alloc_max)
				lgrpid = 0;	/* wrap the search */
			continue;
		}

		/*
		 * Skip any non-leaf lgroups
		 */
		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
			continue;

		/*
		 * Skip any lgroups without enough free memory
		 * (when threshold set to nonzero positive value)
		 */
		if (lgrp_mem_free_thresh > 0) {
			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
			if (npgs < lgrp_mem_free_thresh) {
				if (++lgrpid > lgrp_alloc_max)
					lgrpid = 0;	/* wrap the search */
				continue;
			}
		}

		lpl = &cpupart->cp_lgrploads[lgrpid];
		if (klgrpset_isempty(p->p_lgrpset) ||
		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
			/*
			 * Either this is a new process or the process already
			 * has threads on this lgrp, so this is a preferred
			 * lgroup for the thread.
			 */
			if (bestlpl == NULL ||
			    lpl_pick(lpl, bestlpl)) {
				bestload = lpl->lpl_loadavg;
				bestlpl = lpl;
			}
		} else {
			/*
			 * The process doesn't have any threads on this lgrp,
			 * but we're willing to consider this lgrp if the load
			 * difference is big enough to justify splitting up
			 * the process' threads.
			 */
			if (bestrlpl == NULL ||
			    lpl_pick(lpl, bestrlpl)) {
				bestrload = lpl->lpl_loadavg;
				bestrlpl = lpl;
			}
		}
		if (++lgrpid > lgrp_alloc_max)
			lgrpid = 0;	/* wrap the search */
	} while (lgrpid != lgrpid_start);

	/*
	 * Return root lgroup if threshold isn't set to maximum value and
	 * lowest lgroup load average more than a certain threshold
	 */
	if (lgrp_load_thresh != UINT32_MAX &&
	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);

	/*
	 * If all the lgroups over which the thread's process is spread are
	 * heavily loaded, or otherwise undesirable, we'll consider placing
	 * the thread on one of the other leaf lgroups in the thread's
	 * partition.
	 */
	if ((bestlpl == NULL) ||
	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
	    (bestrload < bestload) &&	/* paranoid about wraparound */
	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
	    bestload))) {
		bestlpl = bestrlpl;
	}

	if (bestlpl == NULL) {
		/*
		 * No lgroup looked particularly good, but we still
		 * have to pick something. Go with the randomly selected
		 * legal lgroup we started with above.
		 */
		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
	}

	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
	bestlpl->lpl_homed_time = gethrtime_unscaled();

	ASSERT(bestlpl->lpl_ncpu > 0);
	return (bestlpl);
}
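
/*
 * Added worked example (not from the original source) of how the search
 * start is picked for the TIME/RANDOM policies above: with 3 lgroups in
 * the partition, lgrpid_offset becomes ((gethrtime() >> 4) % 3) + 1, i.e.
 * 1, 2, or 3, and the scan then stops at the 1st, 2nd, or 3rd member of
 * cp_lgrpset, making each member a possible starting point.
 */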

/*
 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
 */
static int
lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
{
	lgrp_load_t	l1, l2;
	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);

	l1 = lpl1->lpl_loadavg;
	l2 = lpl2->lpl_loadavg;

	if ((l1 + tolerance < l2) && (l1 < l2)) {
		/* lpl1 is significantly less loaded than lpl2 */
		return (1);
	}

	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
	    l1 + tolerance >= l2 && l1 < l2 &&
	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
		/*
		 * lpl1's load is within the tolerance of lpl2.  We're
		 * willing to consider it to be better, however, if
		 * it has been longer since we last homed a thread there
		 */
		return (1);
	}

	return (0);
}
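
/*
 * Added worked example (not from the original source): with a tolerance of
 * 0x4000, a candidate at load 0x10000 clearly beats one at 0x18000 (first
 * test above); at loads 0x10000 vs 0x12000 the two are "close", so under
 * LGRP_CHOOSE_TIME the tie goes to whichever lpl has the older
 * lpl_homed_time.
 */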

/*
 * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of a
 * process that uses text replication changed home lgrp. This info is used by
 * the segvn asynchronous thread to detect if it needs to recheck what lgrps
 * should be used for text replication.
 */
static uint64_t lgrp_trthr_moves = 0;

uint64_t
lgrp_get_trthr_migrations(void)
{
	return (lgrp_trthr_moves);
}

void
lgrp_update_trthr_migrations(uint64_t incr)
{
	atomic_add_64(&lgrp_trthr_moves, incr);
}

/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t	lgrp_min_nsec = LGRP_MIN_NSEC;

/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Locking:
 * The following lock must be held on entry:
 *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 *		doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t		*p;
	lpl_t		*lpl, *oldlpl;
	lgrp_id_t	oldid;
	kthread_t	*tp;
	uint_t		ncpu;
	lgrp_load_t	old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			lpl = oldlpl;
			for (;;) {
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (atomic_cas_32(
				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
				    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		t->t_anttime = gethrtime();
	}
}
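
/*
 * Added illustration (not from the original source): the anticipatory load
 * added above is symmetric with the removal earlier in this routine.  A
 * thread homed to lgroup A and rehomed to B within lgrp_min_nsec has
 * LGRP_LOADAVG_MAX_EFFECT(ncpu) subtracted back out of A's lpl chain and
 * added into B's, so rapid rehoming does not leave stale load behind.
 */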

/*
 * Return lgroup memory allocation policy given advice from madvise(3C)
 */
lgrp_mem_policy_t
lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
{
	switch (advice) {
	case MADV_ACCESS_LWP:
		return (LGRP_MEM_POLICY_NEXT);
	case MADV_ACCESS_MANY:
		return (LGRP_MEM_POLICY_RANDOM);
	default:
		return (lgrp_mem_policy_default(size, type));
	}
}
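
/*
 * Added usage sketch (not from the original source): from userland the
 * mapping above corresponds to madvise(3C) advice on a mapping, e.g.
 *
 *	madvise(addr, len, MADV_ACCESS_LWP);	next-touch placement
 *	madvise(addr, len, MADV_ACCESS_MANY);	random placement
 *
 * Any other advice falls back to the size/type based default policy.
 */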

/*
 * Figure out default policy
 */
lgrp_mem_policy_t
lgrp_mem_policy_default(size_t size, int type)
{
	cpupart_t		*cp;
	lgrp_mem_policy_t	policy;
	size_t			pset_mem_size;

	/*
	 * Randomly allocate memory across lgroups for shared memory
	 * beyond a certain threshold
	 */
	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
		/*
		 * Get total memory size of current thread's pset
		 */
		kpreempt_disable();
		cp = curthread->t_cpupart;
		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
		kpreempt_enable();

		/*
		 * Choose policy to randomly allocate memory across
		 * lgroups in pset if it will fit and is not default
		 * partition.  Otherwise, allocate memory randomly
		 * across machine.
		 */
		if (lgrp_mem_pset_aware && size < pset_mem_size)
			policy = LGRP_MEM_POLICY_RANDOM_PSET;
		else
			policy = LGRP_MEM_POLICY_RANDOM;
	} else
		/*
		 * Apply default policy for private memory and
		 * shared memory under the respective random
		 * threshold.
		 */
		policy = lgrp_mem_default_policy;

	return (policy);
}

/*
 * Get memory allocation policy for this segment
 */
lgrp_mem_policy_info_t *
lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
{
	return (segop_getpolicy(seg, vaddr));
}

/*
 * Set policy for allocating private memory given desired policy, policy info,
 * and size in bytes of memory that the policy is being applied to.
 * Return 0 if policy wasn't set already and 1 if policy was set already
 */
int
lgrp_privm_policy_set(lgrp_mem_policy_t policy,
    lgrp_mem_policy_info_t *policy_info, size_t size)
{

	ASSERT(policy_info != NULL);

	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);

	/*
	 * Policy set already?
	 */
	if (policy == policy_info->mem_policy)
		return (1);

	/*
	 * Set policy
	 */
	policy_info->mem_policy = policy;
	policy_info->mem_lgrpid = LGRP_NONE;

	return (0);
}

/*
 * Get shared memory allocation policy with given tree and offset
 */
lgrp_mem_policy_info_t *
lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
    uoff_t vn_off)
{
	uoff_t			off;
	lgrp_mem_policy_info_t	*policy_info;
	lgrp_shm_policy_seg_t	*policy_seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	/*
	 * Get policy segment tree from anon_map or vnode and use specified
	 * anon index or vnode offset as offset
	 *
	 * Assume that no lock needs to be held on anon_map or vnode, since
	 * they should be protected by their reference count which must be
	 * nonzero for an existing segment
	 */
	if (amp) {
		ASSERT(amp->refcnt != 0);
		shm_locality = amp->locality;
		if (shm_locality == NULL)
			return (NULL);
		tree = shm_locality->loc_tree;
		off = ptob(anon_index);
	} else if (vp) {
		shm_locality = vp->v_locality;
		if (shm_locality == NULL)
			return (NULL);
		ASSERT(shm_locality->loc_count != 0);
		tree = shm_locality->loc_tree;
		off = vn_off;
	}

	if (tree == NULL)
		return (NULL);

	/*
	 * Lookup policy segment for offset into shared object and return
	 * policy info
	 */
	rw_enter(&shm_locality->loc_lock, RW_READER);
	policy_info = NULL;
	policy_seg = avl_find(tree, &off, &where);
	if (policy_seg)
		policy_info = &policy_seg->shm_policy;
	rw_exit(&shm_locality->loc_lock);

	return (policy_info);
}

/*
 * Default memory allocation policy for kernel segmap pages
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;

/*
 * Return lgroup to use for allocating memory
 * given the segment and address
 *
 * There isn't any mutual exclusion that exists between calls
 * to this routine and DR, so this routine and whomever calls it
 * should be mindful of the possibility that the lgrp returned
 * may be deleted. If this happens, dereferences of the lgrp
 * pointer will still be safe, but the resources in the lgrp will
 * be gone, and LGRP_EXISTS() will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int			i;
	lgrp_t			*lgrp;
	klgrpset_t		lgrpset;
	int			lgrps_spanned;
	unsigned long		off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t		random;
	int			stat = 0;
	extern struct seg	*segkmap;

	/*
	 * Just return null if the lgrp framework hasn't finished
	 * initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL) {
				policy = policy_info->mem_policy;
				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
					lgrp_id_t id = policy_info->mem_lgrpid;
					ASSERT(id != LGRP_NONE);
					ASSERT(id < NLGRPS_MAX);
					lgrp = lgrp_table[id];
					if (!LGRP_EXISTS(lgrp)) {
						policy = LGRP_MEM_POLICY_NEXT;
					} else {
						lgrp_stat_add(id,
						    LGRP_NUM_NEXT_SEG, 1);
						return (lgrp);
					}
				}
			}
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}

/*
 * Return the number of pages in an lgroup
 *
 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
 *	 could cause tests that rely on the numat driver to fail....
 */
pgcnt_t
lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
{
	lgrp_t *lgrp;

	/* Return 0 for non-existent lgroups or ones without memory */
	lgrp = lgrp_table[lgrpid];
	if (!LGRP_EXISTS(lgrp) ||
	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
		return (0);

	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
}

/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * Initialize locality field in anon_map
	 * Don't need any locks because this is called when anon_map is
	 * allocated, but not used anywhere yet.
	 */
	if (amp) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->locality == NULL) {
			/*
			 * Allocate and initialize shared memory locality info
			 * and set anon_map locality pointer to it
			 * Drop lock across kmem_alloc(KM_SLEEP)
			 */
			ANON_LOCK_EXIT(&amp->a_rwlock);
			shm_locality = kmem_alloc(sizeof (*shm_locality),
			    KM_SLEEP);
			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
			    NULL);
			shm_locality->loc_count = 1;	/* not used for amp */
			shm_locality->loc_tree = NULL;

			/*
			 * Reacquire lock and check to see whether anyone beat
			 * us to initializing the locality info
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->locality != NULL) {
				rw_destroy(&shm_locality->loc_lock);
				kmem_free(shm_locality,
				    sizeof (*shm_locality));
			} else
				amp->locality = shm_locality;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return;
	}

	/*
	 * Allocate shared vnode policy info if vnode is not locality aware yet
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & V_LOCALITY) == 0) {
		/*
		 * Allocate and initialize shared memory locality info
		 */
		mutex_exit(&vp->v_lock);
		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
		shm_locality->loc_count = 1;
		shm_locality->loc_tree = NULL;

		/*
		 * Point vnode locality field at shared vnode policy info
		 * and set locality aware flag in vnode
		 */
		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & V_LOCALITY) == 0) {
			vp->v_locality = shm_locality;
			vp->v_flag |= V_LOCALITY;
		} else {
			/*
			 * Lost race so free locality info and increment count.
			 */
			rw_destroy(&shm_locality->loc_lock);
			kmem_free(shm_locality, sizeof (*shm_locality));
			shm_locality = vp->v_locality;
			shm_locality->loc_count++;
		}
		mutex_exit(&vp->v_lock);

		return;
	}

	/*
	 * Increment reference count of number of segments mapping this vnode
	 * shared
	 */
	shm_locality = vp->v_locality;
	shm_locality->loc_count++;
	mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
static void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t	*cur;
	lgrp_shm_policy_seg_t	*next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = NULL;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = NULL;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}

/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    uoff_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
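
/*
 * Added worked example (not from the original source): splitting a segment
 * covering [0x0, 0x6000) at off == 0x4000 shrinks the original to
 * [0x0, 0x4000) and returns a new right-hand segment covering
 * [0x4000, 0x6000) with the same policy; a split at either end is a no-op
 * that just returns the existing segment.
 */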

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, uoff_t vn_off, size_t len)
{
	uoff_t			eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	uoff_t			off;
	uoff_t			oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now.  Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_lgrpid = LGRP_NONE;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_lgrpid = LGRP_NONE;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
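
/*
 * Added usage sketch (not from the original source; the field setup shown
 * here is only illustrative): a caller keeps the cookie on its stack,
 * zeroes it, fills in the starting lgroup and search scope, and then pulls
 * memnodes until -1 is returned:
 *
 *	lgrp_mnode_cookie_t c;
 *	int mnode;
 *
 *	bzero(&c, sizeof (c));
 *	c.lmc_lgrp = lgrp;
 *	c.lmc_nodes = lgrp->lgrp_mnodes;
 *	c.lmc_cnt = lgrp->lgrp_nmnodes;
 *	c.lmc_scope = LGRP_SRCH_HIER;
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1)
 *		... try allocating from mnode ...
 */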