4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <sys/types.h>
28 #include <sys/sysmacros.h>
29 #include <sys/machsystm.h>
30 #include <sys/machparam.h>
31 #include <sys/cmn_err.h>
33 #include <sys/mach_descrip.h>
34 #include <sys/memnode.h>
35 #include <sys/mdesc.h>
38 #include <vm/vm_dep.h>
39 #include <vm/hat_sfmmu.h>
40 #include <sys/promif.h>
43 * MPO and the sun4v memory representation
44 * ---------------------------------------
46 * Latency groups are defined in the sun4v architecture by memory-latency-group
47 * nodes in the Machine Description, as specified in FWARC/2007/260. These
48 * tie together cpu nodes and mblock nodes, and contain mask and match
49 * properties that identify the portion of an mblock that belongs to the
50 * lgroup. Mask and match are defined in the Physical Address (PA) space,
51 * but an mblock defines Real Addresses (RA). To translate, the mblock
52 * includes the property address-congruence-offset, hereafter referred to as
53 * ra_to_pa. A real address ra is a member of an lgroup if
55 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
57 * The MD is traversed, and information on all mblocks is kept in the array
58 * mpo_mblock[]. Information on all CPUs, including which lgroup they map
59 * to, is kept in the array mpo_cpu[].
61 * This implementation makes (and verifies) the simplifying assumption that
62 * the mask bits are the same for all defined lgroups, and that all 1 bits in
63 * the mask are contiguous. Thus the number of lgroups is bounded by the
64 * number of possible mask values, and the lgrp_handle_t is defined as the
65 * mask value, shifted right to eliminate the 0 bit positions in mask. The
66 * masks and values are also referred to as "home bits" in the code.
68 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71 * home bits. This yields the mem_node.
76 * This file exports the following entry points:
79 * plat_build_mem_nodes()
80 * plat_lgrp_cpu_to_hand()
82 * plat_pfn_to_mem_node()
83 * These implement the usual platform lgroup interfaces.
85 * plat_rapfn_to_papfn()
86 * Recover the PA page coloring bits from an RA.
88 * plat_mem_node_iterator_init()
89 * Initialize an iterator to efficiently step through pages in a mem_node.
91 * plat_mem_node_intersect_range()
92 * Find the intersection with a mem_node.
96 * Platform hooks to add/delete a pfn range.
98 * Internal Organization
99 * ---------------------
101 * A number of routines are used by both boot and DR code to (re)build
102 * appropriate MPO structures.
105 * Allocate memory for mblocks and stripes as
106 * appropriate for boot or memory DR.
109 * Free memory allocated by mblock_alloc.
112 * Build mblocks based on mblock nodes read from the MD.
114 * mblock_update_add()
115 * Rebuild mblocks after a memory DR add operation.
117 * mblock_update_del()
118 * Rebuild mblocks after a memory DR delete operation.
121 * Install mblocks as the new configuration.
124 * Build stripes based on mblocks.
127 * Call memnode layer to add/del a pfn range, based on stripes.
129 * The platform interfaces allocate all memory required for the
130 * particular update first, block access to the MPO structures
131 * while they are updated, and free old structures after the update.
134 int sun4v_mpo_enable
= 1;
135 int sun4v_mpo_debug
= 0;
136 char sun4v_mpo_status
[256] = "";
138 /* Save CPU info from the MD and associate CPUs with lgroups */
139 static struct cpu_md mpo_cpu
[NCPU
];
141 /* Save lgroup info from the MD */
142 #define MAX_MD_LGROUPS 32
143 static struct lgrp_md mpo_lgroup
[MAX_MD_LGROUPS
];
144 static int n_lgrpnodes
= 0;
145 static int n_locality_groups
= 0;
146 static int max_locality_groups
= 0;
147 static int szc_mask0
= 0;
149 /* Save mblocks from the MD */
150 #define SMALL_MBLOCKS_COUNT 8
151 static struct mblock_md
*mpo_mblock
;
152 static struct mblock_md small_mpo_mblocks
[SMALL_MBLOCKS_COUNT
];
153 static int n_mblocks
= 0;
155 /* Save mem_node stripes calculated from mblocks and lgroups. */
156 static mem_stripe_t
*mem_stripes
;
157 static mem_stripe_t small_mem_stripes
[SMALL_MBLOCKS_COUNT
* MAX_MEM_NODES
];
158 static int n_mem_stripes
= 0;
159 static pfn_t mnode_stride
; /* distance between stripes, start to start */
160 static int stripe_shift
; /* stride/stripes expressed as a shift */
161 static pfn_t mnode_pages
; /* mem_node stripe width */
163 /* Save home mask and shift used to calculate lgrp_handle_t values */
164 static uint64_t home_mask
= 0;
165 static pfn_t home_mask_pfn
= 0;
166 static int home_mask_shift
= 0;
167 static uint_t home_mask_pfn_shift
= 0;
169 /* Save lowest and highest latencies found across all lgroups */
170 static int lower_latency
= 0;
171 static int higher_latency
= 0;
173 static pfn_t base_ra_to_pa_pfn
= 0; /* ra_to_pa for single mblock memory */
174 static int mpo_genid
; /* config gen; updated by mem DR */
175 static mpo_config_t mpo_config
; /* current mblocks and stripes */
177 typedef enum { U_ADD
, U_ADD_ALL
, U_DEL
} update_t
;
179 static int valid_pages(md_t
*md
, mde_cookie_t cpu0
);
180 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset
);
181 static int fix_interleave(void);
183 static int mblock_alloc(mpo_config_t
*, update_t
, int nmblocks
);
184 static void mblock_install(mpo_config_t
*);
185 static void mblock_free(mpo_config_t
*);
186 static void mblock_update(mpo_config_t
*, md_t
, mde_cookie_t
*mblocknodes
);
187 static void mblock_update_add(mpo_config_t
*);
188 static void mblock_update_del(mpo_config_t
*, mpo_config_t
*, pfn_t
, pfn_t
);
189 static void mstripe_update(mpo_config_t
*);
190 static void mnode_update(mpo_config_t
*, pfn_t
, pfn_t
, update_t
);
193 #if defined(DEBUG) && !defined(lint)
194 #define VALIDATE_SLICE(base, end) { \
195 ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \
196 ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \
198 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
200 #define VALIDATE_SLICE(base, end)
201 #define MPO_DEBUG(...)
204 /* Record status message, viewable from mdb */
205 #define MPO_STATUS(args...) { \
206 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
207 MPO_DEBUG(sun4v_mpo_status); \
211 * The MPO locks are to protect the MPO metadata while that
212 * information is updated as a result of a memory DR operation.
213 * The read lock must be acquired to read the metadata and the
214 * write locks must be acquired to update it.
216 #define mpo_rd_lock kpreempt_disable
217 #define mpo_rd_unlock kpreempt_enable
222 mutex_enter(&cpu_lock
);
223 pause_cpus(NULL
, NULL
);
224 mutex_exit(&cpu_lock
);
230 mutex_enter(&cpu_lock
);
232 mutex_exit(&cpu_lock
);
236 * Routine to read a uint64_t from a given md
239 get_int(md_t md
, mde_cookie_t node
, char *propname
, uint64_t *val
)
241 int err
= md_get_prop_val(md
, node
, propname
, val
);
246 mblock_cmp(const void *a
, const void *b
)
248 struct mblock_md
*m1
= (struct mblock_md
*)a
;
249 struct mblock_md
*m2
= (struct mblock_md
*)b
;
251 if (m1
->base
< m2
->base
)
253 else if (m1
->base
== m2
->base
)
260 mblock_sort(struct mblock_md
*mblocks
, int n
)
262 extern void qsort(void *, size_t, size_t,
263 int (*)(const void *, const void *));
265 qsort(mblocks
, n
, sizeof (mblocks
[0]), mblock_cmp
);
269 mpo_update_tunables(void)
274 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275 * this process is currently running on before considering
276 * expanding threads to another lgroup.
278 * lgrp_expand_proc_diff determines how much less the remote lgroup
279 * must be loaded before expanding to it.
281 * On sun4v CMT processors, threads share a core pipeline, and
282 * at less than 100% utilization, best throughput is obtained by
283 * spreading threads across more cores, even if some are in a
284 * different lgroup. Spread threads to a new lgroup if the
285 * current group is more than 50% loaded. Because of virtualization,
286 * lgroups may have different numbers of CPUs, but the tunables
287 * apply to all lgroups, so find the smallest lgroup and compute
292 for (i
= 0; i
< n_lgrpnodes
; i
++) {
293 int ncpu
= mpo_lgroup
[i
].ncpu
;
294 if (ncpu
!= 0 && ncpu
< ncpu_min
)
297 lgrp_expand_proc_thresh
= ncpu_min
* lgrp_loadavg_max_effect
/ 2;
299 /* new home may only be half as loaded as the existing home to use it */
300 lgrp_expand_proc_diff
= lgrp_expand_proc_thresh
/ 2;
302 lgrp_loadavg_tolerance
= lgrp_loadavg_max_effect
;
306 cpuid_to_cpunode(md_t
*md
, int cpuid
)
308 mde_cookie_t rootnode
, foundnode
, *cpunodes
;
313 return (MDE_INVAL_ELEM_COOKIE
);
315 rootnode
= md_root_node(md
);
316 if (rootnode
== MDE_INVAL_ELEM_COOKIE
)
317 return (MDE_INVAL_ELEM_COOKIE
);
319 n_cpunodes
= md_alloc_scan_dag(md
, rootnode
, PROP_LG_CPU
,
321 if (n_cpunodes
<= 0 || n_cpunodes
> NCPU
)
324 for (i
= 0; i
< n_cpunodes
; i
++) {
325 if (md_get_prop_val(md
, cpunodes
[i
], PROP_LG_CPU_ID
,
328 if (cpuid_prop
== (uint64_t)cpuid
) {
329 foundnode
= cpunodes
[i
];
330 md_free_scan_dag(md
, &cpunodes
);
336 md_free_scan_dag(md
, &cpunodes
);
337 return (MDE_INVAL_ELEM_COOKIE
);
341 mpo_cpu_to_lgroup(md_t
*md
, mde_cookie_t cpunode
)
344 uint64_t latency
, lowest_latency
;
345 uint64_t address_match
, lowest_address_match
;
346 int n_lgroups
, j
, result
= 0;
348 /* Find lgroup nodes reachable from this cpu */
349 n_lgroups
= md_alloc_scan_dag(md
, cpunode
, PROP_LG_MEM_LG
,
352 lowest_latency
= ~(0UL);
354 /* Find the lgroup node with the smallest latency */
355 for (j
= 0; j
< n_lgroups
; j
++) {
356 result
= get_int(md
, nodes
[j
], PROP_LG_LATENCY
,
358 result
|= get_int(md
, nodes
[j
], PROP_LG_MATCH
,
364 if (latency
< lowest_latency
) {
365 lowest_latency
= latency
;
366 lowest_address_match
= address_match
;
369 for (j
= 0; j
< n_lgrpnodes
; j
++) {
370 if ((mpo_lgroup
[j
].latency
== lowest_latency
) &&
371 (mpo_lgroup
[j
].addr_match
== lowest_address_match
))
374 if (j
== n_lgrpnodes
)
379 md_free_scan_dag(md
, &nodes
);
383 /* Called when DR'ing in a CPU */
385 mpo_cpu_add(md_t
*md
, int cpuid
)
387 mde_cookie_t cpunode
;
391 if (n_lgrpnodes
<= 0)
397 cpunode
= cpuid_to_cpunode(md
, cpuid
);
398 if (cpunode
== MDE_INVAL_ELEM_COOKIE
)
401 i
= mpo_cpu_to_lgroup(md
, cpunode
);
405 mpo_cpu
[cpuid
].lgrp_index
= i
;
406 mpo_cpu
[cpuid
].home
= mpo_lgroup
[i
].addr_match
>> home_mask_shift
;
407 mpo_lgroup
[i
].ncpu
++;
408 mpo_update_tunables();
411 panic("mpo_cpu_add: Cannot read MD");
414 /* Called when DR'ing out a CPU */
416 mpo_cpu_remove(int cpuid
)
420 if (n_lgrpnodes
<= 0)
423 i
= mpo_cpu
[cpuid
].lgrp_index
;
424 mpo_lgroup
[i
].ncpu
--;
425 mpo_cpu
[cpuid
].home
= 0;
426 mpo_cpu
[cpuid
].lgrp_index
= -1;
427 mpo_update_tunables();
431 md_get_root(md_t
*md
)
433 mde_cookie_t root
= MDE_INVAL_ELEM_COOKIE
;
436 n_nodes
= md_node_count(md
);
439 MPO_STATUS("md_get_root: No nodes in node count\n");
443 root
= md_root_node(md
);
445 if (root
== MDE_INVAL_ELEM_COOKIE
) {
446 MPO_STATUS("md_get_root: Root node is missing\n");
450 MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes
);
451 MPO_DEBUG("md_get_root: md: %p\n", md
);
452 MPO_DEBUG("md_get_root: root: %lx\n", root
);
458 lgrp_update(md_t
*md
, mde_cookie_t root
)
463 mde_cookie_t
*nodes
, *lgrpnodes
;
465 n_lgrpnodes
= md_alloc_scan_dag(md
, root
, PROP_LG_MEM_LG
,
468 if (n_lgrpnodes
<= 0 || n_lgrpnodes
>= MAX_MD_LGROUPS
) {
469 MPO_STATUS("lgrp_update: No Lgroups\n");
474 MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes
);
476 for (i
= 0; i
< n_lgrpnodes
; i
++) {
477 mpo_lgroup
[i
].node
= lgrpnodes
[i
];
478 mpo_lgroup
[i
].id
= i
;
479 mpo_lgroup
[i
].ncpu
= 0;
480 result
= get_int(md
, lgrpnodes
[i
], PROP_LG_MASK
,
481 &mpo_lgroup
[i
].addr_mask
);
482 result
|= get_int(md
, lgrpnodes
[i
], PROP_LG_MATCH
,
483 &mpo_lgroup
[i
].addr_match
);
486 * If either the mask or match properties are missing, set to 0
489 mpo_lgroup
[i
].addr_mask
= 0;
490 mpo_lgroup
[i
].addr_match
= 0;
493 /* Set latency to 0 if property not present */
495 result
= get_int(md
, lgrpnodes
[i
], PROP_LG_LATENCY
,
496 &mpo_lgroup
[i
].latency
);
498 mpo_lgroup
[i
].latency
= 0;
502 * Sub-page level interleave is not yet supported. Check for it,
503 * and remove sub-page interleaved lgroups from mpo_lgroup and
504 * n_lgrpnodes. If no lgroups are left, return.
507 sub_page_fix
= fix_interleave();
508 if (n_lgrpnodes
== 0) {
513 /* Ensure that all of the addr_mask values are the same */
515 for (i
= 0; i
< n_lgrpnodes
; i
++) {
516 if (mpo_lgroup
[0].addr_mask
!= mpo_lgroup
[i
].addr_mask
) {
517 MPO_STATUS("lgrp_update: "
518 "addr_mask values are not the same\n");
525 * Ensure that all lgrp nodes see all the mblocks. However, if
526 * sub-page interleave is being fixed, they do not, so skip
530 if (sub_page_fix
== 0) {
531 for (i
= 0; i
< n_lgrpnodes
; i
++) {
532 j
= md_alloc_scan_dag(md
, mpo_lgroup
[i
].node
,
533 PROP_LG_MBLOCK
, "fwd", &nodes
);
534 md_free_scan_dag(md
, &nodes
);
535 if (j
!= n_mblocks
) {
536 MPO_STATUS("lgrp_update: "
537 "sub-page interleave is being fixed\n");
544 if (n_lgrpnodes
> 0) {
545 md_free_scan_dag(md
, &lgrpnodes
);
546 for (i
= 0; i
< n_lgrpnodes
; i
++)
547 mpo_lgroup
[i
].node
= MDE_INVAL_ELEM_COOKIE
;
555 * Traverse the MD to determine:
557 * Number of CPU nodes, lgrp_nodes, and mblocks
558 * Then for each lgrp_node, obtain the appropriate data.
559 * For each CPU, determine its home locality and store it.
560 * For each mblock, retrieve its data and store it.
563 lgrp_traverse(md_t
*md
)
565 mde_cookie_t root
, *cpunodes
, *mblocknodes
;
567 uint64_t i
, k
, stripe
, stride
;
568 uint64_t mem_lg_homeset
= 0;
572 mpo_config_t new_config
;
574 if ((root
= md_get_root(md
)) == MDE_INVAL_ELEM_COOKIE
) {
579 n_mblocks
= md_alloc_scan_dag(md
, root
, PROP_LG_MBLOCK
, "fwd",
581 if (n_mblocks
<= 0) {
582 MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
589 * Build the Memory Nodes. Do this before any possibility of
590 * bailing from this routine so we obtain ra_to_pa (needed for page
591 * coloring) even when there are no lgroups defined.
593 if (mblock_alloc(&new_config
, U_ADD_ALL
, n_mblocks
) < 0) {
598 mblock_update(&new_config
, md
, mblocknodes
);
599 mblock_install(&new_config
);
601 /* Page coloring hook is required so we can iterate through mnodes */
602 if (&page_next_pfn_for_color_cpu
== NULL
) {
603 MPO_STATUS("lgrp_traverse: No page coloring support\n");
608 /* Global enable for mpo */
609 if (sun4v_mpo_enable
== 0) {
610 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
615 n_cpunodes
= md_alloc_scan_dag(md
, root
, PROP_LG_CPU
, "fwd", &cpunodes
);
617 if (n_cpunodes
<= 0 || n_cpunodes
> NCPU
) {
618 MPO_STATUS("lgrp_traverse: No CPU nodes detected "
624 MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes
);
626 if ((ret_val
= lgrp_update(md
, root
)) == -1)
630 * Use the address mask from the first lgroup node
631 * to establish our home_mask.
633 home_mask
= mpo_lgroup
[0].addr_mask
;
634 home_mask_pfn
= btop(home_mask
);
635 home_mask_shift
= lowbit(home_mask
) - 1;
636 home_mask_pfn_shift
= home_mask_shift
- PAGESHIFT
;
637 mnode_pages
= btop(1ULL << home_mask_shift
);
640 * How many values are possible in home mask? Assume the mask
641 * bits are contiguous.
643 max_locality_groups
=
644 1 << highbit(home_mask_pfn
>> home_mask_pfn_shift
);
646 stripe_shift
= highbit(max_locality_groups
) - 1;
647 stripe
= ptob(mnode_pages
);
648 stride
= max_locality_groups
* stripe
;
649 mnode_stride
= btop(stride
);
651 /* Now verify the home mask bits are contiguous */
653 if (max_locality_groups
- 1 != home_mask_pfn
>> home_mask_pfn_shift
) {
654 MPO_STATUS("lgrp_traverse: "
655 "home mask bits are not contiguous\n");
660 /* Record all of the home bits */
662 for (i
= 0; i
< n_lgrpnodes
; i
++) {
663 HOMESET_ADD(mem_lg_homeset
,
664 mpo_lgroup
[i
].addr_match
>> home_mask_shift
);
667 /* Count the number of different "home" mem_lg's we've discovered */
669 n_locality_groups
= unique_home_mem_lg_count(mem_lg_homeset
);
671 /* If we have only 1 locality group then we can exit */
672 if (n_locality_groups
== 1) {
673 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
679 * Set the latencies. A CPU's lgroup is defined by the lowest
680 * latency found. All other memory is considered remote, and the
681 * remote latency is represented by the highest latency found.
682 * Thus hierarchical lgroups, if any, are approximated by a
685 * The Solaris MPO framework by convention wants to see latencies
686 * in units of nano-sec/10. In the MD, the units are defined to be
690 lower_latency
= mpo_lgroup
[0].latency
;
691 higher_latency
= mpo_lgroup
[0].latency
;
693 for (i
= 1; i
< n_lgrpnodes
; i
++) {
694 if (mpo_lgroup
[i
].latency
< lower_latency
) {
695 lower_latency
= mpo_lgroup
[i
].latency
;
697 if (mpo_lgroup
[i
].latency
> higher_latency
) {
698 higher_latency
= mpo_lgroup
[i
].latency
;
701 lower_latency
/= 10000;
702 higher_latency
/= 10000;
704 /* Clear our CPU data */
706 for (i
= 0; i
< NCPU
; i
++) {
708 mpo_cpu
[i
].lgrp_index
= -1;
711 /* Build the CPU nodes */
712 for (i
= 0; i
< n_cpunodes
; i
++) {
714 /* Read in the lgroup nodes */
715 result
= get_int(md
, cpunodes
[i
], PROP_LG_CPU_ID
, &k
);
717 MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
722 o
= mpo_cpu_to_lgroup(md
, cpunodes
[i
]);
727 mpo_cpu
[k
].lgrp_index
= o
;
728 mpo_cpu
[k
].home
= mpo_lgroup
[o
].addr_match
>> home_mask_shift
;
729 mpo_lgroup
[o
].ncpu
++;
731 /* Validate that no large pages cross mnode boundaries. */
732 if (valid_pages(md
, cpunodes
[0]) == 0) {
739 md_free_scan_dag(md
, &cpunodes
);
741 md_free_scan_dag(md
, &mblocknodes
);
743 panic("lgrp_traverse: No memory blocks found");
746 MPO_STATUS("MPO feature is enabled.\n");
748 sun4v_mpo_enable
= 0; /* set this for DR */
754 * Determine the number of unique mem_lg's present in our system
757 unique_home_mem_lg_count(uint64_t mem_lg_homeset
)
763 * Scan the "home" bits of the mem_lgs, count
764 * the number that are unique.
767 for (homeid
= 0; homeid
< NLGRPS_MAX
; homeid
++) {
768 if (MEM_LG_ISMEMBER(mem_lg_homeset
, homeid
)) {
773 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
775 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count
);
777 /* Default must be at least one */
785 * Platform specific lgroup initialization
793 /* Get the Machine Descriptor handle */
795 md
= md_get_handle();
797 /* If not, we cannot continue */
800 panic("cannot access machine descriptor\n");
802 rc
= lgrp_traverse(md
);
803 (void) md_fini_handle(md
);
807 * If we can't process the MD for lgroups then at least let the
808 * system try to boot. Assume we have one lgroup so that
809 * when plat_build_mem_nodes is called, it will attempt to init
810 * an mnode based on the supplied memory segment.
815 max_locality_groups
= 1;
816 n_locality_groups
= 1;
820 mem_node_pfn_shift
= 0;
821 mem_node_physalign
= 0;
823 /* Use lgroup-aware TSB allocations */
824 tsb_lgrp_affinity
= 1;
826 /* Require that a home lgroup have some memory to be chosen */
827 lgrp_mem_free_thresh
= 1;
829 /* Standard home-on-next-touch policy */
830 lgrp_mem_policy_root
= LGRP_MEM_POLICY_NEXT
;
832 /* Disable option to choose root lgroup if all leaf lgroups are busy */
833 lgrp_load_thresh
= UINT32_MAX
;
835 mpo_update_tunables();
839 * Helper routine for debugging calls to mem_node_add_slice()
842 mpo_mem_node_add_slice(pfn_t basepfn
, pfn_t endpfn
)
844 #if defined(DEBUG) && !defined(lint)
845 static int slice_count
= 0;
848 MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
849 slice_count
, basepfn
, endpfn
);
851 mem_node_add_slice(basepfn
, endpfn
);
855 mpo_mem_node_del_slice(pfn_t basepfn
, pfn_t endpfn
)
857 #if defined(DEBUG) && !defined(lint)
858 static int slice_count
= 0;
861 MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
862 slice_count
, basepfn
, endpfn
);
864 mem_node_del_slice(basepfn
, endpfn
);
868 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
871 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand
, int mnode
)
873 MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874 "mnode index: %d\n", plathand
, mnode
);
875 plat_assign_lgrphand_to_mem_node(plathand
, mnode
);
879 * plat_build_mem_nodes()
881 * Define the mem_nodes based on the modified boot memory list,
882 * or based on info read from the MD in plat_lgrp_init().
884 * When the home mask lies in the middle of the address bits (as it does on
885 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886 * it is striped across an mblock in a repeating pattern of contiguous memory
887 * followed by a gap. The stripe width is the size of the contiguous piece.
888 * The stride is the distance from the start of one contiguous piece to the
889 * start of the next. The gap is thus stride - stripe_width.
891 * The stripe of an mnode that falls within an mblock is described by the type
892 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
893 * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
894 * this array is predetermined. The mem_stripe_t that describes mnode m
895 * within mpo_mblock[i] is stored at
896 * mem_stripes[ m + i * max_locality_groups ]
898 * max_locality_groups is the total number of possible locality groups,
899 * as defined by the size of the home mask, even if the memory assigned
900 * to the domain is small and does not cover all the lgroups. Thus some
901 * mem_stripe_t's may be empty.
903 * The members of mem_stripe_t are:
904 * physbase: First valid page in mem_node in the corresponding mblock
905 * physmax: Last valid page in mem_node in mblock
906 * offset: The full stripe width starts at physbase - offset.
907 * Thus if offset is non-zero, this mem_node starts in the middle
908 * of a stripe width, and the second full stripe starts at
909 * physbase - offset + stride. (even though physmax may fall in the
910 * middle of a stripe width, we do not save the ending fragment size
911 * in this data structure.)
912 * exists: Set to 1 if the mblock has memory in this mem_node stripe.
914 * The stripe width is kept in the global mnode_pages.
915 * The stride is kept in the global mnode_stride.
916 * All the above use pfn's as the unit.
918 * As an example, the memory layout for a domain with 2 mblocks and 4
919 * mem_nodes 0,1,2,3 could look like this:
921 * 123012301230 ... 012301230123 ...
927 plat_build_mem_nodes(prom_memlist_t
*list
, size_t nelems
)
932 /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933 max_mem_nodes
= max_locality_groups
;
935 mstripe_update(&mpo_config
);
937 /* Check for non-MPO sun4v platforms */
938 if (n_locality_groups
<= 1) {
939 mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE
, 0);
940 for (elem
= 0; elem
< nelems
; list
++, elem
++) {
944 mpo_mem_node_add_slice(btop(base
),
945 btop(base
+ len
- 1));
947 mem_node_pfn_shift
= 0;
948 mem_node_physalign
= 0;
950 mnode_update(&mpo_config
, 0, 0, U_ADD_ALL
);
953 * Indicate to vm_pagelist that the hpm_counters array
954 * should be shared because the ranges overlap.
956 if (max_mem_nodes
> 1) {
957 interleaved_mnodes
= 1;
962 * Return the locality group value for the supplied processor
965 plat_lgrp_cpu_to_hand(processorid_t id
)
967 lgrp_handle_t lgrphand
;
970 if (n_locality_groups
> 1) {
971 lgrphand
= (lgrp_handle_t
)mpo_cpu
[(int)id
].home
;
973 lgrphand
= (lgrp_handle_t
)LGRP_DEFAULT_HANDLE
; /* Default */
981 plat_lgrp_latency(lgrp_handle_t from
, lgrp_handle_t to
)
984 * Return min remote latency when there are more than two lgroups
985 * (root and child) and getting latency between two different lgroups
986 * or root is involved.
988 if (lgrp_optimizations() && (from
!= to
||
989 from
== LGRP_DEFAULT_HANDLE
|| to
== LGRP_DEFAULT_HANDLE
)) {
990 return ((int)higher_latency
);
992 return ((int)lower_latency
);
997 plat_pfn_to_mem_node(pfn_t pfn
)
1001 struct mblock_md
*mb
;
1003 if (n_locality_groups
<= 1)
1007 * The mnode is defined to be 1:1 with the lgroup handle, which
1008 * is taken from the home bits. Find the mblock in which
1009 * the pfn falls to get the ra_to_pa adjustment, and extract
1013 mb
= &mpo_mblock
[0];
1014 for (i
= 0; i
< n_mblocks
; i
++) {
1015 if (pfn
>= mb
->base_pfn
&& pfn
<= mb
->end_pfn
) {
1016 ra_to_pa_pfn
= btop(mb
->ra_to_pa
);
1017 mnode
= (((pfn
+ ra_to_pa_pfn
) & home_mask_pfn
) >>
1018 home_mask_pfn_shift
);
1019 ASSERT(mnode
< max_mem_nodes
);
1026 panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn
);
1031 * plat_rapfn_to_papfn
1033 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034 * and home mask bits are correct. The upper bits do not necessarily
1035 * match the actual PA, however.
1038 plat_rapfn_to_papfn(pfn_t pfn
)
1042 struct mblock_md
*mb
;
1044 ASSERT(n_mblocks
> 0);
1046 return (pfn
+ base_ra_to_pa_pfn
);
1049 * Find the mblock in which the pfn falls
1050 * in order to get the ra_to_pa adjustment.
1053 for (mb
= &mpo_mblock
[0], i
= 0; i
< n_mblocks
; i
++, mb
++) {
1054 if (pfn
<= mb
->end_pfn
&& pfn
>= mb
->base_pfn
) {
1055 ra_to_pa_pfn
= btop(mb
->ra_to_pa
);
1057 return (pfn
+ ra_to_pa_pfn
);
1061 panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn
);
1066 * plat_mem_node_iterator_init()
1067 * Initialize cookie "it" to iterate over pfn's in an mnode. There is
1068 * no additional iterator function. The caller uses the info from
1069 * the iterator structure directly.
1071 * pfn: starting pfn.
1072 * mnode: desired mnode.
1073 * szc: desired page size.
1075 * if 1, start a new traversal, initialize "it", find first
1076 * mblock containing pfn, and return its starting pfn
1078 * if 0, continue the previous traversal using passed-in data
1079 * from "it", advance to the next mblock, and return its
1080 * starting pfn within the mnode.
1081 * it: returns readonly data to the caller; see below.
1083 * The input pfn must be aligned for the page size szc.
1085 * Returns: starting pfn for the iteration for the mnode/mblock,
1086 * which is aligned according to the page size,
1087 * or returns (pfn_t)(-1) if the input pfn lies past the last
1088 * valid pfn of the mnode.
1089 * Returns misc values in the "it" struct that allows the caller
1090 * to advance the pfn within an mblock using address arithmetic;
1091 * see definition of mem_node_iterator_t in vm_dep.h.
1092 * When the caller calculates a pfn that is greater than the
1093 * returned value it->mi_mblock_end, the caller should again
1094 * call plat_mem_node_iterator_init, passing init=0.
1096 * The last mblock in continuation case may be invalid because
1097 * of memory DR. To detect this situation mi_genid is checked
1098 * against mpo_genid which is incremented after a memory DR
1099 * operation. See also plat_slice_add()/plat_slice_del().
1102 plat_mem_node_iterator_init(pfn_t pfn
, int mnode
, uchar_t szc
,
1103 mem_node_iterator_t
*it
, int init
)
1106 pgcnt_t szcpgcnt
= PNUM_SIZE(szc
);
1107 struct mblock_md
*mblock
;
1110 uint64_t szcpagesize
;
1113 ASSERT(mnode
>= 0 && mnode
< max_mem_nodes
);
1114 ASSERT(n_mblocks
> 0);
1115 ASSERT(P2PHASE(pfn
, szcpgcnt
) == 0);
1119 if (init
|| (it
->mi_genid
!= mpo_genid
)) {
1120 it
->mi_genid
= mpo_genid
;
1121 it
->mi_last_mblock
= 0;
1125 /* Check if mpo is not enabled and we only have one mblock */
1126 if (n_locality_groups
== 1 && n_mblocks
== 1) {
1127 if (P2PHASE(base_ra_to_pa_pfn
, szcpgcnt
)) {
1131 it
->mi_mnode
= mnode
;
1132 it
->mi_ra_to_pa
= base_ra_to_pa_pfn
;
1133 it
->mi_mnode_pfn_mask
= 0;
1134 it
->mi_mnode_pfn_shift
= 0;
1135 it
->mi_mnode_mask
= 0;
1136 it
->mi_mblock_base
= mem_node_config
[mnode
].physbase
;
1137 it
->mi_mblock_end
= mem_node_config
[mnode
].physmax
;
1138 if (pfn
< it
->mi_mblock_base
)
1139 pfn
= P2ROUNDUP(it
->mi_mblock_base
, szcpgcnt
);
1140 if ((pfn
+ szcpgcnt
- 1) > it
->mi_mblock_end
)
1145 /* init=1 means begin iterator, init=0 means continue */
1149 ASSERT(it
->mi_last_mblock
< n_mblocks
);
1150 i
= it
->mi_last_mblock
;
1152 mem_stripes
[i
* max_locality_groups
+ mnode
].physmax
);
1153 if (++i
== n_mblocks
) {
1160 * Find the mblock that contains pfn for mnode's stripe, or the first such
1161 * mblock after pfn, else pfn is out of bound and we'll return -1.
1162 * mblocks and stripes are sorted in ascending address order.
1164 szcpagesize
= szcpgcnt
<< PAGESHIFT
;
1165 for (; i
< n_mblocks
; i
++) {
1166 if (P2PHASE(mpo_mblock
[i
].ra_to_pa
, szcpagesize
))
1168 ms
= &mem_stripes
[i
* max_locality_groups
+ mnode
];
1169 if (ms
->exists
&& (pfn
+ szcpgcnt
- 1) <= ms
->physmax
&&
1170 (P2ROUNDUP(ms
->physbase
, szcpgcnt
) + szcpgcnt
- 1) <=
1174 if (i
== n_mblocks
) {
1175 it
->mi_last_mblock
= i
- 1;
1180 it
->mi_last_mblock
= i
;
1182 mblock
= &mpo_mblock
[i
];
1183 base
= ms
->physbase
;
1186 it
->mi_mnode
= mnode
;
1187 it
->mi_ra_to_pa
= btop(mblock
->ra_to_pa
);
1188 it
->mi_mblock_base
= base
;
1189 it
->mi_mblock_end
= end
;
1190 it
->mi_mnode_pfn_mask
= home_mask_pfn
; /* is 0 for non-MPO case */
1191 it
->mi_mnode_pfn_shift
= home_mask_pfn_shift
;
1192 it
->mi_mnode_mask
= max_locality_groups
- 1;
1194 pfn
= P2ROUNDUP(base
, szcpgcnt
);
1195 ASSERT(pfn
+ szcpgcnt
- 1 <= end
);
1197 ASSERT((pfn
+ szcpgcnt
- 1) <= mpo_mblock
[i
].end_pfn
);
1204 * plat_mem_node_intersect_range()
1206 * Find the intersection between a memnode and a range of pfn's.
1209 plat_mem_node_intersect_range(pfn_t test_base
, pgcnt_t test_len
,
1210 int mnode
, pgcnt_t
*npages_out
)
1212 pfn_t offset
, len
, hole
, base
, end
, test_end
, frag
;
1219 if (!mem_node_config
[mnode
].exists
|| test_len
== 0)
1222 base
= mem_node_config
[mnode
].physbase
;
1223 end
= mem_node_config
[mnode
].physmax
;
1225 test_end
= test_base
+ test_len
- 1;
1226 if (end
< test_base
|| base
> test_end
)
1229 if (n_locality_groups
== 1) {
1230 *npages_out
= MIN(test_end
, end
) - MAX(test_base
, base
) + 1;
1234 hole
= mnode_stride
- mnode_pages
;
1238 * Iterate over all the stripes for this mnode (one per mblock),
1239 * find the intersection with each, and accumulate the intersections.
1241 * Determining the intersection with a stripe is tricky. If base or end
1242 * fall outside the mem_node bounds, round them to physbase/physmax of
1243 * mem_node. If base or end fall in a gap, round them to start of
1244 * nearest stripe. If they fall within a stripe, keep base or end,
1245 * but calculate the fragment size that should be excluded from the
1246 * stripe. Calculate how many strides fall in the adjusted range,
1247 * multiply by stripe width, and add the start and end fragments.
1251 for (i
= mnode
; i
< n_mem_stripes
; i
+= max_locality_groups
) {
1252 ms
= &mem_stripes
[i
];
1254 test_base
<= (end
= ms
->physmax
) &&
1255 test_end
>= (base
= ms
->physbase
)) {
1257 offset
= ms
->offset
;
1259 if (test_base
> base
) {
1260 /* Round test_base to next multiple of stride */
1261 len
= P2ROUNDUP(test_base
- (base
- offset
),
1263 nearest
= base
- offset
+ len
;
1265 * Compute distance from test_base to the
1266 * stride boundary to see if test_base falls
1267 * in the stripe or in the hole.
1269 if (nearest
- test_base
> hole
) {
1271 * test_base lies in stripe,
1272 * and offset should be excluded.
1274 offset
= test_base
-
1275 (nearest
- mnode_stride
);
1278 /* round up to next stripe start */
1289 end
++; /* adjust to an exclusive bound */
1291 /* Round end to next multiple of stride */
1292 len
= P2ROUNDUP(end
- (base
- offset
), mnode_stride
);
1293 nearest
= (base
- offset
) + len
;
1294 if (nearest
- end
<= hole
) {
1295 /* end falls in hole, use entire last stripe */
1298 /* end falls in stripe, compute fragment */
1299 frag
= nearest
- hole
- end
;
1302 len
= (len
>> stripe_shift
) - offset
- frag
;
1307 *npages_out
= npages
;
1314 * Return 1 if pages are valid and do not cross mnode boundaries
1315 * (which would break page free list assumptions), and 0 otherwise.
1319 ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
/*
 * valid_pages(): check the global mblock array (mpo_mblock, n_mblocks)
 * against large-page constraints.  An MPO_STATUS message is emitted for
 * each of the following conditions:
 *   - ptob(mnode_pages) is smaller than a 256M page;
 *   - an mblock smaller than the largest page maps, in PA space, to
 *     different mnodes at its first and last byte;
 *   - an mblock's RA and PA bases differ modulo the coalesce size;
 *   - the last coalesce-sized page of an mblock reaches into the next
 *     mblock and the two map to different mnodes.
 * The page-size mask is read from the "mmu-page-size-list" property of
 * node cpu0 in md.
 *
 * NOTE(review): the return statements and several brace/else lines are
 * not visible in this excerpt -- confirm the result values against the
 * complete file.
 */
1322 valid_pages(md_t
*md
, mde_cookie_t cpu0
)
1325 uint64_t last_page_base
, szc_mask
;
1326 uint64_t max_page_len
, max_coalesce_len
;
1327 struct mblock_md
*mb
= mpo_mblock
;
1330 * Find the smaller of the largest page possible and supported.
1331 * mmu_exported_pagesize_mask is not yet initialized, so read
1332 * it from the MD. Apply minimal fixups in case of broken MDs
1333 * to get a sane mask.
/* Start from the cached mask; md_get_prop_val() may overwrite it. */
1337 szc_mask
= szc_mask0
;
1339 if (md_get_prop_val(md
, cpu0
, "mmu-page-size-list", &szc_mask
))
1341 /* largest in sun4v default support */
1342 szc_mask
|= (1 << TTE4M
);
1343 szc_mask0
= szc_mask
;
/* Largest size code present in the mask; tested against TTE256M below. */
1345 max_szc
= highbit(szc_mask
) - 1;
1346 if (max_szc
> TTE256M
)
1348 max_page_len
= TTEBYTES(max_szc
);
1351 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353 * within one mnode to use MPO.
1355 max_coalesce_len
= TTEBYTES(TTE256M
);
1356 ASSERT(max_coalesce_len
>= max_page_len
);
/* One mnode slice must be able to hold a whole coalesce-sized page. */
1358 if (ptob(mnode_pages
) < max_coalesce_len
) {
1359 MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360 "mnode slice = %lx\n", max_coalesce_len
, ptob(mnode_pages
));
/* Per-mblock checks; end is the last byte of the mblock (inclusive). */
1364 for (i
= 0; i
< n_mblocks
; i
++) {
1365 uint64_t base
= mb
->base
;
1366 uint64_t end
= mb
->base
+ mb
->size
- 1;
1367 uint64_t ra_to_pa
= mb
->ra_to_pa
;
1370 * If mblock is smaller than the max page size, then
1371 * RA = PA mod MAXPAGE is not guaranteed, but it must
1374 if (mb
->size
< max_page_len
) {
1375 if (MNODE(base
+ ra_to_pa
) != MNODE(end
+ ra_to_pa
)) {
1376 MPO_STATUS("Small mblock spans mnodes; "
1377 "MPO disabled: base = %lx, end = %lx, "
1378 "ra2pa = %lx\n", base
, end
, ra_to_pa
);
1382 /* Verify RA = PA mod MAXPAGE, using coalesce size */
1383 uint64_t pa_base
= base
+ ra_to_pa
;
1384 if ((base
& (max_coalesce_len
- 1)) !=
1385 (pa_base
& (max_coalesce_len
- 1))) {
1386 MPO_STATUS("bad page alignment; MPO disabled: "
1387 "ra = %lx, pa = %lx, pagelen = %lx\n",
1388 base
, pa_base
, max_coalesce_len
);
1394 * Find start of last large page in mblock in RA space.
1395 * If page extends into the next mblock, verify the
1396 * mnode does not change.
1398 last_page_base
= P2ALIGN(end
, max_coalesce_len
);
/* mb[1] is the mblock following the current one. */
1399 if (i
+ 1 < n_mblocks
&&
1400 last_page_base
+ max_coalesce_len
> mb
[1].base
&&
1401 MNODE(last_page_base
+ ra_to_pa
) !=
1402 MNODE(mb
[1].base
+ mb
[1].ra_to_pa
)) {
1403 MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404 "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405 "pagelen = %lx\n", end
, ra_to_pa
, mb
[1].base
,
1406 mb
[1].ra_to_pa
, max_coalesce_len
);
1417 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418 * if any, and remove them. This yields a config where the "coarse
1419 * grained" lgroups cover all of memory, even though part of that memory
1420 * is fine grain interleaved and does not deliver a purely local memory
1423 * This function reads and modifies the globals:
1424 * mpo_lgroup[], n_lgrpnodes
1426 * Returns 1 if lgroup nodes were removed, 0 otherwise.
/*
 * fix_interleave(): walk mpo_lgroup[0 .. n_lgrpnodes) and look at each
 * entry's addr_mask.  Entries whose mask has bits set inside
 * PAGEOFFSET are treated as sub-page interleaved and are dropped; the
 * visible copy mpo_lgroup[j++] = mpo_lgroup[i] compacts the entries
 * that are kept.  A status message reports the last offending mask.
 *
 * NOTE(review): the declarations, the else line, the update of
 * n_lgrpnodes and the return statement are not visible in this
 * excerpt -- confirm against the complete file.
 */
1430 fix_interleave(void)
1436 for (i
= 0; i
< n_lgrpnodes
; i
++) {
1437 if ((mpo_lgroup
[i
].addr_mask
& PAGEOFFSET
) != 0) {
1438 /* remove this lgroup */
1439 mask
= mpo_lgroup
[i
].addr_mask
;
/* Keep this lgroup: slide it down to the next free slot j. */
1441 mpo_lgroup
[j
++] = mpo_lgroup
[i
];
1447 MPO_STATUS("sub-page interleave %lx found; "
1448 "removing lgroup.\n", mask
);
1456 * Allocate memory for mblock and stripe arrays from either static or
1457 * dynamic space depending on utype, and return the result in mc.
1458 * Returns 0 on success and -1 on error.
/*
 * mblock_alloc(): obtain storage for nmblocks mblock entries followed
 * by MAX_MEM_NODES * nmblocks stripe entries, and record the result in
 * mc (mc_mblocks, mc_stripes, mc_nmblocks, mc_nstripes, mc_alloc_sz).
 *
 *   utype == U_ADD or U_DEL:
 *	one kmem_zalloc(KM_SLEEP) buffer of allocsz bytes; the stripes
 *	start right after the mblock entries; mc_alloc_sz = allocsz.
 *   nmblocks <= SMALL_MBLOCKS_COUNT:
 *	the preallocated small_mpo_mblocks[] / small_mem_stripes[]
 *	arrays; mc_alloc_sz = 0.
 *   otherwise:
 *	prom_alloc() at MPOBUF_BASE, rejected when allocsz exceeds
 *	MPOBUF_SIZE or the prom returns a different address;
 *	mc_alloc_sz = 0.
 *
 * allocsz is the combined size of both arrays after rounding through
 * mmu_ptob(mmu_btopr()).
 *
 * NOTE(review): the else line and the failure returns are not visible
 * in this excerpt -- confirm against the complete file.
 */
1462 mblock_alloc(mpo_config_t
*mc
, update_t utype
, int nmblocks
)
1464 mblock_md_t
*mb
= NULL
;
1465 mem_stripe_t
*ms
= NULL
;
1466 int nstripes
= MAX_MEM_NODES
* nmblocks
;
1467 size_t mblocksz
= nmblocks
* sizeof (struct mblock_md
);
1468 size_t mstripesz
= nstripes
* sizeof (mem_stripe_t
);
1469 size_t allocsz
= mmu_ptob(mmu_btopr(mblocksz
+ mstripesz
));
1472 * Allocate space for mblocks and mstripes.
1474 * For DR allocations, just use kmem_alloc(), and set
1475 * mc_alloc_sz to indicate it was used.
1477 * For boot allocation:
1478 * If we have a small number of mblocks we will use the space
1479 * that we preallocated. Otherwise, we will dynamically
1480 * allocate the space from the prom and map it to the
1481 * reserved VA at MPOBUF_BASE.
1484 if (utype
== U_ADD
|| utype
== U_DEL
) {
/* DR path: one zeroed buffer holds both arrays back to back. */
1485 mb
= (struct mblock_md
*)kmem_zalloc(allocsz
, KM_SLEEP
);
1486 ms
= (mem_stripe_t
*)(mb
+ nmblocks
);
/* Non-zero mc_alloc_sz is what mblock_free() tests before kmem_free(). */
1487 mc
->mc_alloc_sz
= allocsz
;
1488 } else if (nmblocks
<= SMALL_MBLOCKS_COUNT
) {
1489 mb
= &small_mpo_mblocks
[0];
1490 ms
= &small_mem_stripes
[0];
1491 mc
->mc_alloc_sz
= 0;
1493 /* Ensure that we don't request more space than reserved */
1494 if (allocsz
> MPOBUF_SIZE
) {
1495 MPO_STATUS("mblock_alloc: Insufficient space "
1496 "for mblock structures \n");
1499 mb
= (struct mblock_md
*)
1500 prom_alloc((caddr_t
)MPOBUF_BASE
, allocsz
, PAGESIZE
);
/* The allocation only counts if it landed exactly at MPOBUF_BASE. */
1501 if (mb
!= (struct mblock_md
*)MPOBUF_BASE
) {
1502 MPO_STATUS("mblock_alloc: Cannot allocate space "
1506 mpo_heap32_buf
= (caddr_t
)MPOBUF_BASE
;
1507 mpo_heap32_bufsz
= MPOBUF_SIZE
;
1508 ms
= (mem_stripe_t
*)(mb
+ nmblocks
);
1509 mc
->mc_alloc_sz
= 0;
/* Common tail: hand the arrays and their element counts back in mc. */
1511 mc
->mc_mblocks
= mb
;
1512 mc
->mc_stripes
= ms
;
1513 mc
->mc_nmblocks
= nmblocks
;
1514 mc
->mc_nstripes
= nstripes
;
1515 MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks
);
1522 * Free memory in mc that was allocated by mblock_alloc.
/*
 * mblock_free(): release the storage described by mc and clear mc.
 *
 * Only a config with mc_alloc_sz > 0 owns kmem memory (mblock_alloc()
 * sets that field on its kmem_zalloc() path); in that case the buffer
 * at mc_mblocks is freed with the recorded size.  The ASSERT checks
 * that the buffer being freed is not the installed mpo_mblock array.
 * The structure is zeroed in every case.
 */
1526 mblock_free(mpo_config_t
*mc
)
1528 if (mc
->mc_alloc_sz
> 0) {
1529 ASSERT(mc
->mc_mblocks
!= mpo_mblock
);
1530 kmem_free((caddr_t
)mc
->mc_mblocks
, mc
->mc_alloc_sz
);
/* Leave no stale pointers or counts behind in *mc. */
1532 bzero(mc
, sizeof (*mc
));
1538 * Install mblock config passed in mc as the global configuration.
1539 * May only be called at boot or while holding mpo_wr_lock.
/*
 * mblock_install(): copy the array pointers and counts from mc into
 * the globals mpo_mblock, n_mblocks, mem_stripes and n_mem_stripes,
 * and derive base_ra_to_pa_pfn from the ra_to_pa of the first mblock.
 *
 * mc->mc_mblocks[0] is read unconditionally, so mc must describe at
 * least one mblock.
 */
1543 mblock_install(mpo_config_t
*mc
)
1545 mpo_mblock
= mc
->mc_mblocks
;
1546 n_mblocks
= mc
->mc_nmblocks
;
1547 mem_stripes
= mc
->mc_stripes
;
1548 n_mem_stripes
= mc
->mc_nstripes
;
/* RA-to-PA offset of mblock 0, converted from bytes to pages. */
1549 base_ra_to_pa_pfn
= btop(mc
->mc_mblocks
[0].ra_to_pa
);
1556 * Traverse mblocknodes, read the mblock properties from the MD, and
1557 * save the mblocks in mc.
/*
 * mblock_update(): fill mc->mc_mblocks from the MD nodes listed in
 * mblocknodes[0 .. mc->mc_nmblocks).
 *
 * j walks the MD nodes and i indexes the slot being filled.  For each
 * node the base, size and RA-to-PA offset properties are read with
 * get_int(); the visible code sets a missing offset to 0, while the
 * failure paths for base and size set mc->mc_nmblocks to 0.  A node
 * whose base + size wraps around is rejected the same way.  Non-empty
 * mblocks get base_pfn/end_pfn computed, and at the end mc_nmblocks is
 * set to i and the array is sorted with mblock_sort().
 *
 * NOTE(review): md is declared as md_t here but as md_t * in
 * valid_pages(); confirm the intended type.  The result tests, the
 * increment of i and the early returns are not visible in this
 * excerpt -- confirm against the complete file.
 */
1561 mblock_update(mpo_config_t
*mc
, md_t md
, mde_cookie_t
*mblocknodes
)
1565 mblock_md_t
*mblock
= mc
->mc_mblocks
;
1567 for (i
= 0, j
= 0; j
< mc
->mc_nmblocks
; j
++) {
1569 /* Without a base or size value we will fail */
1570 result
= get_int(md
, mblocknodes
[j
], PROP_LG_BASE
,
1573 MPO_STATUS("mblock_update: "
1574 "PROP_LG_BASE is missing\n");
1575 mc
->mc_nmblocks
= 0;
1579 result
= get_int(md
, mblocknodes
[j
], PROP_LG_SIZE
,
1582 MPO_STATUS("mblock_update: "
1583 "PROP_LG_SIZE is missing\n");
1584 mc
->mc_nmblocks
= 0;
1588 result
= get_int(md
, mblocknodes
[j
],
1589 PROP_LG_RA_PA_OFFSET
, &mblock
[i
].ra_to_pa
);
1591 /* If we don't have an ra_pa_offset, just set it to 0 */
1593 mblock
[i
].ra_to_pa
= 0;
1595 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596 "ra_to_pa = %lx\n", i
,
1599 mblock
[i
].ra_to_pa
);
1601 /* check for unsupportable values of base and size */
/* base > base + size can only hold when the unsigned sum wraps. */
1602 if (mblock
[i
].base
> mblock
[i
].base
+ mblock
[i
].size
) {
1603 MPO_STATUS("mblock_update: "
1604 "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605 "base = %lx, size = %lx\n",
1606 mblock
[i
].base
, mblock
[i
].size
);
1607 mc
->mc_nmblocks
= 0;
1611 /* eliminate size==0 blocks */
1612 if (mblock
[i
].size
!= 0) {
1613 uint64_t base
= mblock
[i
].base
;
1614 uint64_t end
= base
+ mblock
[i
].size
;
/* end is exclusive here, so the last pfn comes from end - 1. */
1616 mblock
[i
].base_pfn
= btop(base
);
1617 mblock
[i
].end_pfn
= btop(end
- 1);
1623 MPO_STATUS("mblock_update: "
1624 "No non-empty mblock nodes were found "
1625 "in the Machine Descriptor\n");
1626 mc
->mc_nmblocks
= 0;
/* i counts the slots actually filled; it may be less than the node count. */
1629 ASSERT(i
<= mc
->mc_nmblocks
);
1630 mc
->mc_nmblocks
= i
;
1632 /* Must sort mblocks by address for mem_node_iterator_init() */
1633 mblock_sort(mblock
, mc
->mc_nmblocks
);
1639 * Update mblock config after a memory DR add. The added range is not
1640 * needed, as we read *all* mblock nodes from the MD. Save the mblocks
/*
 * mblock_update_add(): rebuild the mblock list in mc from the Machine
 * Descriptor.
 *
 * Sequence: md_get_handle() -> md_get_root() -> md_alloc_scan_dag() for
 * PROP_LG_MBLOCK nodes -> mblock_alloc(mc, U_ADD, nmblocks) ->
 * mblock_update() -> md_free_scan_dag() -> md_fini_handle().
 *
 * The function ends in a panic() call; presumably that is the common
 * target of the failure paths, whose jump/return lines are not visible
 * in this excerpt -- confirm against the complete file.
 */
1645 mblock_update_add(mpo_config_t
*mc
)
1648 mde_cookie_t root
, *mblocknodes
;
1651 if ((md
= md_get_handle()) == NULL
) {
1652 MPO_STATUS("Cannot access Machine Descriptor\n");
1656 if ((root
= md_get_root(md
)) == MDE_INVAL_ELEM_COOKIE
)
/* Collect every mblock node reachable from the root. */
1659 nmblocks
= md_alloc_scan_dag(md
, root
, PROP_LG_MBLOCK
, "fwd",
1661 if (nmblocks
<= 0) {
1662 MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1666 if (mblock_alloc(mc
, U_ADD
, nmblocks
) < 0)
1669 mblock_update(mc
, md
, mblocknodes
);
/* Release the scan result and the MD handle once the data is copied. */
1670 md_free_scan_dag(md
, &mblocknodes
);
1671 (void) md_fini_handle(md
);
1674 panic("mblock_update_add: cannot process mblocks from MD.\n");
1680 * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681 * Allocate a new mblock config, copy old config to the new, modify the new
1682 * mblocks to reflect the deletion. The new mblocks are returned in
1683 * mc_new and are not yet installed as the active config.
/*
 * mblock_update_del(): build in mc_new a copy of the mblocks of mc_old
 * with the pfn range [ubase, uend] removed.
 *
 * mc_new is allocated with one spare slot (nmblocks + 1) and the old
 * entries are copied in.  Each mblock is then compared, in pfn units,
 * with the deleted range:
 *   - exact match:	the entry is removed and the table compacted;
 *   - strictly inside:	the entry is split in two, using the spare slot;
 *   - same start:	the entry is shrunk from the front;
 *   - same end:	the entry is shrunk from the back.
 * Finally mc_new->mc_nmblocks is set to the resulting count.
 *
 * NOTE(review): the declarations of i, j, base and end, the updates of
 * nmblocks, the break statements and the failure path after
 * mblock_alloc() are not visible in this excerpt -- confirm against
 * the complete file.
 */
1687 mblock_update_del(mpo_config_t
*mc_new
, mpo_config_t
*mc_old
, pfn_t ubase
,
1692 mblock_md_t
*mblock
;
1693 int nmblocks
= mc_old
->mc_nmblocks
;
1695 MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase
, uend
);
1698 * Allocate mblocks in mc_new and copy the old to the new.
1699 * Allocate one extra in case the deletion splits an mblock.
1701 if (mblock_alloc(mc_new
, U_DEL
, nmblocks
+ 1) < 0)
1703 mblock
= mc_new
->mc_mblocks
;
1704 bcopy(mc_old
->mc_mblocks
, mblock
, nmblocks
* sizeof (mblock_md_t
));
1707 * Find the mblock containing the deleted range and adjust it in
1710 for (i
= 0; i
< nmblocks
; i
++) {
/* Work in pfns: [base, end] is the inclusive page range of mblock i. */
1712 base
= btop(mblock
[i
].base
);
1713 end
= base
+ btop(mblock
[i
].size
) - 1;
1716 * Adjust the mblock based on the subset that was deleted.
1718 * If the entire mblk was deleted, compact the table.
1720 * If the middle of the mblk was deleted, extend
1721 * the table. Space for the new slot was already
1724 * The memory to be deleted is a mblock or a subset of
1725 * and does not span multiple mblocks.
1727 if (base
== ubase
&& end
== uend
) {
/* Whole mblock deleted: shift the following entries down by one. */
1728 for (j
= i
; j
< nmblocks
- 1; j
++)
1729 mblock
[j
] = mblock
[j
+ 1];
1731 bzero(&mblock
[nmblocks
], sizeof (*mblock
));
1733 } else if (base
< ubase
&& end
> uend
) {
/*
 * Middle deleted: shift entries i.. up by one to open slot i + 1.
 * NOTE(review): this downward loop terminates for i == 0 only if j is
 * a signed type; its declaration is not visible here -- confirm.
 */
1734 for (j
= nmblocks
- 1; j
>= i
; j
--)
1735 mblock
[j
+ 1] = mblock
[j
];
/* Slot i keeps the part below ubase, slot i + 1 the part above uend. */
1736 mblock
[i
].size
= ptob(ubase
- base
);
1737 mblock
[i
].end_pfn
= ubase
- 1;
1738 mblock
[i
+ 1].base
= ptob(uend
+ 1);
1739 mblock
[i
+ 1].size
= ptob(end
- uend
);
1740 mblock
[i
+ 1].base_pfn
= uend
+ 1;
1743 } else if (base
== ubase
) {
1744 MPO_DEBUG("mblock_update_del: shrink>"
1745 " i=%d base=0x%lx end=0x%lx", i
, base
, end
);
/* Front of the mblock deleted: move its base past uend. */
1746 mblock
[i
].base
= ptob(uend
+ 1);
1747 mblock
[i
].size
-= ptob(uend
- ubase
+ 1);
/*
 * NOTE(review): base is presumably advanced on a line not visible in
 * this excerpt before it is stored below -- confirm.
 */
1749 mblock
[i
].base_pfn
= base
;
1750 mblock
[i
].end_pfn
= end
;
1751 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base
, end
);
1753 } else if (end
== uend
) {
1754 MPO_DEBUG("mblock_update_del: shrink<"
1755 " i=%d base=0x%lx end=0x%lx", i
, base
, end
);
/* Tail of the mblock deleted: only the size shrinks. */
1756 mblock
[i
].size
-= ptob(uend
- ubase
+ 1);
/*
 * NOTE(review): end is presumably pulled back on a line not visible in
 * this excerpt before it is stored below -- confirm.
 */
1758 mblock
[i
].base_pfn
= base
;
1759 mblock
[i
].end_pfn
= end
;
1760 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base
, end
);
1764 mc_new
->mc_nmblocks
= nmblocks
;
1771 * Read mblocks from mc and update mstripes in mc
/*
 * mstripe_update(): recompute mc->mc_stripes and mc->mc_nstripes from
 * the mblocks in mc.
 *
 * With at most one locality group:
 *   - one mblock:	mc_nstripes is set to 0;
 *   - several:		one stripe per mblock, covering exactly
 *			[base_pfn, end_pfn] of that mblock.
 * Otherwise the stripe array is zeroed, mc_nstripes becomes
 * max_locality_groups * nmblocks, and for every mblock a do/while loop
 * visits lgroup handles until lgrp_start comes round again, filling
 * the stripe at index i * max_locality_groups + mnode.
 *
 * NOTE(review): the masks (stripe - 1) and (stride - 1) assume both
 * values are powers of two -- confirm.  The declarations of i, mnode
 * and ms, the lgroup-to-mnode lookup, the updates of base inside the
 * loop and the returns are not visible in this excerpt.
 */
1775 mstripe_update(mpo_config_t
*mc
)
1777 lgrp_handle_t lgrphand
, lgrp_start
;
1779 uint64_t offset
, stripe_end
, base
, end
, ra_to_pa
, stride
;
1780 uint64_t stripe
, frag
, remove
;
1782 mblock_md_t
*mblock
= mc
->mc_mblocks
;
1783 int nmblocks
= mc
->mc_nmblocks
;
/* Bytes cleared by the bzero() calls below. */
1784 int mstripesz
= MAX_MEM_NODES
* nmblocks
* sizeof (mem_stripe_t
);
1786 /* Check for non-MPO sun4v platforms or memory DR removal */
1787 if (n_locality_groups
<= 1) {
1788 ASSERT(n_locality_groups
== 1);
1789 ASSERT(max_locality_groups
== 1 && max_mem_nodes
== 1);
1791 if (nmblocks
== 1) {
1792 mc
->mc_nstripes
= 0;
/* Several mblocks, one lgroup: stripe i mirrors mblock i. */
1794 mc
->mc_nstripes
= nmblocks
;
1795 bzero(mc
->mc_stripes
, mstripesz
);
1796 for (i
= 0; i
< nmblocks
; i
++) {
1797 mc
->mc_stripes
[i
].exists
= 1;
1798 mc
->mc_stripes
[i
].physbase
= mblock
[i
].base_pfn
;
1799 mc
->mc_stripes
[i
].physmax
= mblock
[i
].end_pfn
;
/* MPO case: start from a cleared stripe table. */
1805 bzero(mc
->mc_stripes
, mstripesz
);
1806 mc
->mc_nstripes
= max_locality_groups
* nmblocks
;
/* stripe: bytes one mnode owns per stride; stride: bytes of a full round. */
1807 stripe
= ptob(mnode_pages
);
1808 stride
= max_locality_groups
* stripe
;
1810 for (i
= 0; i
< nmblocks
; i
++) {
/* end is an exclusive byte bound for this mblock. */
1811 base
= mblock
[i
].base
;
1812 end
= base
+ mblock
[i
].size
;
1813 ra_to_pa
= mblock
[i
].ra_to_pa
;
1815 /* Find the offset from the prev stripe boundary in PA space. */
1816 offset
= (base
+ ra_to_pa
) & (stripe
- 1);
1818 /* Set the next stripe boundary. */
1819 stripe_end
= base
- offset
+ stripe
;
1821 lgrp_start
= (((base
+ ra_to_pa
) & home_mask
) >>
1823 lgrphand
= lgrp_start
;
1826 * Loop over all lgroups covered by the mblock, creating a
1827 * stripe for each. Stop when lgrp_start is visited again.
1830 /* mblock may not span all lgroups */
1835 ASSERT(mnode
< max_mem_nodes
);
1838 * Calculate the size of the fragment that does not
1839 * belong to the mnode in the last partial stride.
1841 frag
= (end
- (base
- offset
)) & (stride
- 1);
1843 /* remove the gap */
1844 remove
= stride
- stripe
;
1845 } else if (frag
< stripe
) {
1846 /* fragment fits in stripe; keep it all */
1849 /* fragment is large; trim after whole stripe */
1850 remove
= frag
- stripe
;
/* Stripe slot for (mblock i, mnode); remove trims the upper bound. */
1853 ms
= &mc
->mc_stripes
[i
* max_locality_groups
+ mnode
];
1854 ms
->physbase
= btop(base
);
1855 ms
->physmax
= btop(end
- 1 - remove
);
1856 ms
->offset
= btop(offset
);
/* Advance to the following stripe and recompute its lgroup handle. */
1860 stripe_end
+= stripe
;
1862 lgrphand
= (((base
+ ra_to_pa
) & home_mask
) >>
1864 } while (lgrphand
!= lgrp_start
);
/*
 * INTERSECT(a, b, c, d): when the inclusive ranges [a, b] and [c, d]
 * overlap, clip [c, d] in place to the overlap (c = MAX(a, c),
 * d = MIN(b, d)).  On the other path the ASSERT documents that the
 * ranges do not overlap.
 *
 * c and d must be modifiable lvalues, and every argument is evaluated
 * more than once, so arguments must be free of side effects.  The
 * expansion is a bare if statement, not a do { } while (0) block.
 *
 * NOTE(review): the else line and whatever follows the ASSERT are not
 * visible in this excerpt -- confirm against the complete file.
 */
1868 #define INTERSECT(a, b, c, d) \
1869 if (((a) >= (c) && (a) <= (d)) || \
1870 ((c) >= (a) && (c) <= (b))) { \
1871 (c) = MAX((a), (c)); \
1872 (d) = MIN((b), (d)); \
1874 ASSERT((a) >= (d) || (b) <= (c)); \
1881 * Read stripes from mc and update mnode extents. The mnode extents are
1882 * part of the live configuration, so this can only be done at boot time
1883 * or while holding the mpo_wr_lock.
/*
 * mnode_update(): apply the pfn range [ubase, uend] to the memnode
 * extents, using the stripes in mc to decide which slices to add or
 * delete.  utype selects the operation.
 *
 * With at most one locality group and a single mblock the whole range
 * is handed directly to mpo_mem_node_add_slice() or
 * mpo_mem_node_del_slice().  Otherwise every stripe (mblock i, mnode)
 * is examined: its extent is clipped to the update range with
 * INTERSECT() and the clipped slice is added or deleted.  An invalid
 * utype, or an update range for which no stripe was found, panics.
 *
 * The trailing loop over mem_node_config[] and the ASSERTs against
 * mem_node_max_range() compare the update range with the resulting
 * memnode extents.
 *
 * NOTE(review): the debug message below spells the function name
 * "mnode_udpate".  The declarations of ms, base and end, the utype
 * switch/case lines, the handling of found and several returns are not
 * visible in this excerpt -- confirm against the complete file.
 */
1887 mnode_update(mpo_config_t
*mc
, pfn_t ubase
, pfn_t uend
, update_t utype
)
1889 int i
, j
, mnode
, found
;
1893 MPO_DEBUG("mnode_udpate: basepfn: %lx endpfn: %lx\n", ubase
, uend
);
/* Simple case: one lgroup and one mblock, no stripes to consult. */
1895 if (n_locality_groups
<= 1 && mc
->mc_nmblocks
== 1) {
1897 mpo_mem_node_add_slice(ubase
, uend
);
1898 else if (utype
== U_DEL
)
1899 mpo_mem_node_del_slice(ubase
, uend
);
1901 panic("mnode update: %d: invalid\n", utype
);
1906 for (i
= 0; i
< mc
->mc_nmblocks
; i
++) {
1907 for (mnode
= 0; mnode
< max_locality_groups
; mnode
++) {
/* Stripe index layout matches mstripe_update(): mblock-major. */
1909 j
= i
* max_locality_groups
+ mnode
;
1910 ms
= &mc
->mc_stripes
[j
];
1914 base
= ms
->physbase
;
1918 * Look for the mstripes intersecting this slice.
1920 * The mstripe and slice pairs may not be equal
1921 * if a subset of a mblock is added/deleted.
/* INTERSECT() narrows base/end to the part inside [ubase, uend]. */
1925 INTERSECT(ubase
, uend
, base
, end
);
1928 if (n_locality_groups
> 1)
1929 mpo_plat_assign_lgrphand_to_mem_node(
1931 mpo_mem_node_add_slice(base
, end
);
1934 INTERSECT(ubase
, uend
, base
, end
);
1935 mpo_mem_node_del_slice(base
, end
);
1938 panic("mnode_update: %d: invalid\n", utype
);
1947 panic("mnode_update: mstripe not found");
1950 if (utype
== U_ADD_ALL
|| utype
== U_DEL
)
/* Check whether ubase and uend each fall inside an existing memnode. */
1953 for (i
= 0; i
< max_mem_nodes
; i
++) {
1954 if (!mem_node_config
[i
].exists
)
1956 if (ubase
>= mem_node_config
[i
].physbase
&&
1957 ubase
<= mem_node_config
[i
].physmax
)
1959 if (uend
>= mem_node_config
[i
].physbase
&&
1960 uend
<= mem_node_config
[i
].physmax
)
1965 pfn_t minpfn
, maxpfn
;
/* The overall memnode pfn range must enclose the update range. */
1967 mem_node_max_range(&minpfn
, &maxpfn
);
1968 ASSERT(minpfn
<= ubase
);
1969 ASSERT(maxpfn
>= uend
);
1975 * Plat_slice_add()/plat_slice_del() are the platform hooks
1976 * for adding/deleting a pfn range to/from the system.
1978 * Plat_slice_add() is used for both boot/DR cases.
1980 * - Zeus has already added the mblocks to the MD, so read the updated
1981 * MD and allocate all data structures required to manage the new memory
1984 * - Recompute the stripes which are derived from the mblocks.
1986 * - Update (expand) the mnode extents and install the modified mblocks as
1987 * the new mpo config. This must be done while holding the mpo_wr_lock
1988 * to guarantee that no other threads access the mpo meta-data.
1990 * - Unlock MPO data structures; the new config is live. Free the old config.
1992 * Plat_slice_del() is used for DR only.
1994 * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995 * the old mpo mblocks and delete the range from the copy.
1997 * - Recompute the stripes which are derived from the mblocks.
1999 * - Update (shrink) the mnode extents and install the modified mblocks as
2000 * the new mpo config. This must be done while holding the mpo_wr_lock
2001 * to guarantee that no other threads access the mpo meta-data.
2003 * - Unlock MPO data structures; the new config is live. Free the old config.
/*
 * plat_slice_add(): platform hook that adds the pfn range [base, end].
 *
 * A snapshot of the current mpo_config is kept in old_config.  A new
 * config is built from the MD (mblock_update_add) and its stripes are
 * derived (mstripe_update).  The new config is installed first and
 * then used by mnode_update() with U_ADD; the snapshot is released
 * last with mblock_free().
 *
 * NOTE(review): no locking calls are visible in this excerpt; confirm
 * against the complete file that the install/update pair runs with the
 * write lock held.
 */
2007 plat_slice_add(pfn_t base
, pfn_t end
)
2009 mpo_config_t old_config
= mpo_config
;
2010 mpo_config_t new_config
;
2012 VALIDATE_SLICE(base
, end
);
2013 mblock_update_add(&new_config
);
2014 mstripe_update(&new_config
);
2016 mblock_install(&new_config
);
2017 /* Use new config to add all ranges for mnode_update */
2018 mnode_update(&new_config
, base
, end
, U_ADD
);
/* Releases the snapshot taken on entry, not the config just installed. */
2021 mblock_free(&old_config
);
/*
 * plat_slice_del(): platform hook that removes the pfn range
 * [base, end].
 *
 * A snapshot of the current mpo_config is kept in old_config.  The new
 * config is derived from that snapshot with the range deleted
 * (mblock_update_del) and its stripes are recomputed (mstripe_update).
 * Unlike plat_slice_add(), mnode_update() runs on the OLD config and
 * before mblock_install(), because the stripes covering the deleted
 * range exist only there.  The snapshot is released last.
 *
 * NOTE(review): no locking calls are visible in this excerpt; confirm
 * against the complete file that the update/install pair runs with the
 * write lock held.
 */
2025 plat_slice_del(pfn_t base
, pfn_t end
)
2027 mpo_config_t old_config
= mpo_config
;
2028 mpo_config_t new_config
;
2030 VALIDATE_SLICE(base
, end
);
2031 mblock_update_del(&new_config
, &old_config
, base
, end
);
2032 mstripe_update(&new_config
);
2034 /* Use old config to find deleted range for mnode_update */
2035 mnode_update(&old_config
, base
, end
, U_DEL
);
2036 mblock_install(&new_config
);
/* Releases the snapshot taken on entry, not the config just installed. */
2039 mblock_free(&old_config
);