1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #include <sys/types.h>
28 #include <sys/sysmacros.h>
29 #include <sys/machsystm.h>
30 #include <sys/machparam.h>
31 #include <sys/cmn_err.h>
32 #include <sys/stat.h>
33 #include <sys/mach_descrip.h>
34 #include <sys/memnode.h>
35 #include <sys/mdesc.h>
36 #include <sys/mpo.h>
37 #include <vm/page.h>
38 #include <vm/vm_dep.h>
39 #include <vm/hat_sfmmu.h>
40 #include <sys/promif.h>
43 * MPO and the sun4v memory representation
44 * ---------------------------------------
46  * Latency groups are defined in the sun4v architecture by memory-latency-group
47 * nodes in the Machine Description, as specified in FWARC/2007/260. These
48 * tie together cpu nodes and mblock nodes, and contain mask and match
49 * properties that identify the portion of an mblock that belongs to the
50 * lgroup. Mask and match are defined in the Physical Address (PA) space,
51 * but an mblock defines Real Addresses (RA). To translate, the mblock
52 * includes the property address-congruence-offset, hereafter referred to as
53 * ra_to_pa. A real address ra is a member of an lgroup if
55 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
57 * The MD is traversed, and information on all mblocks is kept in the array
58 * mpo_mblock[]. Information on all CPUs, including which lgroup they map
59 * to, is kept in the array mpo_cpu[].
61 * This implementation makes (and verifies) the simplifying assumption that
62 * the mask bits are the same for all defined lgroups, and that all 1 bits in
63 * the mask are contiguous. Thus the number of lgroups is bounded by the
64 * number of possible mask values, and the lgrp_handle_t is defined as the
65 * mask value, shifted right to eliminate the 0 bit positions in mask. The
66 * masks and values are also referred to as "home bits" in the code.
68 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71 * home bits. This yields the mem_node.
73 * Interfaces
74 * ----------
76 * This file exports the following entry points:
78 * plat_lgrp_init()
79 * plat_build_mem_nodes()
80 * plat_lgrp_cpu_to_hand()
81 * plat_lgrp_latency()
82 * plat_pfn_to_mem_node()
83 * These implement the usual platform lgroup interfaces.
85 * plat_rapfn_to_papfn()
86 * Recover the PA page coloring bits from an RA.
88 * plat_mem_node_iterator_init()
89 * Initialize an iterator to efficiently step through pages in a mem_node.
91 * plat_mem_node_intersect_range()
92 * Find the intersection with a mem_node.
94 * plat_slice_add()
95 * plat_slice_del()
96 * Platform hooks to add/delete a pfn range.
98 * Internal Organization
99 * ---------------------
101  * A number of routines are used by both the boot and DR code paths,
102  * which (re)build the appropriate MPO structures.
104 * mblock_alloc()
105 * Allocate memory for mblocks and stripes as
106 * appropriate for boot or memory DR.
108 * mblock_free()
109 * Free memory allocated by mblock_alloc.
111 * mblock_update()
112 * Build mblocks based on mblock nodes read from the MD.
114 * mblock_update_add()
115 * Rebuild mblocks after a memory DR add operation.
117 * mblock_update_del()
118 * Rebuild mblocks after a memory DR delete operation.
120 * mblock_install()
121 * Install mblocks as the new configuration.
123 * mstripe_update()
124 * Build stripes based on mblocks.
126 * mnode_update()
127 * Call memnode layer to add/del a pfn range, based on stripes.
129 * The platform interfaces allocate all memory required for the
130  * particular update first, block access to the MPO structures
131 * while they are updated, and free old structures after the update.
134 int sun4v_mpo_enable = 1;
135 int sun4v_mpo_debug = 0;
136 char sun4v_mpo_status[256] = "";
138 /* Save CPU info from the MD and associate CPUs with lgroups */
139 static struct cpu_md mpo_cpu[NCPU];
141 /* Save lgroup info from the MD */
142 #define MAX_MD_LGROUPS 32
143 static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
144 static int n_lgrpnodes = 0;
145 static int n_locality_groups = 0;
146 static int max_locality_groups = 0;
147 static int szc_mask0 = 0;
149 /* Save mblocks from the MD */
150 #define SMALL_MBLOCKS_COUNT 8
151 static struct mblock_md *mpo_mblock;
152 static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
153 static int n_mblocks = 0;
155 /* Save mem_node stripes calculated from mblocks and lgroups. */
156 static mem_stripe_t *mem_stripes;
157 static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
158 static int n_mem_stripes = 0;
159 static pfn_t mnode_stride; /* distance between stripes, start to start */
160 static int stripe_shift; /* stride/stripes expressed as a shift */
161 static pfn_t mnode_pages; /* mem_node stripe width */
163 /* Save home mask and shift used to calculate lgrp_handle_t values */
164 static uint64_t home_mask = 0;
165 static pfn_t home_mask_pfn = 0;
166 static int home_mask_shift = 0;
167 static uint_t home_mask_pfn_shift = 0;
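/*
 * Illustrative sketch only (hypothetical helper, not used by the driver):
 * the membership formula from the block comment above, i.e. how a pfn's
 * home bits yield its mem_node once the owning mblock's ra_to_pa
 * adjustment is known.  plat_pfn_to_mem_node() below does the real work,
 * including the mblock search.
 */
static int
home_bits_example(pfn_t pfn, pfn_t ra_to_pa_pfn)
{
	/* RA pfn -> PA pfn, then extract the contiguous home bit field */
	return ((int)(((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
	    home_mask_pfn_shift));
}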
169 /* Save lowest and highest latencies found across all lgroups */
170 static int lower_latency = 0;
171 static int higher_latency = 0;
173 static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */
174 static int mpo_genid; /* config gen; updated by mem DR */
175 static mpo_config_t mpo_config; /* current mblocks and stripes */
177 typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
179 static int valid_pages(md_t *md, mde_cookie_t cpu0);
180 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset);
181 static int fix_interleave(void);
183 static int mblock_alloc(mpo_config_t *, update_t, int nmblocks);
184 static void mblock_install(mpo_config_t *);
185 static void mblock_free(mpo_config_t *);
186 static void mblock_update(mpo_config_t *, md_t *, mde_cookie_t *mblocknodes);
187 static void mblock_update_add(mpo_config_t *);
188 static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
189 static void mstripe_update(mpo_config_t *);
190 static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
192 /* Debug support */
193 #if defined(DEBUG) && !defined(lint)
194 #define VALIDATE_SLICE(base, end) { \
195 ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \
196 ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \
198 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
199 #else
200 #define VALIDATE_SLICE(base, end)
201 #define MPO_DEBUG(...)
202 #endif /* DEBUG */
204 /* Record status message, viewable from mdb */
205 #define MPO_STATUS(args...) { \
206 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
207 MPO_DEBUG(sun4v_mpo_status); \
211 * The MPO locks are to protect the MPO metadata while that
212 * information is updated as a result of a memory DR operation.
213 * The read lock must be acquired to read the metadata and the
214  * write lock must be acquired to update it.
216 #define mpo_rd_lock kpreempt_disable
217 #define mpo_rd_unlock kpreempt_enable
219 static void
220 mpo_wr_lock()
222 mutex_enter(&cpu_lock);
223 pause_cpus(NULL, NULL);
224 mutex_exit(&cpu_lock);
227 static void
228 mpo_wr_unlock()
230 mutex_enter(&cpu_lock);
231 start_cpus();
232 mutex_exit(&cpu_lock);
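/*
 * Illustrative sketch only (hypothetical reader, not in the driver): the
 * read-side protocol.  With preemption disabled, the writer's
 * pause_cpus() cannot complete, so the MPO metadata read below cannot
 * race a DR update.
 */
static int
mpo_reader_example(void)
{
	int n;

	mpo_rd_lock();		/* kpreempt_disable() */
	n = n_mblocks;		/* metadata is stable here */
	mpo_rd_unlock();	/* kpreempt_enable() */
	return (n);
}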
236 * Routine to read a uint64_t from a given md
238 static int64_t
239 get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
241 int err = md_get_prop_val(md, node, propname, val);
242 return (err);
245 static int
246 mblock_cmp(const void *a, const void *b)
248 struct mblock_md *m1 = (struct mblock_md *)a;
249 struct mblock_md *m2 = (struct mblock_md *)b;
251 if (m1->base < m2->base)
252 return (-1);
253 else if (m1->base == m2->base)
254 return (0);
255 else
256 return (1);
259 static void
260 mblock_sort(struct mblock_md *mblocks, int n)
262 extern void qsort(void *, size_t, size_t,
263 int (*)(const void *, const void *));
265 qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
268 static void
269 mpo_update_tunables(void)
271 int i, ncpu_min;
274 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275 * this process is currently running on before considering
276 * expanding threads to another lgroup.
278 * lgrp_expand_proc_diff determines how much less the remote lgroup
279 * must be loaded before expanding to it.
281 * On sun4v CMT processors, threads share a core pipeline, and
282 * at less than 100% utilization, best throughput is obtained by
283 * spreading threads across more cores, even if some are in a
284 * different lgroup. Spread threads to a new lgroup if the
285 * current group is more than 50% loaded. Because of virtualization,
286 * lgroups may have different numbers of CPUs, but the tunables
287 * apply to all lgroups, so find the smallest lgroup and compute
288 * 50% loading.
291 ncpu_min = NCPU;
292 for (i = 0; i < n_lgrpnodes; i++) {
293 int ncpu = mpo_lgroup[i].ncpu;
294 if (ncpu != 0 && ncpu < ncpu_min)
295 ncpu_min = ncpu;
297 lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
299 	/* a new home must be at most half as loaded as the existing home to be chosen */
300 lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
302 lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
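/*
 * Worked example with assumed numbers: if the smallest lgroup has
 * ncpu_min = 8 CPUs, lgrp_expand_proc_thresh becomes
 * 8 * lgrp_loadavg_max_effect / 2, the load of 4 fully busy CPUs, i.e.
 * 50% loading of that lgroup; lgrp_expand_proc_diff is half of that
 * again, so a remote lgroup must be at least 2 busy CPUs less loaded
 * before threads spread to it.
 */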
305 static mde_cookie_t
306 cpuid_to_cpunode(md_t *md, int cpuid)
308 mde_cookie_t rootnode, foundnode, *cpunodes;
309 uint64_t cpuid_prop;
310 int n_cpunodes, i;
312 if (md == NULL)
313 return (MDE_INVAL_ELEM_COOKIE);
315 rootnode = md_root_node(md);
316 if (rootnode == MDE_INVAL_ELEM_COOKIE)
317 return (MDE_INVAL_ELEM_COOKIE);
319 n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320 "fwd", &cpunodes);
321 if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322 goto cpuid_fail;
324 for (i = 0; i < n_cpunodes; i++) {
325 if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326 &cpuid_prop))
327 break;
328 if (cpuid_prop == (uint64_t)cpuid) {
329 foundnode = cpunodes[i];
330 md_free_scan_dag(md, &cpunodes);
331 return (foundnode);
334 cpuid_fail:
335 if (n_cpunodes > 0)
336 md_free_scan_dag(md, &cpunodes);
337 return (MDE_INVAL_ELEM_COOKIE);
340 static int
341 mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
343 mde_cookie_t *nodes;
344 uint64_t latency, lowest_latency;
345 uint64_t address_match, lowest_address_match;
346 int n_lgroups, j, result = 0;
348 /* Find lgroup nodes reachable from this cpu */
349 n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350 "fwd", &nodes);
352 lowest_latency = ~(0UL);
354 /* Find the lgroup node with the smallest latency */
355 for (j = 0; j < n_lgroups; j++) {
356 result = get_int(md, nodes[j], PROP_LG_LATENCY,
357 &latency);
358 result |= get_int(md, nodes[j], PROP_LG_MATCH,
359 &address_match);
360 if (result != 0) {
361 j = -1;
362 goto to_lgrp_done;
364 if (latency < lowest_latency) {
365 lowest_latency = latency;
366 lowest_address_match = address_match;
369 for (j = 0; j < n_lgrpnodes; j++) {
370 if ((mpo_lgroup[j].latency == lowest_latency) &&
371 (mpo_lgroup[j].addr_match == lowest_address_match))
372 break;
374 if (j == n_lgrpnodes)
375 j = -1;
377 to_lgrp_done:
378 if (n_lgroups > 0)
379 md_free_scan_dag(md, &nodes);
380 return (j);
383 /* Called when DR'ing in a CPU */
384 void
385 mpo_cpu_add(md_t *md, int cpuid)
387 mde_cookie_t cpunode;
389 int i;
391 if (n_lgrpnodes <= 0)
392 return;
394 if (md == NULL)
395 goto add_fail;
397 cpunode = cpuid_to_cpunode(md, cpuid);
398 if (cpunode == MDE_INVAL_ELEM_COOKIE)
399 goto add_fail;
401 i = mpo_cpu_to_lgroup(md, cpunode);
402 if (i == -1)
403 goto add_fail;
405 mpo_cpu[cpuid].lgrp_index = i;
406 mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407 mpo_lgroup[i].ncpu++;
408 mpo_update_tunables();
409 return;
410 add_fail:
411 panic("mpo_cpu_add: Cannot read MD");
414 /* Called when DR'ing out a CPU */
415 void
416 mpo_cpu_remove(int cpuid)
418 int i;
420 if (n_lgrpnodes <= 0)
421 return;
423 i = mpo_cpu[cpuid].lgrp_index;
424 mpo_lgroup[i].ncpu--;
425 mpo_cpu[cpuid].home = 0;
426 mpo_cpu[cpuid].lgrp_index = -1;
427 mpo_update_tunables();
430 static mde_cookie_t
431 md_get_root(md_t *md)
433 mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
434 int n_nodes;
436 n_nodes = md_node_count(md);
438 if (n_nodes <= 0) {
439 MPO_STATUS("md_get_root: No nodes in node count\n");
440 return (root);
443 root = md_root_node(md);
445 if (root == MDE_INVAL_ELEM_COOKIE) {
446 MPO_STATUS("md_get_root: Root node is missing\n");
447 return (root);
450 MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
451 MPO_DEBUG("md_get_root: md: %p\n", md);
452 MPO_DEBUG("md_get_root: root: %lx\n", root);
453 done:
454 return (root);
457 static int
458 lgrp_update(md_t *md, mde_cookie_t root)
460 int i, j, result;
461 int ret_val = 0;
462 int sub_page_fix;
463 mde_cookie_t *nodes, *lgrpnodes;
465 n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
466 "fwd", &lgrpnodes);
468 if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
469 MPO_STATUS("lgrp_update: No Lgroups\n");
470 ret_val = -1;
471 goto fail;
474 MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
476 for (i = 0; i < n_lgrpnodes; i++) {
477 mpo_lgroup[i].node = lgrpnodes[i];
478 mpo_lgroup[i].id = i;
479 mpo_lgroup[i].ncpu = 0;
480 result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
481 &mpo_lgroup[i].addr_mask);
482 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
483 &mpo_lgroup[i].addr_match);
486 * If either the mask or match properties are missing, set to 0
488 if (result < 0) {
489 mpo_lgroup[i].addr_mask = 0;
490 mpo_lgroup[i].addr_match = 0;
493 /* Set latency to 0 if property not present */
495 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
496 &mpo_lgroup[i].latency);
497 if (result < 0)
498 mpo_lgroup[i].latency = 0;
502 * Sub-page level interleave is not yet supported. Check for it,
503 * and remove sub-page interleaved lgroups from mpo_lgroup and
504 * n_lgrpnodes. If no lgroups are left, return.
507 sub_page_fix = fix_interleave();
508 if (n_lgrpnodes == 0) {
509 ret_val = -1;
510 goto fail;
513 /* Ensure that all of the addr_mask values are the same */
515 for (i = 0; i < n_lgrpnodes; i++) {
516 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
517 MPO_STATUS("lgrp_update: "
518 "addr_mask values are not the same\n");
519 ret_val = -1;
520 goto fail;
525 * Ensure that all lgrp nodes see all the mblocks. However, if
526 * sub-page interleave is being fixed, they do not, so skip
527 * the check.
530 if (sub_page_fix == 0) {
531 for (i = 0; i < n_lgrpnodes; i++) {
532 j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
533 PROP_LG_MBLOCK, "fwd", &nodes);
534 md_free_scan_dag(md, &nodes);
535 if (j != n_mblocks) {
536 MPO_STATUS("lgrp_update: "
537 "sub-page interleave is being fixed\n");
538 ret_val = -1;
539 goto fail;
543 fail:
544 if (n_lgrpnodes > 0) {
545 md_free_scan_dag(md, &lgrpnodes);
546 for (i = 0; i < n_lgrpnodes; i++)
547 mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
550 return (ret_val);
555 * Traverse the MD to determine:
557 * Number of CPU nodes, lgrp_nodes, and mblocks
558 * Then for each lgrp_node, obtain the appropriate data.
559 * For each CPU, determine its home locality and store it.
560 * For each mblock, retrieve its data and store it.
562 static int
563 lgrp_traverse(md_t *md)
565 mde_cookie_t root, *cpunodes, *mblocknodes;
566 int o;
567 uint64_t i, k, stripe, stride;
568 uint64_t mem_lg_homeset = 0;
569 int ret_val = 0;
570 int result = 0;
571 int n_cpunodes = 0;
572 mpo_config_t new_config;
574 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
575 ret_val = -1;
576 goto fail;
579 n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
580 &mblocknodes);
581 if (n_mblocks <= 0) {
582 MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
583 "Descriptor\n");
584 ret_val = -1;
585 goto fail;
589 * Build the Memory Nodes. Do this before any possibility of
590 * bailing from this routine so we obtain ra_to_pa (needed for page
591 * coloring) even when there are no lgroups defined.
593 if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
594 ret_val = -1;
595 goto fail;
598 mblock_update(&new_config, md, mblocknodes);
599 mblock_install(&new_config);
601 /* Page coloring hook is required so we can iterate through mnodes */
602 if (&page_next_pfn_for_color_cpu == NULL) {
603 MPO_STATUS("lgrp_traverse: No page coloring support\n");
604 ret_val = -1;
605 goto fail;
608 /* Global enable for mpo */
609 if (sun4v_mpo_enable == 0) {
610 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
611 ret_val = -1;
612 goto fail;
615 n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
617 if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
618 MPO_STATUS("lgrp_traverse: No CPU nodes detected "
619 "in MD\n");
620 ret_val = -1;
621 goto fail;
624 MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
626 if ((ret_val = lgrp_update(md, root)) == -1)
627 goto fail;
630 * Use the address mask from the first lgroup node
631 * to establish our home_mask.
633 home_mask = mpo_lgroup[0].addr_mask;
634 home_mask_pfn = btop(home_mask);
635 home_mask_shift = lowbit(home_mask) - 1;
636 home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
637 mnode_pages = btop(1ULL << home_mask_shift);
640 * How many values are possible in home mask? Assume the mask
641 * bits are contiguous.
643 max_locality_groups =
644 1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
646 stripe_shift = highbit(max_locality_groups) - 1;
647 stripe = ptob(mnode_pages);
648 stride = max_locality_groups * stripe;
649 mnode_stride = btop(stride);
651 /* Now verify the home mask bits are contiguous */
653 if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
654 MPO_STATUS("lgrp_traverse: "
655 "home mask bits are not contiguous\n");
656 ret_val = -1;
657 goto fail;
660 /* Record all of the home bits */
662 for (i = 0; i < n_lgrpnodes; i++) {
663 HOMESET_ADD(mem_lg_homeset,
664 mpo_lgroup[i].addr_match >> home_mask_shift);
667 	/* Count the number of different "home" mem_lg's we've discovered */
669 n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
671 /* If we have only 1 locality group then we can exit */
672 if (n_locality_groups == 1) {
673 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
674 ret_val = -1;
675 goto fail;
679 * Set the latencies. A CPU's lgroup is defined by the lowest
680 * latency found. All other memory is considered remote, and the
681 * remote latency is represented by the highest latency found.
682 * Thus hierarchical lgroups, if any, are approximated by a
683 * two level scheme.
685 * The Solaris MPO framework by convention wants to see latencies
686 * in units of nano-sec/10. In the MD, the units are defined to be
687 * pico-seconds.
690 lower_latency = mpo_lgroup[0].latency;
691 higher_latency = mpo_lgroup[0].latency;
693 for (i = 1; i < n_lgrpnodes; i++) {
694 if (mpo_lgroup[i].latency < lower_latency) {
695 lower_latency = mpo_lgroup[i].latency;
697 if (mpo_lgroup[i].latency > higher_latency) {
698 higher_latency = mpo_lgroup[i].latency;
701 lower_latency /= 10000;
702 higher_latency /= 10000;
704 /* Clear our CPU data */
706 for (i = 0; i < NCPU; i++) {
707 mpo_cpu[i].home = 0;
708 mpo_cpu[i].lgrp_index = -1;
711 /* Build the CPU nodes */
712 for (i = 0; i < n_cpunodes; i++) {
714 /* Read in the lgroup nodes */
715 result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
716 if (result < 0) {
717 MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
718 ret_val = -1;
719 goto fail;
722 o = mpo_cpu_to_lgroup(md, cpunodes[i]);
723 if (o == -1) {
724 ret_val = -1;
725 goto fail;
727 mpo_cpu[k].lgrp_index = o;
728 mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
729 mpo_lgroup[o].ncpu++;
731 /* Validate that no large pages cross mnode boundaries. */
732 if (valid_pages(md, cpunodes[0]) == 0) {
733 ret_val = -1;
734 goto fail;
737 fail:
738 if (n_cpunodes > 0)
739 md_free_scan_dag(md, &cpunodes);
740 if (n_mblocks > 0)
741 md_free_scan_dag(md, &mblocknodes);
742 else
743 panic("lgrp_traverse: No memory blocks found");
745 if (ret_val == 0) {
746 MPO_STATUS("MPO feature is enabled.\n");
747 } else
748 sun4v_mpo_enable = 0; /* set this for DR */
750 return (ret_val);
754 * Determine the number of unique mem_lg's present in our system
756 static int
757 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
759 int homeid;
760 int count = 0;
763 * Scan the "home" bits of the mem_lgs, count
764 * the number that are unique.
767 for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769 count++;
773 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774 mem_lg_homeset);
775 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
777 /* Default must be at least one */
778 if (count == 0)
779 count = 1;
781 return (count);
785 * Platform specific lgroup initialization
787 void
788 plat_lgrp_init(void)
790 md_t *md;
791 int rc;
793 /* Get the Machine Descriptor handle */
795 md = md_get_handle();
797 	/* If we cannot get one, we cannot continue */
799 if (md == NULL) {
800 panic("cannot access machine descriptor\n");
801 } else {
802 rc = lgrp_traverse(md);
803 (void) md_fini_handle(md);
807 * If we can't process the MD for lgroups then at least let the
808 * system try to boot. Assume we have one lgroup so that
809 * when plat_build_mem_nodes is called, it will attempt to init
810 * an mnode based on the supplied memory segment.
813 if (rc == -1) {
814 home_mask_pfn = 0;
815 max_locality_groups = 1;
816 n_locality_groups = 1;
817 return;
820 mem_node_pfn_shift = 0;
821 mem_node_physalign = 0;
823 /* Use lgroup-aware TSB allocations */
824 tsb_lgrp_affinity = 1;
826 /* Require that a home lgroup have some memory to be chosen */
827 lgrp_mem_free_thresh = 1;
829 /* Standard home-on-next-touch policy */
830 lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
832 /* Disable option to choose root lgroup if all leaf lgroups are busy */
833 lgrp_load_thresh = UINT32_MAX;
835 mpo_update_tunables();
839 * Helper routine for debugging calls to mem_node_add_slice()
841 static void
842 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
844 #if defined(DEBUG) && !defined(lint)
845 static int slice_count = 0;
847 slice_count++;
848 MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
849 slice_count, basepfn, endpfn);
850 #endif
851 mem_node_add_slice(basepfn, endpfn);
854 static void
855 mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
857 #if defined(DEBUG) && !defined(lint)
858 static int slice_count = 0;
860 slice_count++;
861 MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n",
862 slice_count, basepfn, endpfn);
863 #endif
864 mem_node_del_slice(basepfn, endpfn);
868 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
870 static void
871 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
873 MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874 "mnode index: %d\n", plathand, mnode);
875 plat_assign_lgrphand_to_mem_node(plathand, mnode);
879 * plat_build_mem_nodes()
881 * Define the mem_nodes based on the modified boot memory list,
882 * or based on info read from the MD in plat_lgrp_init().
884 * When the home mask lies in the middle of the address bits (as it does on
885 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886 * it is striped across an mblock in a repeating pattern of contiguous memory
887 * followed by a gap. The stripe width is the size of the contiguous piece.
888 * The stride is the distance from the start of one contiguous piece to the
889 * start of the next. The gap is thus stride - stripe_width.
891 * The stripe of an mnode that falls within an mblock is described by the type
892 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The
893 * mem_stripe_t's are kept in a global array mem_stripes[]. The index into
894 * this array is predetermined. The mem_stripe_t that describes mnode m
895 * within mpo_mblock[i] is stored at
896 * mem_stripes[ m + i * max_locality_groups ]
898 * max_locality_groups is the total number of possible locality groups,
899 * as defined by the size of the home mask, even if the memory assigned
900 * to the domain is small and does not cover all the lgroups. Thus some
901 * mem_stripe_t's may be empty.
903 * The members of mem_stripe_t are:
904 * physbase: First valid page in mem_node in the corresponding mblock
905 * physmax: Last valid page in mem_node in mblock
906 * offset: The full stripe width starts at physbase - offset.
907 * Thus if offset is non-zero, this mem_node starts in the middle
908 * of a stripe width, and the second full stripe starts at
909 * physbase - offset + stride. (even though physmax may fall in the
910 * middle of a stripe width, we do not save the ending fragment size
911 * in this data structure.)
912 * exists: Set to 1 if the mblock has memory in this mem_node stripe.
914 * The stripe width is kept in the global mnode_pages.
915 * The stride is kept in the global mnode_stride.
916 * All the above use pfn's as the unit.
918 * As an example, the memory layout for a domain with 2 mblocks and 4
919 * mem_nodes 0,1,2,3 could look like this:
921 * 123012301230 ... 012301230123 ...
922 * mblock 0 mblock 1
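/*
 * Illustrative sketch only (hypothetical helper): the predetermined
 * stripe lookup described above, one mem_stripe_t per mnode per mblock,
 * stored mblock-major in mem_stripes[].
 */
static mem_stripe_t *
stripe_lookup_example(int mblock_index, int mnode)
{
	return (&mem_stripes[mnode + mblock_index * max_locality_groups]);
}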
925 /*ARGSUSED*/
926 void
927 plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
929 int elem;
930 uint64_t base, len;
932 /* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933 max_mem_nodes = max_locality_groups;
935 mstripe_update(&mpo_config);
937 /* Check for non-MPO sun4v platforms */
938 if (n_locality_groups <= 1) {
939 mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940 for (elem = 0; elem < nelems; list++, elem++) {
941 base = list->addr;
942 len = list->size;
944 mpo_mem_node_add_slice(btop(base),
945 btop(base + len - 1));
947 mem_node_pfn_shift = 0;
948 mem_node_physalign = 0;
949 } else
950 mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
953 * Indicate to vm_pagelist that the hpm_counters array
954 * should be shared because the ranges overlap.
956 if (max_mem_nodes > 1) {
957 interleaved_mnodes = 1;
962 * Return the locality group value for the supplied processor
964 lgrp_handle_t
965 plat_lgrp_cpu_to_hand(processorid_t id)
967 lgrp_handle_t lgrphand;
969 mpo_rd_lock();
970 if (n_locality_groups > 1) {
971 lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972 } else {
973 lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
975 mpo_rd_unlock();
977 return (lgrphand);
981 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
984 * Return min remote latency when there are more than two lgroups
985 * (root and child) and getting latency between two different lgroups
986 * or root is involved.
988 if (lgrp_optimizations() && (from != to ||
989 from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990 return ((int)higher_latency);
991 } else {
992 return ((int)lower_latency);
997 plat_pfn_to_mem_node(pfn_t pfn)
999 int i, mnode;
1000 pfn_t ra_to_pa_pfn;
1001 struct mblock_md *mb;
1003 if (n_locality_groups <= 1)
1004 return (0);
1007 * The mnode is defined to be 1:1 with the lgroup handle, which
1008  * is taken from the home bits. Find the mblock in which
1009 * the pfn falls to get the ra_to_pa adjustment, and extract
1010 * the home bits.
1012 mpo_rd_lock();
1013 mb = &mpo_mblock[0];
1014 for (i = 0; i < n_mblocks; i++) {
1015 if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016 ra_to_pa_pfn = btop(mb->ra_to_pa);
1017 mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018 home_mask_pfn_shift);
1019 ASSERT(mnode < max_mem_nodes);
1020 mpo_rd_unlock();
1021 return (mnode);
1023 mb++;
1026 panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027 return (pfn);
1031 * plat_rapfn_to_papfn
1033 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034 * and home mask bits are correct. The upper bits do not necessarily
1035 * match the actual PA, however.
1037 pfn_t
1038 plat_rapfn_to_papfn(pfn_t pfn)
1040 int i;
1041 pfn_t ra_to_pa_pfn;
1042 struct mblock_md *mb;
1044 ASSERT(n_mblocks > 0);
1045 if (n_mblocks == 1)
1046 return (pfn + base_ra_to_pa_pfn);
1049 * Find the mblock in which the pfn falls
1050 * in order to get the ra_to_pa adjustment.
1052 mpo_rd_lock();
1053 for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054 if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055 ra_to_pa_pfn = btop(mb->ra_to_pa);
1056 mpo_rd_unlock();
1057 return (pfn + ra_to_pa_pfn);
1061 panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062 return (pfn);
1066 * plat_mem_node_iterator_init()
1067 * Initialize cookie "it" to iterate over pfn's in an mnode. There is
1068 * no additional iterator function. The caller uses the info from
1069 * the iterator structure directly.
1071 * pfn: starting pfn.
1072 * mnode: desired mnode.
1073 * szc: desired page size.
1074 * init:
1075 * if 1, start a new traversal, initialize "it", find first
1076 * mblock containing pfn, and return its starting pfn
1077 * within the mnode.
1078 * if 0, continue the previous traversal using passed-in data
1079 * from "it", advance to the next mblock, and return its
1080 * starting pfn within the mnode.
1081 * it: returns readonly data to the caller; see below.
1083 * The input pfn must be aligned for the page size szc.
1085 * Returns: starting pfn for the iteration for the mnode/mblock,
1086 * which is aligned according to the page size,
1087 * or returns (pfn_t)(-1) if the input pfn lies past the last
1088 * valid pfn of the mnode.
1089 * Returns misc values in the "it" struct that allows the caller
1090 * to advance the pfn within an mblock using address arithmetic;
1091 * see definition of mem_node_iterator_t in vm_dep.h.
1092 * When the caller calculates a pfn that is greater than the
1093 * returned value it->mi_mblock_end, the caller should again
1094 * call plat_mem_node_iterator_init, passing init=0.
1096 * The last mblock in continuation case may be invalid because
1097 * of memory DR. To detect this situation mi_genid is checked
1098 * against mpo_genid which is incremented after a memory DR
1099 * operation. See also plat_slice_add()/plat_slice_del().
1101 pfn_t
1102 plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103 mem_node_iterator_t *it, int init)
1105 int i;
1106 pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107 struct mblock_md *mblock;
1108 pfn_t base, end;
1109 mem_stripe_t *ms;
1110 uint64_t szcpagesize;
1112 ASSERT(it != NULL);
1113 ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114 ASSERT(n_mblocks > 0);
1115 ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1117 mpo_rd_lock();
1119 if (init || (it->mi_genid != mpo_genid)) {
1120 it->mi_genid = mpo_genid;
1121 it->mi_last_mblock = 0;
1122 it->mi_init = 1;
1125 /* Check if mpo is not enabled and we only have one mblock */
1126 if (n_locality_groups == 1 && n_mblocks == 1) {
1127 if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128 pfn = (pfn_t)-1;
1129 goto done;
1131 it->mi_mnode = mnode;
1132 it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133 it->mi_mnode_pfn_mask = 0;
1134 it->mi_mnode_pfn_shift = 0;
1135 it->mi_mnode_mask = 0;
1136 it->mi_mblock_base = mem_node_config[mnode].physbase;
1137 it->mi_mblock_end = mem_node_config[mnode].physmax;
1138 if (pfn < it->mi_mblock_base)
1139 pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140 if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141 pfn = (pfn_t)-1;
1142 goto done;
1145 /* init=1 means begin iterator, init=0 means continue */
1146 if (init == 1) {
1147 i = 0;
1148 } else {
1149 ASSERT(it->mi_last_mblock < n_mblocks);
1150 i = it->mi_last_mblock;
1151 ASSERT(pfn >
1152 mem_stripes[i * max_locality_groups + mnode].physmax);
1153 if (++i == n_mblocks) {
1154 pfn = (pfn_t)-1;
1155 goto done;
1160 	 * Find the mblock that contains pfn for this mnode's stripe, or the first
1161 	 * such mblock after pfn; otherwise pfn is out of bounds and we return -1.
1162 * mblocks and stripes are sorted in ascending address order.
1164 szcpagesize = szcpgcnt << PAGESHIFT;
1165 for (; i < n_mblocks; i++) {
1166 if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167 continue;
1168 ms = &mem_stripes[i * max_locality_groups + mnode];
1169 if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170 (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171 ms->physmax)
1172 break;
1174 if (i == n_mblocks) {
1175 it->mi_last_mblock = i - 1;
1176 pfn = (pfn_t)-1;
1177 goto done;
1180 it->mi_last_mblock = i;
1182 mblock = &mpo_mblock[i];
1183 base = ms->physbase;
1184 end = ms->physmax;
1186 it->mi_mnode = mnode;
1187 it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188 it->mi_mblock_base = base;
1189 it->mi_mblock_end = end;
1190 it->mi_mnode_pfn_mask = home_mask_pfn; /* is 0 for non-MPO case */
1191 it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192 it->mi_mnode_mask = max_locality_groups - 1;
1193 if (pfn < base) {
1194 pfn = P2ROUNDUP(base, szcpgcnt);
1195 ASSERT(pfn + szcpgcnt - 1 <= end);
1197 ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198 done:
1199 mpo_rd_unlock();
1200 return (pfn);
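/*
 * Illustrative sketch only (hypothetical walker; the real callers live
 * in the page freelist code): driving the iterator above.  Per the
 * contract, restart with init=0 once a computed pfn passes
 * it.mi_mblock_end, and stop on (pfn_t)-1.
 */
static void
mnode_walk_example(int mnode, uchar_t szc)
{
	mem_node_iterator_t it;
	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
	pfn_t pfn;

	pfn = plat_mem_node_iterator_init(0, mnode, szc, &it, 1);
	while (pfn != (pfn_t)-1) {
		/* ... use pages [pfn, pfn + szcpgcnt - 1] ... */
		pfn += szcpgcnt;
		if (pfn > it.mi_mblock_end)
			pfn = plat_mem_node_iterator_init(pfn, mnode,
			    szc, &it, 0);
	}
}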
1204 * plat_mem_node_intersect_range()
1206 * Find the intersection between a memnode and a range of pfn's.
1208 void
1209 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210 int mnode, pgcnt_t *npages_out)
1212 pfn_t offset, len, hole, base, end, test_end, frag;
1213 pfn_t nearest;
1214 mem_stripe_t *ms;
1215 int i, npages;
1217 *npages_out = 0;
1219 if (!mem_node_config[mnode].exists || test_len == 0)
1220 return;
1222 base = mem_node_config[mnode].physbase;
1223 end = mem_node_config[mnode].physmax;
1225 test_end = test_base + test_len - 1;
1226 if (end < test_base || base > test_end)
1227 return;
1229 if (n_locality_groups == 1) {
1230 *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231 return;
1234 hole = mnode_stride - mnode_pages;
1235 npages = 0;
1238 * Iterate over all the stripes for this mnode (one per mblock),
1239 * find the intersection with each, and accumulate the intersections.
1241 	 * Determining the intersection with a stripe is tricky. If base or end
1242 * fall outside the mem_node bounds, round them to physbase/physmax of
1243 * mem_node. If base or end fall in a gap, round them to start of
1244 * nearest stripe. If they fall within a stripe, keep base or end,
1245 * but calculate the fragment size that should be excluded from the
1246 * stripe. Calculate how many strides fall in the adjusted range,
1247 * multiply by stripe width, and add the start and end fragments.
1250 mpo_rd_lock();
1251 for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252 ms = &mem_stripes[i];
1253 if (ms->exists &&
1254 test_base <= (end = ms->physmax) &&
1255 test_end >= (base = ms->physbase)) {
1257 offset = ms->offset;
1259 if (test_base > base) {
1260 /* Round test_base to next multiple of stride */
1261 len = P2ROUNDUP(test_base - (base - offset),
1262 mnode_stride);
1263 nearest = base - offset + len;
1265 * Compute distance from test_base to the
1266 * stride boundary to see if test_base falls
1267 * in the stripe or in the hole.
1269 if (nearest - test_base > hole) {
1271 * test_base lies in stripe,
1272 * and offset should be excluded.
1274 offset = test_base -
1275 (nearest - mnode_stride);
1276 base = test_base;
1277 } else {
1278 /* round up to next stripe start */
1279 offset = 0;
1280 base = nearest;
1281 if (base > end)
1282 continue;
1287 if (test_end < end)
1288 end = test_end;
1289 end++; /* adjust to an exclusive bound */
1291 /* Round end to next multiple of stride */
1292 len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293 nearest = (base - offset) + len;
1294 if (nearest - end <= hole) {
1295 /* end falls in hole, use entire last stripe */
1296 frag = 0;
1297 } else {
1298 /* end falls in stripe, compute fragment */
1299 frag = nearest - hole - end;
1302 len = (len >> stripe_shift) - offset - frag;
1303 npages += len;
1307 *npages_out = npages;
1308 mpo_rd_unlock();
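/*
 * Worked example of the accumulation above, with assumed geometry:
 * stripe width mnode_pages = 0x100, stride mnode_stride = 0x400
 * (stripe_shift = 2, hole = 0x300), and a stripe with physbase 0x1000,
 * physmax 0x18ff, offset 0 (pieces at 0x1000, 0x1400, 0x1800).
 * Intersecting with [0x1000, 0x18ff]: end becomes 0x1900 (exclusive),
 * len = P2ROUNDUP(0x900, 0x400) = 0xc00, nearest = 0x1c00, and
 * nearest - end = 0x300 = hole, so frag = 0.  Then
 * npages += (0xc00 >> 2) - 0 - 0 = 0x300, i.e. three full stripe
 * widths, as expected.
 */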
1312 * valid_pages()
1314 * Return 1 if pages are valid and do not cross mnode boundaries
1315 * (which would break page free list assumptions), and 0 otherwise.
1318 #define MNODE(pa) \
1319 ((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1321 static int
1322 valid_pages(md_t *md, mde_cookie_t cpu0)
1324 int i, max_szc;
1325 uint64_t last_page_base, szc_mask;
1326 uint64_t max_page_len, max_coalesce_len;
1327 struct mblock_md *mb = mpo_mblock;
1330 	 * Use the smaller of the largest page size possible and the largest supported.
1331 * mmu_exported_pagesize_mask is not yet initialized, so read
1332 * it from the MD. Apply minimal fixups in case of broken MDs
1333 * to get a sane mask.
1336 if (cpu0 == NULL)
1337 szc_mask = szc_mask0;
1338 else {
1339 if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340 szc_mask = 0;
1341 /* largest in sun4v default support */
1342 szc_mask |= (1 << TTE4M);
1343 szc_mask0 = szc_mask;
1345 max_szc = highbit(szc_mask) - 1;
1346 if (max_szc > TTE256M)
1347 max_szc = TTE256M;
1348 max_page_len = TTEBYTES(max_szc);
1351 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353 * within one mnode to use MPO.
1355 max_coalesce_len = TTEBYTES(TTE256M);
1356 ASSERT(max_coalesce_len >= max_page_len);
1358 if (ptob(mnode_pages) < max_coalesce_len) {
1359 MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360 "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361 return (0);
1364 for (i = 0; i < n_mblocks; i++) {
1365 uint64_t base = mb->base;
1366 uint64_t end = mb->base + mb->size - 1;
1367 uint64_t ra_to_pa = mb->ra_to_pa;
1370 * If mblock is smaller than the max page size, then
1371 * RA = PA mod MAXPAGE is not guaranteed, but it must
1372 * not span mnodes.
1374 if (mb->size < max_page_len) {
1375 if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376 MPO_STATUS("Small mblock spans mnodes; "
1377 "MPO disabled: base = %lx, end = %lx, "
1378 "ra2pa = %lx\n", base, end, ra_to_pa);
1379 return (0);
1381 } else {
1382 /* Verify RA = PA mod MAXPAGE, using coalesce size */
1383 uint64_t pa_base = base + ra_to_pa;
1384 if ((base & (max_coalesce_len - 1)) !=
1385 (pa_base & (max_coalesce_len - 1))) {
1386 MPO_STATUS("bad page alignment; MPO disabled: "
1387 "ra = %lx, pa = %lx, pagelen = %lx\n",
1388 base, pa_base, max_coalesce_len);
1389 return (0);
1394 * Find start of last large page in mblock in RA space.
1395 * If page extends into the next mblock, verify the
1396 * mnode does not change.
1398 last_page_base = P2ALIGN(end, max_coalesce_len);
1399 if (i + 1 < n_mblocks &&
1400 last_page_base + max_coalesce_len > mb[1].base &&
1401 MNODE(last_page_base + ra_to_pa) !=
1402 MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403 MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404 "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405 "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406 mb[1].ra_to_pa, max_coalesce_len);
1407 return (0);
1410 mb++;
1412 return (1);
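/*
 * Worked example of the RA = PA mod MAXPAGE check above, with assumed
 * values: an mblock with base (RA) 0x10000000 and ra_to_pa 0x30000000
 * gives pa_base 0x40000000; base and pa_base agree in their low 28 bits
 * (max_coalesce_len - 1 for 256M pages), so the check passes.  With
 * ra_to_pa = 0x28000000 the low bits differ (0x0 vs 0x8000000), a 256M
 * page's RA and PA alignment would disagree, and MPO is disabled.
 */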
1417 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418 * if any, and remove them. This yields a config where the "coarse
1419 * grained" lgroups cover all of memory, even though part of that memory
1420 * is fine grain interleaved and does not deliver a purely local memory
1421 * latency.
1423 * This function reads and modifies the globals:
1424 * mpo_lgroup[], n_lgrpnodes
1426 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1429 static int
1430 fix_interleave(void)
1432 int i, j;
1433 uint64_t mask = 0;
1435 j = 0;
1436 for (i = 0; i < n_lgrpnodes; i++) {
1437 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438 /* remove this lgroup */
1439 mask = mpo_lgroup[i].addr_mask;
1440 } else {
1441 mpo_lgroup[j++] = mpo_lgroup[i];
1444 n_lgrpnodes = j;
1446 if (mask != 0)
1447 MPO_STATUS("sub-page interleave %lx found; "
1448 "removing lgroup.\n", mask);
1450 return (mask != 0);
1454 * mblock_alloc
1456  * Allocate memory for the mblock and stripe arrays from either static or
1457 * dynamic space depending on utype, and return the result in mc.
1458 * Returns 0 on success and -1 on error.
1461 static int
1462 mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1464 mblock_md_t *mb = NULL;
1465 mem_stripe_t *ms = NULL;
1466 int nstripes = MAX_MEM_NODES * nmblocks;
1467 size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468 size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469 size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1472 * Allocate space for mblocks and mstripes.
1474 * For DR allocations, just use kmem_alloc(), and set
1475 * mc_alloc_sz to indicate it was used.
1477 * For boot allocation:
1478 * If we have a small number of mblocks we will use the space
1479 * that we preallocated. Otherwise, we will dynamically
1480 * allocate the space from the prom and map it to the
1481 * reserved VA at MPOBUF_BASE.
1484 if (utype == U_ADD || utype == U_DEL) {
1485 mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486 ms = (mem_stripe_t *)(mb + nmblocks);
1487 mc->mc_alloc_sz = allocsz;
1488 } else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489 mb = &small_mpo_mblocks[0];
1490 ms = &small_mem_stripes[0];
1491 mc->mc_alloc_sz = 0;
1492 } else {
1493 		/* Ensure that we don't request more space than reserved */
1494 if (allocsz > MPOBUF_SIZE) {
1495 MPO_STATUS("mblock_alloc: Insufficient space "
1496 "for mblock structures \n");
1497 return (-1);
1499 mb = (struct mblock_md *)
1500 prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501 if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502 MPO_STATUS("mblock_alloc: Cannot allocate space "
1503 "for mblocks \n");
1504 return (-1);
1506 mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507 mpo_heap32_bufsz = MPOBUF_SIZE;
1508 ms = (mem_stripe_t *)(mb + nmblocks);
1509 mc->mc_alloc_sz = 0;
1511 mc->mc_mblocks = mb;
1512 mc->mc_stripes = ms;
1513 mc->mc_nmblocks = nmblocks;
1514 mc->mc_nstripes = nstripes;
1515 MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516 return (0);
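/*
 * Layout note, worked with assumed numbers: for nmblocks = 4 and
 * MAX_MEM_NODES = 8, one allocation holds 4 mblock_md's followed by 32
 * mem_stripe_t's (ms = mb + nmblocks), rounded up to whole pages by
 * mmu_btopr()/mmu_ptob(); mc_alloc_sz is non-zero only for the
 * kmem_zalloc() (DR) case so mblock_free() knows what to free.
 */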
1520 * mblock_free
1522 * Free memory in mc that was allocated by mblock_alloc.
1525 static void
1526 mblock_free(mpo_config_t *mc)
1528 if (mc->mc_alloc_sz > 0) {
1529 ASSERT(mc->mc_mblocks != mpo_mblock);
1530 kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1532 bzero(mc, sizeof (*mc));
1536 * mblock_install
1538 * Install mblock config passed in mc as the global configuration.
1539 * May only be called at boot or while holding mpo_wr_lock.
1542 static void
1543 mblock_install(mpo_config_t *mc)
1545 mpo_mblock = mc->mc_mblocks;
1546 n_mblocks = mc->mc_nmblocks;
1547 mem_stripes = mc->mc_stripes;
1548 n_mem_stripes = mc->mc_nstripes;
1549 base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550 mpo_config = *mc;
1554 * mblock_update
1556 * Traverse mblocknodes, read the mblock properties from the MD, and
1557 * save the mblocks in mc.
1560 static void
1561 mblock_update(mpo_config_t *mc, md_t *md, mde_cookie_t *mblocknodes)
1563 uint64_t i, j;
1564 int result = 0;
1565 mblock_md_t *mblock = mc->mc_mblocks;
1567 for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1569 /* Without a base or size value we will fail */
1570 result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571 &mblock[i].base);
1572 if (result < 0) {
1573 MPO_STATUS("mblock_update: "
1574 "PROP_LG_BASE is missing\n");
1575 mc->mc_nmblocks = 0;
1576 return;
1579 result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580 &mblock[i].size);
1581 if (result < 0) {
1582 MPO_STATUS("mblock_update: "
1583 "PROP_LG_SIZE is missing\n");
1584 mc->mc_nmblocks = 0;
1585 return;
1588 result = get_int(md, mblocknodes[j],
1589 PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1591 /* If we don't have an ra_pa_offset, just set it to 0 */
1592 if (result < 0)
1593 mblock[i].ra_to_pa = 0;
1595 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596 "ra_to_pa = %lx\n", i,
1597 mblock[i].base,
1598 mblock[i].size,
1599 mblock[i].ra_to_pa);
1601 /* check for unsupportable values of base and size */
1602 if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603 MPO_STATUS("mblock_update: "
1604 "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605 "base = %lx, size = %lx\n",
1606 mblock[i].base, mblock[i].size);
1607 mc->mc_nmblocks = 0;
1608 return;
1611 /* eliminate size==0 blocks */
1612 if (mblock[i].size != 0) {
1613 uint64_t base = mblock[i].base;
1614 uint64_t end = base + mblock[i].size;
1615 ASSERT(end > base);
1616 mblock[i].base_pfn = btop(base);
1617 mblock[i].end_pfn = btop(end - 1);
1618 i++;
1622 if (i == 0) {
1623 MPO_STATUS("mblock_update: "
1624 "No non-empty mblock nodes were found "
1625 "in the Machine Descriptor\n");
1626 mc->mc_nmblocks = 0;
1627 return;
1629 ASSERT(i <= mc->mc_nmblocks);
1630 mc->mc_nmblocks = i;
1632 /* Must sort mblocks by address for mem_node_iterator_init() */
1633 mblock_sort(mblock, mc->mc_nmblocks);
1637 * mblock_update_add
1639 * Update mblock config after a memory DR add. The added range is not
1640 * needed, as we read *all* mblock nodes from the MD. Save the mblocks
1641 * in mc.
1644 static void
1645 mblock_update_add(mpo_config_t *mc)
1647 md_t *md;
1648 mde_cookie_t root, *mblocknodes;
1649 int nmblocks = 0;
1651 if ((md = md_get_handle()) == NULL) {
1652 MPO_STATUS("Cannot access Machine Descriptor\n");
1653 goto error;
1656 if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657 goto error;
1659 nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660 &mblocknodes);
1661 if (nmblocks <= 0) {
1662 MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663 goto error;
1666 if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667 goto error;
1669 mblock_update(mc, md, mblocknodes);
1670 md_free_scan_dag(md, &mblocknodes);
1671 (void) md_fini_handle(md);
1672 return;
1673 error:
1674 panic("mblock_update_add: cannot process mblocks from MD.\n");
1678 * mblock_update_del
1680 * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681 * Allocate a new mblock config, copy old config to the new, modify the new
1682 * mblocks to reflect the deletion. The new mblocks are returned in
1683 * mc_new and are not yet installed as the active config.
1686 static void
1687 mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688 pfn_t uend)
1690 int i, j;
1691 pfn_t base, end;
1692 mblock_md_t *mblock;
1693 int nmblocks = mc_old->mc_nmblocks;
1695 MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1698 * Allocate mblocks in mc_new and copy the old to the new.
1699 * Allocate one extra in case the deletion splits an mblock.
1701 if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702 return;
1703 mblock = mc_new->mc_mblocks;
1704 bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1707 * Find the mblock containing the deleted range and adjust it in
1708 * the new config.
1710 for (i = 0; i < nmblocks; i++) {
1712 base = btop(mblock[i].base);
1713 end = base + btop(mblock[i].size) - 1;
1716 * Adjust the mblock based on the subset that was deleted.
1718 * If the entire mblk was deleted, compact the table.
1720 * If the middle of the mblk was deleted, extend
1721 * the table. Space for the new slot was already
1722 * allocated.
1724 		 * The memory to be deleted is an mblock or a subset of one,
1725 		 * and does not span multiple mblocks.
1727 if (base == ubase && end == uend) {
1728 for (j = i; j < nmblocks - 1; j++)
1729 mblock[j] = mblock[j + 1];
1730 nmblocks--;
1731 bzero(&mblock[nmblocks], sizeof (*mblock));
1732 break;
1733 } else if (base < ubase && end > uend) {
1734 for (j = nmblocks - 1; j >= i; j--)
1735 mblock[j + 1] = mblock[j];
1736 mblock[i].size = ptob(ubase - base);
1737 mblock[i].end_pfn = ubase - 1;
1738 mblock[i + 1].base = ptob(uend + 1);
1739 mblock[i + 1].size = ptob(end - uend);
1740 mblock[i + 1].base_pfn = uend + 1;
1741 nmblocks++;
1742 break;
1743 } else if (base == ubase) {
1744 MPO_DEBUG("mblock_update_del: shrink>"
1745 " i=%d base=0x%lx end=0x%lx", i, base, end);
1746 mblock[i].base = ptob(uend + 1);
1747 mblock[i].size -= ptob(uend - ubase + 1);
1748 base = uend + 1;
1749 mblock[i].base_pfn = base;
1750 mblock[i].end_pfn = end;
1751 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752 break;
1753 } else if (end == uend) {
1754 MPO_DEBUG("mblock_update_del: shrink<"
1755 " i=%d base=0x%lx end=0x%lx", i, base, end);
1756 mblock[i].size -= ptob(uend - ubase + 1);
1757 end = ubase - 1;
1758 mblock[i].base_pfn = base;
1759 mblock[i].end_pfn = end;
1760 MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761 break;
1764 mc_new->mc_nmblocks = nmblocks;
1765 ASSERT(end > base);
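/*
 * Worked example (assumed pfn values): deleting [0x1400, 0x17ff] from an
 * mblock spanning pfns [0x1000, 0x1fff] hits the "middle" case above and
 * splits it into [0x1000, 0x13ff] and [0x1800, 0x1fff], using the extra
 * slot allocated for exactly this possibility.  Deleting [0x1000, 0x13ff]
 * instead would hit the "shrink>" case and leave [0x1400, 0x1fff].
 */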
1769 * mstripe_update
1771 * Read mblocks from mc and update mstripes in mc
1774 static void
1775 mstripe_update(mpo_config_t *mc)
1777 lgrp_handle_t lgrphand, lgrp_start;
1778 int i, mnode;
1779 uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
1780 uint64_t stripe, frag, remove;
1781 mem_stripe_t *ms;
1782 mblock_md_t *mblock = mc->mc_mblocks;
1783 int nmblocks = mc->mc_nmblocks;
1784 int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);
1786 /* Check for non-MPO sun4v platforms or memory DR removal */
1787 if (n_locality_groups <= 1) {
1788 ASSERT(n_locality_groups == 1);
1789 ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
1791 if (nmblocks == 1) {
1792 mc->mc_nstripes = 0;
1793 } else {
1794 mc->mc_nstripes = nmblocks;
1795 bzero(mc->mc_stripes, mstripesz);
1796 for (i = 0; i < nmblocks; i++) {
1797 mc->mc_stripes[i].exists = 1;
1798 mc->mc_stripes[i].physbase = mblock[i].base_pfn;
1799 mc->mc_stripes[i].physmax = mblock[i].end_pfn;
1802 return;
1805 bzero(mc->mc_stripes, mstripesz);
1806 mc->mc_nstripes = max_locality_groups * nmblocks;
1807 stripe = ptob(mnode_pages);
1808 stride = max_locality_groups * stripe;
1810 for (i = 0; i < nmblocks; i++) {
1811 base = mblock[i].base;
1812 end = base + mblock[i].size;
1813 ra_to_pa = mblock[i].ra_to_pa;
1815 /* Find the offset from the prev stripe boundary in PA space. */
1816 offset = (base + ra_to_pa) & (stripe - 1);
1818 /* Set the next stripe boundary. */
1819 stripe_end = base - offset + stripe;
1821 lgrp_start = (((base + ra_to_pa) & home_mask) >>
1822 home_mask_shift);
1823 lgrphand = lgrp_start;
1826 * Loop over all lgroups covered by the mblock, creating a
1827 * stripe for each. Stop when lgrp_start is visited again.
1829 do {
1830 /* mblock may not span all lgroups */
1831 if (base >= end)
1832 break;
1834 mnode = lgrphand;
1835 ASSERT(mnode < max_mem_nodes);
1838 * Calculate the size of the fragment that does not
1839 * belong to the mnode in the last partial stride.
1841 frag = (end - (base - offset)) & (stride - 1);
1842 if (frag == 0) {
1843 /* remove the gap */
1844 remove = stride - stripe;
1845 } else if (frag < stripe) {
1846 /* fragment fits in stripe; keep it all */
1847 remove = 0;
1848 } else {
1849 /* fragment is large; trim after whole stripe */
1850 remove = frag - stripe;
1853 ms = &mc->mc_stripes[i * max_locality_groups + mnode];
1854 ms->physbase = btop(base);
1855 ms->physmax = btop(end - 1 - remove);
1856 ms->offset = btop(offset);
1857 ms->exists = 1;
1859 base = stripe_end;
1860 stripe_end += stripe;
1861 offset = 0;
1862 lgrphand = (((base + ra_to_pa) & home_mask) >>
1863 home_mask_shift);
1864 } while (lgrphand != lgrp_start);
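/*
 * Worked example of the fragment trimming above (assumed sizes): with
 * stripe S and stride 4S, an mblock whose mnode range spans two full
 * strides plus a trailing fragment frag = S + S/2 takes
 * remove = frag - S = S/2, so physmax keeps the whole last stripe but
 * drops the S/2 that spills past it.  A fragment of S/2 would take
 * remove = 0 (keep it all), and frag = 0 takes remove = 3S (drop the
 * trailing gap of the final stride).
 */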
1868 #define INTERSECT(a, b, c, d) \
1869 if (((a) >= (c) && (a) <= (d)) || \
1870 ((c) >= (a) && (c) <= (b))) { \
1871 (c) = MAX((a), (c)); \
1872 (d) = MIN((b), (d)); \
1873 } else { \
1874 ASSERT((a) >= (d) || (b) <= (c)); \
1875 continue; \
1879 * mnode_update
1881 * Read stripes from mc and update mnode extents. The mnode extents are
1882 * part of the live configuration, so this can only be done at boot time
1883 * or while holding the mpo_wr_lock.
1886 static void
1887 mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1889 int i, j, mnode, found;
1890 pfn_t base, end;
1891 mem_stripe_t *ms;
1893 MPO_DEBUG("mnode_udpate: basepfn: %lx endpfn: %lx\n", ubase, uend);
1895 if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896 if (utype == U_ADD)
1897 mpo_mem_node_add_slice(ubase, uend);
1898 else if (utype == U_DEL)
1899 mpo_mem_node_del_slice(ubase, uend);
1900 else
1901 panic("mnode update: %d: invalid\n", utype);
1902 return;
1905 found = 0;
1906 for (i = 0; i < mc->mc_nmblocks; i++) {
1907 for (mnode = 0; mnode < max_locality_groups; mnode++) {
1909 j = i * max_locality_groups + mnode;
1910 ms = &mc->mc_stripes[j];
1911 if (!ms->exists)
1912 continue;
1914 base = ms->physbase;
1915 end = ms->physmax;
1918 * Look for the mstripes intersecting this slice.
1920 * The mstripe and slice pairs may not be equal
1921 			 * if a subset of an mblock is added/deleted.
1923 switch (utype) {
1924 case U_ADD:
1925 INTERSECT(ubase, uend, base, end);
1926 /*FALLTHROUGH*/
1927 case U_ADD_ALL:
1928 if (n_locality_groups > 1)
1929 mpo_plat_assign_lgrphand_to_mem_node(
1930 mnode, mnode);
1931 mpo_mem_node_add_slice(base, end);
1932 break;
1933 case U_DEL:
1934 INTERSECT(ubase, uend, base, end);
1935 mpo_mem_node_del_slice(base, end);
1936 break;
1937 default:
1938 panic("mnode_update: %d: invalid\n", utype);
1939 break;
1942 found++;
1946 if (!found)
1947 panic("mnode_update: mstripe not found");
1949 #ifdef DEBUG
1950 if (utype == U_ADD_ALL || utype == U_DEL)
1951 return;
1952 found = 0;
1953 for (i = 0; i < max_mem_nodes; i++) {
1954 if (!mem_node_config[i].exists)
1955 continue;
1956 if (ubase >= mem_node_config[i].physbase &&
1957 ubase <= mem_node_config[i].physmax)
1958 found |= 1;
1959 if (uend >= mem_node_config[i].physbase &&
1960 uend <= mem_node_config[i].physmax)
1961 found |= 2;
1963 ASSERT(found == 3);
1965 pfn_t minpfn, maxpfn;
1967 mem_node_max_range(&minpfn, &maxpfn);
1968 ASSERT(minpfn <= ubase);
1969 ASSERT(maxpfn >= uend);
1971 #endif
1975 * Plat_slice_add()/plat_slice_del() are the platform hooks
1976 * for adding/deleting a pfn range to/from the system.
1978  * plat_slice_add() is used for both the boot and DR cases.
1980 * - Zeus has already added the mblocks to the MD, so read the updated
1981 * MD and allocate all data structures required to manage the new memory
1982 * configuration.
1984 * - Recompute the stripes which are derived from the mblocks.
1986 * - Update (expand) the mnode extents and install the modified mblocks as
1987 * the new mpo config. This must be done while holding the mpo_wr_lock
1988 * to guarantee that no other threads access the mpo meta-data.
1990 * - Unlock MPO data structures; the new config is live. Free the old config.
1992 * Plat_slice_del() is used for DR only.
1994 * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995 * the old mpo mblocks and delete the range from the copy.
1997 * - Recompute the stripes which are derived from the mblocks.
1999 * - Update (shrink) the mnode extents and install the modified mblocks as
2000 * the new mpo config. This must be done while holding the mpo_wr_lock
2001 * to guarantee that no other threads access the mpo meta-data.
2003 * - Unlock MPO data structures; the new config is live. Free the old config.
2006 void
2007 plat_slice_add(pfn_t base, pfn_t end)
2009 mpo_config_t old_config = mpo_config;
2010 mpo_config_t new_config;
2012 VALIDATE_SLICE(base, end);
2013 mblock_update_add(&new_config);
2014 mstripe_update(&new_config);
2015 mpo_wr_lock();
2016 mblock_install(&new_config);
2017 /* Use new config to add all ranges for mnode_update */
2018 mnode_update(&new_config, base, end, U_ADD);
2019 mpo_genid++;
2020 mpo_wr_unlock();
2021 mblock_free(&old_config);
2024 void
2025 plat_slice_del(pfn_t base, pfn_t end)
2027 mpo_config_t old_config = mpo_config;
2028 mpo_config_t new_config;
2030 VALIDATE_SLICE(base, end);
2031 mblock_update_del(&new_config, &old_config, base, end);
2032 mstripe_update(&new_config);
2033 mpo_wr_lock();
2034 /* Use old config to find deleted range for mnode_update */
2035 mnode_update(&old_config, base, end, U_DEL);
2036 mblock_install(&new_config);
2037 mpo_genid++;
2038 mpo_wr_unlock();
2039 mblock_free(&old_config);