arch/x86/kernel/platform/i86pc/os/lgrpplat.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2010, Intel Corporation.
27 * All rights reserved.
31 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
32 * ================================================================
33 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
34 * (NUMA). A NUMA machine consists of one or more "nodes" that each consist of
35 * one or more CPUs and some local memory. The CPUs in each node can access
36 * the memory in the other nodes but at a higher latency than accessing their
37 * local memory. Typically, a system with only one node has Uniform Memory
38 * Access (UMA), but it may be possible to have a one node system that has
39 * some global memory outside of the node which is higher latency.
41 * Module Description
42 * ------------------
43 * This module provides a platform interface for determining which CPUs and
44 * which memory (and how much) are in a NUMA node and how far each node is from
45 * each other. The interface is used by the Virtual Memory (VM) system and the
46 * common lgroup framework. The VM system uses the plat_*() routines to fill
47 * in its memory node (memnode) array with the physical address range spanned
48 * by each NUMA node to know which memory belongs to which node, so it can
49 * build and manage a physical page free list for each NUMA node and allocate
50 * local memory from each node as needed. The common lgroup framework uses the
51 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
52 * to each node (leaf lgroup) and how far each node is from each other, so it
53 * can build the latency (lgroup) topology for the machine in order to optimize
54 * for locality. Also, lgroup platform handles are used instead of lgroups
55 * in the interface with this module, so this module shouldn't need to know
56 * anything about lgroups. Instead, it just needs to know which CPUs, memory,
57 * etc. are in each NUMA node, how far each node is from each other, and to use
58 * a unique lgroup platform handle to refer to each node through the interface.
60 * Determining NUMA Configuration
61 * ------------------------------
62 * By default, this module will try to determine the NUMA configuration of the
63 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
64 * Locality Information Table (SLIT). The SRAT contains info to tell which
65 * CPUs and memory are local to a given proximity domain (NUMA node). The SLIT
66 * is a matrix that gives the distance between each system locality (which is
67 * a NUMA node and should correspond to proximity domains in the SRAT). For
68 * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
69 * specification.
71 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
72 * examine registers in PCI configuration space to determine how many nodes are
73 * in the system and which CPUs and memory are in each node.
74 * This is done while booting the kernel.
76 * NOTE: Using these PCI configuration space registers to determine this
77 * locality info is not guaranteed to work or be compatible across all
78 * Opteron processor families.
80 * If the SLIT does not exist or look right, the kernel will probe to determine
81 * the distance between nodes as long as the NUMA CPU and memory configuration
82 * has been determined (see lgrp_plat_probe() for details).
84 * Data Structures
85 * ---------------
86 * The main data structures used by this code are the following:
88 * - lgrp_plat_cpu_node[] CPU to node ID mapping table indexed by
89 * CPU ID (only used for SRAT)
91 * - lgrp_plat_lat_stats.latencies[][] Table of latencies between same and
92 * different nodes indexed by node ID
94 * - lgrp_plat_node_cnt Number of NUMA nodes in system for
95 * non-DR-capable systems,
96 * maximum possible number of NUMA nodes
97 * in system for DR capable systems.
99 * - lgrp_plat_node_domain[] Node ID to proximity domain ID mapping
100 * table indexed by node ID (only used
101 * for SRAT)
103 * - lgrp_plat_memnode_info[] Table with physical address range for
104 * each memory node indexed by memory node
105 * ID
107 * The code is implemented to make the following always be true:
109 * lgroup platform handle == node ID == memnode ID
111 * Moreover, it allows for the proximity domain ID to be equal to all of the
112 * above as long as the proximity domain IDs are numbered from 0 to <number of
113 * nodes - 1>. This is done by hashing each proximity domain ID into the range
114 * from 0 to <number of nodes - 1>. Then proximity ID N will hash into node ID
115 * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
116 * and be assigned node ID N. If the proximity domain IDs aren't numbered
117 * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
118 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
119 * to node IDs. However, the proximity domain IDs may not map to the
120 * equivalent node ID since we want to keep the node IDs numbered from 0 to
121 * <number of nodes - 1> to minimize cost of searching and potentially space.
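 *
 * As an illustrative example (numbers chosen arbitrarily, not taken from any
 * particular machine): if the SRAT reports proximity domains 0x10, 0x11, 0x12
 * and 0x13 on a 4-node system, then lgrp_plat_prox_domain_min is 0x10 and
 * hashing (domain - 0x10) % 4, as done by NODE_DOMAIN_HASH() below, assigns
 * them node IDs 0, 1, 2 and 3 respectively.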
123 * With the introduction of support of memory DR operations on x86 platforms,
124 * things get a little complicated. The addresses of hot-added memory may not
125 * be contiguous with other memory connected to the same lgrp node. In other
126 * words, memory addresses may get interleaved among lgrp nodes after memory
127 * DR operations. To work around this limitation, we have extended the
128 * relationship between lgrp node and memory node from a 1:1 map to a 1:N map,
129 * which means there may be multiple memory nodes associated with an lgrp node
130 * after memory DR operations.
132 * To minimize the code changes to support memory DR operations, the
133 * following policies have been adopted.
134 * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
135 * node ID and memnode ID is still kept as:
136 * lgroup platform handle == node ID == memnode ID
137 * 2) For memory present at boot time on DR capable platforms, the relationship
138 * is still kept as is.
139 * lgroup platform handle == node ID == memnode ID
140 * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has
141 * been changed from a 1:1 map to a 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt)
142 * are reserved for memory present at boot time, and memnode IDs
143 * [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate
144 * memnode ID for hot-added memory.
145 * 4) All boot code having the assumption "node ID == memnode ID" can live as
146 * is, that's because node ID is always equal to memnode ID at boot time.
147 * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
148 * lgrp_plat_mem_size() related logic has been enhanced to deal with
149 * the 1:N map relationship.
150 * 6) The latency probing related logic, which has the assumption
151 * "node ID == memnode ID" and may be called at run time, is disabled if
152 * memory DR operation is enabled.
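 *
 * A sketch of the resulting memnode ID layout (example numbers only): with
 * lgrp_plat_node_cnt == 4 and memory DR enabled, memnode IDs 0-3 are reserved
 * for memory present at boot (one per lgrp node), while memnode IDs in
 * [4, max_mem_nodes) are handed out on demand to hot-added memory ranges and
 * are linked back to their owning lgrp via the lgrphand field in
 * lgrp_plat_memnode_info[].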
156 #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */
157 #include <sys/atomic.h>
158 #include <sys/bootconf.h>
159 #include <sys/cmn_err.h>
160 #include <sys/controlregs.h>
161 #include <sys/cpupart.h>
162 #include <sys/cpuvar.h>
163 #include <sys/lgrp.h>
164 #include <sys/machsystm.h>
165 #include <sys/memlist.h>
166 #include <sys/memnode.h>
167 #include <sys/mman.h>
168 #include <sys/note.h>
169 #include <sys/pci_cfgspace.h>
170 #include <sys/pci_impl.h>
171 #include <sys/param.h>
172 #include <sys/pghw.h>
173 #include <sys/promif.h> /* for prom_printf() */
174 #include <sys/sysmacros.h>
175 #include <sys/systm.h>
176 #include <sys/thread.h>
177 #include <sys/types.h>
178 #include <sys/var.h>
179 #include <sys/x86_archext.h>
180 #include <vm/hat_i86.h>
181 #include <vm/seg_kmem.h>
182 #include <vm/vm_dep.h>
184 #include <sys/acpidev.h>
185 #include <sys/acpi/acpi.h> /* for SRAT, SLIT and MSCT */
187 /* from fakebop.c */
188 extern ACPI_TABLE_SRAT *srat_ptr;
189 extern ACPI_TABLE_SLIT *slit_ptr;
190 extern ACPI_TABLE_MSCT *msct_ptr;
192 #define MAX_NODES 8
193 #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1)
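/*
 * For example, with MAX_NODES == 8 this allows for at most
 * 8 * (8 - 1) + 1 = 57 lgroups; lgrp_plat_max_lgrps() below uses the same
 * formula when estimating the maximum number of lgroups.
 */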
196 * Constants for configuring probing
198 #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */
199 #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */
200 #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */
203 * Flags for probing
205 #define LGRP_PLAT_PROBE_ENABLE 0x1 /* enable probing */
206 #define LGRP_PLAT_PROBE_PGCPY 0x2 /* probe using page copy */
207 #define LGRP_PLAT_PROBE_VENDOR 0x4 /* probe vendor ID register */
210 * Hash proximity domain ID into node to domain mapping table "mod" number of
211 * nodes to minimize span of entries used and try to have lowest numbered
212 * proximity domain be node 0
214 #define NODE_DOMAIN_HASH(domain, node_cnt) \
215 ((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \
216 ((domain) - lgrp_plat_prox_domain_min) % node_cnt)
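/*
 * Note that distinct proximity domain IDs can hash to the same slot; the
 * lookup routines (see lgrp_plat_domain_to_node() below) resolve such
 * collisions by linearly probing forward from the hashed index, modulo
 * node_cnt.
 */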
219 * CPU to node ID mapping structure (only used with SRAT)
221 typedef struct cpu_node_map {
222 int exists;
223 uint_t node;
224 uint32_t apicid;
225 uint32_t prox_domain;
226 } cpu_node_map_t;
229 * Latency statistics
231 typedef struct lgrp_plat_latency_stats {
232 hrtime_t latencies[MAX_NODES][MAX_NODES];
233 hrtime_t latency_max;
234 hrtime_t latency_min;
235 } lgrp_plat_latency_stats_t;
238 * Memory configuration for probing
240 typedef struct lgrp_plat_probe_mem_config {
241 size_t probe_memsize; /* how much memory to probe per node */
242 caddr_t probe_va[MAX_NODES]; /* where memory mapped for probing */
243 pfn_t probe_pfn[MAX_NODES]; /* physical pages to map for probing */
244 } lgrp_plat_probe_mem_config_t;
247 * Statistics kept for probing
249 typedef struct lgrp_plat_probe_stats {
250 hrtime_t flush_cost;
251 hrtime_t probe_cost;
252 hrtime_t probe_cost_total;
253 hrtime_t probe_error_code;
254 hrtime_t probe_errors[MAX_NODES][MAX_NODES];
255 int probe_suspect[MAX_NODES][MAX_NODES];
256 hrtime_t probe_max[MAX_NODES][MAX_NODES];
257 hrtime_t probe_min[MAX_NODES][MAX_NODES];
258 } lgrp_plat_probe_stats_t;
261 * Node to proximity domain ID mapping structure (only used with SRAT)
263 typedef struct node_domain_map {
264 int exists;
265 uint32_t prox_domain;
266 } node_domain_map_t;
269 * Node ID and starting and ending page for physical memory in memory node
271 typedef struct memnode_phys_addr_map {
272 pfn_t start;
273 pfn_t end;
274 int exists;
275 uint32_t prox_domain;
276 uint32_t device_id;
277 uint_t lgrphand;
278 } memnode_phys_addr_map_t;
281 * Number of CPUs for which we got APIC IDs
283 static int lgrp_plat_apic_ncpus = 0;
286 * CPU to node ID mapping table (only used for SRAT) and its max number of
287 * entries
289 static cpu_node_map_t *lgrp_plat_cpu_node = NULL;
290 static uint_t lgrp_plat_cpu_node_nentries = 0;
293 * Latency statistics
295 lgrp_plat_latency_stats_t lgrp_plat_lat_stats;
298 * Whether memory is interleaved across nodes causing MPO to be disabled
300 static int lgrp_plat_mem_intrlv = 0;
303 * Node ID to proximity domain ID mapping table (only used for SRAT)
305 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES];
308 * Physical address range for memory in each node
310 static memnode_phys_addr_map_t lgrp_plat_memnode_info[MAX_MEM_NODES];
313 * Statistics gotten from probing
315 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats;
318 * Memory configuration for probing
320 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config;
323 * Lowest proximity domain ID seen in ACPI SRAT
325 static uint32_t lgrp_plat_prox_domain_min = UINT32_MAX;
328 * Error code from processing ACPI SRAT
330 static int lgrp_plat_srat_error = 0;
333 * Error code from processing ACPI SLIT
335 static int lgrp_plat_slit_error = 0;
338 * Whether lgrp topology has been flattened to 2 levels.
340 static int lgrp_plat_topo_flatten = 0;
344 * Maximum memory node ID in use.
346 static uint_t lgrp_plat_max_mem_node;
349 * Allocate lgroup array statically
351 static lgrp_t lgrp_space[NLGRP];
352 static int nlgrps_alloc;
356 * Enable finding and using minimum proximity domain ID when hashing
358 int lgrp_plat_domain_min_enable = 1;
361 * Maximum possible number of nodes in system
363 uint_t lgrp_plat_node_cnt = 1;
366 * Enable sorting nodes in ascending order by starting physical address
368 int lgrp_plat_node_sort_enable = 1;
371 * Configuration Parameters for Probing
372 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe
373 * operation, etc.
374 * - lgrp_plat_probe_nrounds How many rounds of probing to do
375 * - lgrp_plat_probe_nsamples Number of samples to take when probing each
376 * node
377 * - lgrp_plat_probe_nreads Number of times to read vendor ID from
378 * Northbridge for each probe
380 uint_t lgrp_plat_probe_flags = 0;
381 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
382 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
383 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;
386 * Enable use of ACPI System Resource Affinity Table (SRAT), System
387 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
389 int lgrp_plat_srat_enable = 1;
390 int lgrp_plat_slit_enable = 1;
391 int lgrp_plat_msct_enable = 1;
394 * mnode_xwa: set to non-zero value to initiate workaround if large pages are
395 * found to be crossing memory node boundaries. The workaround will eliminate
396 * a base size page at the end of each memory node boundary to ensure that
397 * a large page with constituent pages that span more than 1 memory node
398 * can never be formed.
401 int mnode_xwa = 1;
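/*
 * As used by the code below, mnode_xwa == 0 disables the workaround,
 * plat_mnode_xcheck() increments it past 1 when it finds a large page
 * crossing a memnode boundary, and plat_build_mem_nodes() sacrifices the
 * last base page of a memnode only when the value is greater than 1.
 */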
404 * Static array to hold lgroup statistics
406 struct lgrp_stats lgrp_stats[NLGRP];
410 * Forward declarations of platform interface routines
412 void plat_build_mem_nodes(struct memlist *list);
414 int plat_mnode_xcheck(pfn_t pfncnt);
416 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode);
418 int plat_pfn_to_mem_node(pfn_t pfn);
421 * Forward declarations of lgroup platform interface routines
423 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid);
425 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);
427 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id);
429 void lgrp_plat_init(lgrp_init_stages_t stage);
431 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);
433 int lgrp_plat_max_lgrps(void);
435 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand,
436 lgrp_mem_query_t query);
438 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn);
440 void lgrp_plat_probe(void);
442 lgrp_handle_t lgrp_plat_root_hand(void);
446 * Forward declarations of local routines
448 static int is_opteron(void);
450 static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
451 int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
452 uint32_t domain);
454 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
455 int cpu_node_nentries);
457 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
458 int node_cnt, uint32_t domain);
460 static void lgrp_plat_get_numa_config(void);
462 static void lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
463 lgrp_plat_latency_stats_t *lat_stats,
464 lgrp_plat_probe_stats_t *probe_stats);
466 static int lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
467 lgrp_plat_latency_stats_t *lat_stats);
469 static void lgrp_plat_main_init(void);
471 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);
473 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
474 int node_cnt, uint32_t domain);
476 static int lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
477 int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
478 uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);
480 static void lgrp_plat_node_sort(node_domain_map_t *node_domain,
481 int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
482 memnode_phys_addr_map_t *memnode_info);
484 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
485 int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
486 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats);
488 static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);
490 static int lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
491 node_domain_map_t *node_domain, uint_t node_cnt,
492 memnode_phys_addr_map_t *memnode_info,
493 lgrp_plat_latency_stats_t *lat_stats);
495 static int lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
496 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
497 lgrp_plat_latency_stats_t *lat_stats);
499 static int lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
500 uint32_t *prox_domain_min, node_domain_map_t *node_domain,
501 cpu_node_map_t *cpu_node, int cpu_count,
502 memnode_phys_addr_map_t *memnode_info);
504 static void lgrp_plat_release_bootstrap(void);
506 static int lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp,
507 uint32_t *prox_domain_min);
509 static int lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp,
510 uint32_t *prox_domain_min);
512 static void lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);
514 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
515 memnode_phys_addr_map_t *memnode_info);
517 static hrtime_t opt_probe_vendor(int dest_node, int nreads);
521 * PLATFORM INTERFACE ROUTINES
525 * Configure memory nodes for machines with more than one node (ie NUMA)
527 void
528 plat_build_mem_nodes(struct memlist *list)
530 pfn_t cur_start; /* start addr of subrange */
531 pfn_t cur_end; /* end addr of subrange */
532 pfn_t start; /* start addr of whole range */
533 pfn_t end; /* end addr of whole range */
534 pgcnt_t endcnt; /* pages to sacrifice */
537 * Boot install lists are arranged <addr, len>, ...
539 while (list) {
540 int node;
542 start = list->ml_address >> PAGESHIFT;
543 end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
545 if (start > physmax) {
546 list = list->ml_next;
547 continue;
549 if (end > physmax)
550 end = physmax;
553 * When there is only one memnode, just add memory to memnode
555 if (max_mem_nodes == 1) {
556 mem_node_add_slice(start, end);
557 list = list->ml_next;
558 continue;
562 * mem_node_add_slice() expects to get a memory range that
563 * is within one memnode, so need to split any memory range
564 * that spans multiple memnodes into subranges that are each
565 * contained within one memnode when feeding them to
566 * mem_node_add_slice()
568 cur_start = start;
569 do {
570 node = plat_pfn_to_mem_node(cur_start);
573 * Panic if DRAM address map registers or SRAT say
574 * memory in node doesn't exist or address from
575 * boot installed memory list entry isn't in this node.
576 * This shouldn't happen and rest of code can't deal
577 * with this if it does.
579 if (node < 0 || node >= lgrp_plat_max_mem_node ||
580 !lgrp_plat_memnode_info[node].exists ||
581 cur_start < lgrp_plat_memnode_info[node].start ||
582 cur_start > lgrp_plat_memnode_info[node].end) {
583 cmn_err(CE_PANIC, "Don't know which memnode "
584 "to add installed memory address 0x%lx\n",
585 cur_start);
589 * End of current subrange should not span memnodes
591 cur_end = end;
592 endcnt = 0;
593 if (lgrp_plat_memnode_info[node].exists &&
594 cur_end > lgrp_plat_memnode_info[node].end) {
595 cur_end = lgrp_plat_memnode_info[node].end;
596 if (mnode_xwa > 1) {
598 * sacrifice the last page in each
599 * node to eliminate large pages
600 * that span more than 1 memory node.
602 endcnt = 1;
603 physinstalled--;
607 mem_node_add_slice(cur_start, cur_end - endcnt);
610 * Next subrange starts after end of current one
612 cur_start = cur_end + 1;
613 } while (cur_end < end);
615 list = list->ml_next;
617 mem_node_physalign = 0;
618 mem_node_pfn_shift = 0;
623 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
624 * range of pages aligned on pfncnt that crosses a node boundary. Returns 1 if
625 * a crossing is found and returns 0 otherwise.
628 plat_mnode_xcheck(pfn_t pfncnt)
630 int node, prevnode = -1, basenode;
631 pfn_t ea, sa;
633 for (node = 0; node < lgrp_plat_max_mem_node; node++) {
635 if (lgrp_plat_memnode_info[node].exists == 0)
636 continue;
638 if (prevnode == -1) {
639 prevnode = node;
640 basenode = node;
641 continue;
644 /* assume x86 node pfn ranges are in increasing order */
645 ASSERT(lgrp_plat_memnode_info[node].start >
646 lgrp_plat_memnode_info[prevnode].end);
649 * continue if the starting address of node is not contiguous
650 * with the previous node.
653 if (lgrp_plat_memnode_info[node].start !=
654 (lgrp_plat_memnode_info[prevnode].end + 1)) {
655 basenode = node;
656 prevnode = node;
657 continue;
660 /* check if the starting address of node is pfncnt aligned */
661 if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {
664 * at this point, node starts at an unaligned boundary
665 * and is contiguous with the previous node(s) to
666 * basenode. Check if there is an aligned contiguous
667 * range of length pfncnt that crosses this boundary.
670 sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
671 pfncnt);
672 ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
673 pfncnt);
675 ASSERT((ea - sa) == pfncnt);
676 if (sa >= lgrp_plat_memnode_info[basenode].start &&
677 ea <= (lgrp_plat_memnode_info[node].end + 1)) {
679 * large page found to cross mnode boundary.
680 * Return Failure if workaround not enabled.
682 if (mnode_xwa == 0)
683 return (1);
684 mnode_xwa++;
687 prevnode = node;
689 return (0);
693 lgrp_handle_t
694 plat_mem_node_to_lgrphand(int mnode)
696 if (max_mem_nodes == 1)
697 return (LGRP_DEFAULT_HANDLE);
699 ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);
701 return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
705 plat_pfn_to_mem_node(pfn_t pfn)
707 int node;
709 if (max_mem_nodes == 1)
710 return (0);
712 for (node = 0; node < lgrp_plat_max_mem_node; node++) {
714 * Skip nodes with no memory
716 if (!lgrp_plat_memnode_info[node].exists)
717 continue;
719 membar_consumer();
720 if (pfn >= lgrp_plat_memnode_info[node].start &&
721 pfn <= lgrp_plat_memnode_info[node].end)
722 return (node);
726 * Didn't find memnode where this PFN lives which should never happen
728 ASSERT(node < lgrp_plat_max_mem_node);
729 return (-1);
734 * LGROUP PLATFORM INTERFACE ROUTINES
738 * Allocate additional space for an lgroup.
740 lgrp_t *
741 lgrp_plat_alloc(lgrp_id_t lgrpid)
743 lgrp_t *lgrp;
745 lgrp = &lgrp_space[nlgrps_alloc++];
746 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
747 return (NULL);
748 return (lgrp);
753 * Platform handling for (re)configuration changes
755 * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
756 * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
757 * lgrp_plat_cpu_to_hand().
758 * 2) Disable latency probing logic by making sure that the flag
759 * LGRP_PLAT_PROBE_ENABLE is cleared.
761 * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
762 * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
763 * 2) Only expansion to existing entries, no shrinking.
764 * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
765 * in single-threaded context. And membar_producer() is used to ensure that
766 * all changes are visible to other CPUs before setting the "exists" flag.
767 * 4) On reading side, membar_consumer() after checking the "exists" flag
768 * ensures that right values are retrieved.
770 * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
771 * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
772 * 2) On writing side, it's single-threaded and membar_producer() is used to
773 * ensure all changes are visible to other CPUs before setting the "exists"
774 * flag.
775 * 3) On reading side, membar_consumer() after checking the "exists" flag
776 * ensures that right values are retrieved.
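 *
 * A minimal sketch of this publish/consume pattern (illustrative only, not a
 * verbatim excerpt of the update routines):
 *
 *	writer (DR, single-threaded):	entry->prox_domain = domain;
 *					membar_producer();
 *					entry->exists = 1;
 *
 *	reader (any CPU):		if (entry->exists) {
 *						membar_consumer();
 *						use entry->prox_domain;
 *					}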
778 void
779 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
781 int rc, node;
782 cpu_t *cp;
783 void *hdl = NULL;
784 uchar_t *sliptr = NULL;
785 uint32_t domain, apicid, slicnt = 0;
786 update_membounds_t *mp;
788 extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
789 uint32_t *, uint32_t *, uchar_t **);
790 extern void acpidev_dr_free_cpu_numa_info(void *);
793 * This interface is used to support CPU/memory DR operations.
794 * Don't bother here if it's still during boot or only one lgrp node
795 * is supported.
797 if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
798 return;
800 switch (flag) {
801 case LGRP_CONFIG_CPU_ADD:
802 cp = (cpu_t *)arg;
803 ASSERT(cp != NULL);
804 ASSERT(MUTEX_HELD(&cpu_lock));
806 /* Check whether CPU already exists. */
807 ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
808 if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
809 cmn_err(CE_WARN,
810 "!lgrp: CPU(%d) already exists in cpu_node map.",
811 cp->cpu_id);
812 break;
815 /* Query CPU lgrp information. */
816 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
817 &slicnt, &sliptr);
818 ASSERT(rc == 0);
819 if (rc != 0) {
820 cmn_err(CE_WARN,
821 "!lgrp: failed to query lgrp info for CPU(%d).",
822 cp->cpu_id);
823 break;
826 /* Update node to proximity domain mapping */
827 node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
828 lgrp_plat_node_cnt, domain);
829 if (node == -1) {
830 node = lgrp_plat_node_domain_update(
831 lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
832 ASSERT(node != -1);
833 if (node == -1) {
834 acpidev_dr_free_cpu_numa_info(hdl);
835 cmn_err(CE_WARN, "!lgrp: failed to update "
836 "node_domain map for domain(%u).", domain);
837 break;
841 /* Update latency information among lgrps. */
842 if (slicnt != 0 && sliptr != NULL) {
843 if (lgrp_plat_process_sli(domain, sliptr, slicnt,
844 lgrp_plat_node_domain, lgrp_plat_node_cnt,
845 &lgrp_plat_lat_stats) != 0) {
846 cmn_err(CE_WARN, "!lgrp: failed to update "
847 "latency information for domain (%u).",
848 domain);
852 /* Update CPU to node mapping. */
853 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
854 lgrp_plat_cpu_node[cp->cpu_id].node = node;
855 lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
856 lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
857 lgrp_plat_apic_ncpus++;
859 acpidev_dr_free_cpu_numa_info(hdl);
860 break;
862 case LGRP_CONFIG_CPU_DEL:
863 cp = (cpu_t *)arg;
864 ASSERT(cp != NULL);
865 ASSERT(MUTEX_HELD(&cpu_lock));
867 /* Check whether CPU exists. */
868 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
869 if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
870 cmn_err(CE_WARN,
871 "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
872 cp->cpu_id);
873 break;
876 /* Query CPU lgrp information. */
877 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
878 NULL, NULL);
879 ASSERT(rc == 0);
880 if (rc != 0) {
881 cmn_err(CE_WARN,
882 "!lgrp: failed to query lgrp info for CPU(%d).",
883 cp->cpu_id);
884 break;
887 /* Update map. */
888 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
889 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
890 lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
891 lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
892 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
893 lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
894 lgrp_plat_apic_ncpus--;
896 acpidev_dr_free_cpu_numa_info(hdl);
897 break;
899 case LGRP_CONFIG_MEM_ADD:
900 mp = (update_membounds_t *)arg;
901 ASSERT(mp != NULL);
903 /* Update latency information among lgrps. */
904 if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
905 if (lgrp_plat_process_sli(mp->u_domain,
906 mp->u_sli_ptr, mp->u_sli_cnt,
907 lgrp_plat_node_domain, lgrp_plat_node_cnt,
908 &lgrp_plat_lat_stats) != 0) {
909 cmn_err(CE_WARN, "!lgrp: failed to update "
910 "latency information for domain (%u).",
911 mp->u_domain);
915 if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
916 lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
917 mp->u_base, mp->u_base + mp->u_length,
918 mp->u_domain, mp->u_device_id) < 0) {
919 cmn_err(CE_WARN,
920 "!lgrp: failed to update latency information for "
921 "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
922 mp->u_base, mp->u_base + mp->u_length);
924 break;
926 default:
927 break;
933 * Return the platform handle for the lgroup containing the given CPU
935 lgrp_handle_t
936 lgrp_plat_cpu_to_hand(processorid_t id)
938 lgrp_handle_t hand;
940 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
942 if (lgrp_plat_node_cnt == 1)
943 return (LGRP_DEFAULT_HANDLE);
945 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
946 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);
948 ASSERT(hand != (lgrp_handle_t)-1);
949 if (hand == (lgrp_handle_t)-1)
950 return (LGRP_NULL_HANDLE);
952 return (hand);
957 * Platform-specific initialization of lgroups
959 void
960 lgrp_plat_init(lgrp_init_stages_t stage)
962 u_longlong_t value;
964 switch (stage) {
965 case LGRP_INIT_STAGE1:
968 * Get boot property for lgroup topology height limit
970 if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
971 (void) lgrp_topo_ht_limit_set((int)value);
974 * Get boot property for enabling/disabling SRAT
976 if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
977 lgrp_plat_srat_enable = (int)value;
980 * Get boot property for enabling/disabling SLIT
982 if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
983 lgrp_plat_slit_enable = (int)value;
986 * Get boot property for enabling/disabling MSCT
988 if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
989 lgrp_plat_msct_enable = (int)value;
992 * Initialize as a UMA machine
994 if (lgrp_topo_ht_limit() == 1) {
995 lgrp_plat_node_cnt = max_mem_nodes = 1;
996 lgrp_plat_max_mem_node = 1;
997 return;
1000 lgrp_plat_get_numa_config();
1003 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
1004 * to support memory DR operations if memory DR is enabled.
1006 lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
1007 if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
1008 max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
1009 lgrp_plat_node_cnt;
1010 ASSERT(max_mem_nodes <= MAX_MEM_NODES);
1012 break;
1014 case LGRP_INIT_STAGE3:
1015 lgrp_plat_probe();
1016 lgrp_plat_release_bootstrap();
1017 break;
1019 case LGRP_INIT_STAGE4:
1020 lgrp_plat_main_init();
1021 break;
1023 default:
1024 break;
1030 * Return latency between "from" and "to" lgroups
1032 * This latency number can only be used for relative comparison
1033 * between lgroups on the running system, cannot be used across platforms,
1034 * and may not reflect the actual latency. It is platform and implementation
1035 * specific, so platform gets to decide its value. It would be nice if the
1036 * number was at least proportional to make comparisons more meaningful though.
1039 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
1041 lgrp_handle_t src, dest;
1042 int node;
1044 if (max_mem_nodes == 1)
1045 return (0);
1048 * Return max latency for root lgroup
1050 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
1051 return (lgrp_plat_lat_stats.latency_max);
1053 src = from;
1054 dest = to;
1057 * Return 0 for nodes (lgroup platform handles) out of range
1059 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
1060 return (0);
1063 * Probe from current CPU if its lgroup latencies haven't been set yet
1064 * and we are trying to get latency from current CPU to some node.
1065 * Avoid probing if CPU/memory DR is enabled.
1067 if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
1069 * Latency information should be updated by lgrp_plat_config()
1070 * for DR operations. Something is wrong if we reach here.
1071 * For safety, flatten lgrp topology to two levels.
1073 if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1074 ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1075 cmn_err(CE_WARN,
1076 "lgrp: failed to get latency information, "
1077 "fall back to two-level topology.");
1078 lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1079 } else {
1080 node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1081 lgrp_plat_cpu_node_nentries);
1082 ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
1083 if (node == src)
1084 lgrp_plat_probe();
1088 return (lgrp_plat_lat_stats.latencies[src][dest]);
1093 * Return the maximum number of lgrps supported by the platform.
1094 * Before lgrp topology is known it returns an estimate based on the number of
1095 * nodes. Once topology is known it returns:
1096 * 1) the actual maximum number of lgrps created if CPU/memory DR operations
1097 * are not supported.
1098 * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1099 * supported.
1102 lgrp_plat_max_lgrps(void)
1104 if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1105 plat_dr_support_memory()) {
1106 return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1107 } else {
1108 return (lgrp_alloc_max + 1);
1114 * Count number of memory pages (_t) based on mnode id (_n) and query type (_q).
1116 #define _LGRP_PLAT_MEM_SIZE(_n, _q, _t) \
1117 if (mem_node_config[_n].exists) { \
1118 switch (_q) { \
1119 case LGRP_MEM_SIZE_FREE: \
1120 _t += MNODE_PGCNT(_n); \
1121 break; \
1122 case LGRP_MEM_SIZE_AVAIL: \
1123 _t += mem_node_memlist_pages(_n, phys_avail); \
1124 break; \
1125 case LGRP_MEM_SIZE_INSTALL: \
1126 _t += mem_node_memlist_pages(_n, phys_install); \
1127 break; \
1128 default: \
1129 break; \
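/*
 * For example, _LGRP_PLAT_MEM_SIZE(0, LGRP_MEM_SIZE_FREE, npgs) adds the
 * number of free base pages in memnode 0 to npgs, provided memnode 0 exists;
 * lgrp_plat_mem_size() below uses it to accumulate counts across a boot-time
 * memnode and any hot-added memnodes belonging to the same lgroup.
 */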
1134 * Return the number of free pages in an lgroup.
1136 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1137 * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the
1138 * number of allocatable base pagesize pages corresponding to the
1139 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1140 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1141 * memory installed, regardless of whether or not it's usable.
1143 pgcnt_t
1144 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1146 int mnode;
1147 pgcnt_t npgs = (pgcnt_t)0;
1148 extern struct memlist *phys_avail;
1149 extern struct memlist *phys_install;
1152 if (plathand == LGRP_DEFAULT_HANDLE)
1153 return (lgrp_plat_mem_size_default(plathand, query));
1155 if (plathand != LGRP_NULL_HANDLE) {
1156 /* Count memory node present at boot. */
1157 mnode = (int)plathand;
1158 ASSERT(mnode < lgrp_plat_node_cnt);
1159 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1161 /* Count possible hot-added memory nodes. */
1162 for (mnode = lgrp_plat_node_cnt;
1163 mnode < lgrp_plat_max_mem_node; mnode++) {
1164 if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
1165 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1169 return (npgs);
1174 * Return the platform handle of the lgroup that contains the physical memory
1175 * corresponding to the given page frame number
1177 lgrp_handle_t
1178 lgrp_plat_pfn_to_hand(pfn_t pfn)
1180 int mnode;
1182 if (max_mem_nodes == 1)
1183 return (LGRP_DEFAULT_HANDLE);
1185 if (pfn > physmax)
1186 return (LGRP_NULL_HANDLE);
1188 mnode = plat_pfn_to_mem_node(pfn);
1189 if (mnode < 0)
1190 return (LGRP_NULL_HANDLE);
1192 return (MEM_NODE_2_LGRPHAND(mnode));
1197 * Probe memory in each node from current CPU to determine latency topology
1199 * The probing code will probe the vendor ID register on the Northbridge of
1200 * Opteron processors and probe memory for other processors by default.
1202 * Since probing is inherently error prone, the code takes laps across all the
1203 * nodes probing from each node to each of the other nodes some number of
1204 * times. Furthermore, each node is probed some number of times before moving
1205 * onto the next one during each lap. The minimum latency gotten between nodes
1206 * is kept as the latency between the nodes.
1208 * After all that, the probe times are adjusted by normalizing values that are
1209 * close to each other and local latencies are made the same. Lastly, the
1210 * latencies are verified to make sure that certain conditions are met (eg.
1211 * local < remote, latency(a, b) == latency(b, a), etc.).
1213 * If any of the conditions aren't met, the code will export a NUMA
1214 * configuration with the local CPUs and memory given by the SRAT or PCI config
1215 * space registers and one remote memory latency since it can't tell exactly
1216 * how far each node is from each other.
1218 void
1219 lgrp_plat_probe(void)
1221 int from;
1222 int i;
1223 lgrp_plat_latency_stats_t *lat_stats;
1224 boolean_t probed;
1225 hrtime_t probe_time;
1226 int to;
1228 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
1229 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
1230 return;
1232 /* SRAT and SLIT should be enabled if DR operations are enabled. */
1233 if (plat_dr_support_cpu() || plat_dr_support_memory())
1234 return;
1237 * Determine ID of node containing current CPU
1239 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1240 lgrp_plat_cpu_node_nentries);
1241 ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
1242 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
1243 ASSERT(lgrp_plat_node_domain[from].exists);
1246 * Don't need to probe if got times already
1248 lat_stats = &lgrp_plat_lat_stats;
1249 if (lat_stats->latencies[from][from] != 0)
1250 return;
1253 * Read vendor ID in Northbridge or read and write page(s)
1254 * in each node from current CPU and remember how long it takes,
1255 * so we can build latency topology of machine later.
1256 * This should approximate the memory latency between each node.
1258 probed = B_FALSE;
1259 for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
1260 for (to = 0; to < lgrp_plat_node_cnt; to++) {
1262 * Get probe time and skip over any nodes that can't be
1263 * probed yet or don't have memory
1265 probe_time = lgrp_plat_probe_time(to,
1266 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries,
1267 &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats,
1268 &lgrp_plat_probe_stats);
1269 if (probe_time == 0)
1270 continue;
1272 probed = B_TRUE;
1275 * Keep lowest probe time as latency between nodes
1277 if (lat_stats->latencies[from][to] == 0 ||
1278 probe_time < lat_stats->latencies[from][to])
1279 lat_stats->latencies[from][to] = probe_time;
1282 * Update overall minimum and maximum probe times
1283 * across all nodes
1285 if (probe_time < lat_stats->latency_min ||
1286 lat_stats->latency_min == -1)
1287 lat_stats->latency_min = probe_time;
1288 if (probe_time > lat_stats->latency_max)
1289 lat_stats->latency_max = probe_time;
1294 * Bail out if weren't able to probe any nodes from current CPU
1296 if (probed == B_FALSE)
1297 return;
1300 * - Fix up latencies such that local latencies are same,
1301 * latency(i, j) == latency(j, i), etc. (if possible)
1303 * - Verify that latencies look ok
1305 * - Fallback to just optimizing for local and remote if
1306 * latencies didn't look right
1308 lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats,
1309 &lgrp_plat_probe_stats);
1310 lgrp_plat_probe_stats.probe_error_code =
1311 lgrp_plat_latency_verify(lgrp_plat_memnode_info,
1312 &lgrp_plat_lat_stats);
1313 if (lgrp_plat_probe_stats.probe_error_code)
1314 lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1319 * Return platform handle for root lgroup
1321 lgrp_handle_t
1322 lgrp_plat_root_hand(void)
1324 return (LGRP_DEFAULT_HANDLE);
1329 * INTERNAL ROUTINES
1334 * Update CPU to node mapping for given CPU and proximity domain.
1335 * Return values:
1336 * - zero for success
1337 * - positive numbers for warnings
1338 * - negative numbers for errors
1340 static int
1341 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt,
1342 cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
1344 uint_t i;
1345 int node;
1348 * Get node number for proximity domain
1350 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
1351 if (node == -1) {
1352 node = lgrp_plat_node_domain_update(node_domain, node_cnt,
1353 domain);
1354 if (node == -1)
1355 return (-1);
1359 * Search for entry with given APIC ID and fill in its node and
1360 * proximity domain IDs (if they haven't been set already)
1362 for (i = 0; i < nentries; i++) {
1364 * Skip nonexistent entries and ones without matching APIC ID
1366 if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
1367 continue;
1370 * Just return if entry completely and correctly filled in
1371 * already
1373 if (cpu_node[i].prox_domain == domain &&
1374 cpu_node[i].node == node)
1375 return (1);
1378 * It's invalid to have more than one entry with the same
1379 * local APIC ID in SRAT table.
1381 if (cpu_node[i].node != UINT_MAX)
1382 return (-2);
1385 * Fill in node and proximity domain IDs
1387 cpu_node[i].prox_domain = domain;
1388 cpu_node[i].node = node;
1390 return (0);
1394 * It's possible that an apicid doesn't exist in the cpu_node map because
1395 * the user limited the number of CPUs powered on at boot by specifying the
1396 * boot_ncpus kernel option.
1398 return (2);
1403 * Get node ID for given CPU
1405 static int
1406 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
1407 int cpu_node_nentries)
1409 processorid_t cpuid;
1411 if (cp == NULL)
1412 return (-1);
1414 cpuid = cp->cpu_id;
1415 if (cpuid < 0 || cpuid >= max_ncpus)
1416 return (-1);
1419 * SRAT doesn't exist, isn't enabled, or there was an error processing
1420 * it, so return node ID for Opteron and -1 otherwise.
1422 if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
1423 lgrp_plat_srat_error) {
1424 if (is_opteron())
1425 return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE));
1426 return (-1);
1430 * Return -1 when CPU to node ID mapping entry doesn't exist for given
1431 * CPU
1433 if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists)
1434 return (-1);
1436 return (cpu_node[cpuid].node);
1441 * Return node number for given proximity domain/system locality
1443 static int
1444 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt,
1445 uint32_t domain)
1447 uint_t node;
1448 uint_t start;
1451 * Hash proximity domain ID into node to domain mapping table (array),
1452 * search for entry with matching proximity domain ID, and return index
1453 * of matching entry as node ID.
1455 node = start = NODE_DOMAIN_HASH(domain, node_cnt);
1456 do {
1457 if (node_domain[node].exists) {
1458 membar_consumer();
1459 if (node_domain[node].prox_domain == domain)
1460 return (node);
1462 node = (node + 1) % node_cnt;
1463 } while (node != start);
1464 return (-1);
1469 * Get NUMA configuration of machine
1471 static void
1472 lgrp_plat_get_numa_config(void)
1474 uint_t probe_op;
1477 * Read boot property with CPU to APIC ID mapping table/array to
1478 * determine number of CPUs
1480 lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL);
1483 * Determine which CPUs and memory are local to each other and number
1484 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
1486 if (lgrp_plat_apic_ncpus > 0) {
1487 int retval;
1489 /* Reserve enough resources if CPU DR is enabled. */
1490 if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus)
1491 lgrp_plat_cpu_node_nentries = max_ncpus;
1492 else
1493 lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus;
1496 * Temporarily allocate boot memory to use for CPU to node
1497 * mapping since kernel memory allocator isn't alive yet
1499 lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops,
1500 NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t),
1501 sizeof (int));
1503 ASSERT(lgrp_plat_cpu_node != NULL);
1504 if (lgrp_plat_cpu_node) {
1505 bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries *
1506 sizeof (cpu_node_map_t));
1507 } else {
1508 lgrp_plat_cpu_node_nentries = 0;
1512 * Fill in CPU to node ID mapping table with APIC ID for each
1513 * CPU
1515 (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node);
1517 retval = lgrp_plat_process_srat(srat_ptr, msct_ptr,
1518 &lgrp_plat_prox_domain_min,
1519 lgrp_plat_node_domain, lgrp_plat_cpu_node,
1520 lgrp_plat_apic_ncpus, lgrp_plat_memnode_info);
1521 if (retval <= 0) {
1522 lgrp_plat_srat_error = retval;
1523 lgrp_plat_node_cnt = 1;
1524 } else {
1525 lgrp_plat_srat_error = 0;
1526 lgrp_plat_node_cnt = retval;
1531 * Try to use PCI config space registers on Opteron if there's an error
1532 * processing CPU to APIC ID mapping or SRAT
1534 if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) &&
1535 is_opteron())
1536 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
1537 lgrp_plat_memnode_info);
1540 * Don't bother to setup system for multiple lgroups and only use one
1541 * memory node when memory is interleaved between any nodes or there is
1542 * only one NUMA node
1544 if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
1545 lgrp_plat_node_cnt = max_mem_nodes = 1;
1546 (void) lgrp_topo_ht_limit_set(1);
1547 return;
1551 * Leaf lgroups on x86/x64 architectures contain one physical
1552 * processor chip. Tune lgrp_expand_proc_thresh and
1553 * lgrp_expand_proc_diff so that lgrp_choose() will spread
1554 * things out aggressively.
1556 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
1557 lgrp_expand_proc_diff = 0;
1560 * There should be one memnode (physical page free list(s)) for
1561 * each node if memory DR is disabled.
1563 max_mem_nodes = lgrp_plat_node_cnt;
1566 * Initialize min and max latency before reading SLIT or probing
1568 lgrp_plat_lat_stats.latency_min = -1;
1569 lgrp_plat_lat_stats.latency_max = 0;
1572 * Determine how far each NUMA node is from each other by
1573 * reading ACPI System Locality Information Table (SLIT) if it
1574 * exists
1576 lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
1577 lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info,
1578 &lgrp_plat_lat_stats);
1581 * Disable support of CPU/memory DR operations if multiple locality
1582 * domains exist in system and either of following is true.
1583 * 1) Failed to process SLIT table.
1584 * 2) Latency probing is enabled by user.
1586 if (lgrp_plat_node_cnt > 1 &&
1587 (plat_dr_support_cpu() || plat_dr_support_memory())) {
1588 if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 ||
1589 !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 ||
1590 lgrp_plat_apic_ncpus <= 0) {
1591 cmn_err(CE_CONT,
1592 "?lgrp: failed to process ACPI SRAT/SLIT table, "
1593 "disable support of CPU/memory DR operations.");
1594 plat_dr_disable_cpu();
1595 plat_dr_disable_memory();
1596 } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) {
1597 cmn_err(CE_CONT,
1598 "?lgrp: latency probing enabled by user, "
1599 "disable support of CPU/memory DR operations.");
1600 plat_dr_disable_cpu();
1601 plat_dr_disable_memory();
1605 /* Done if the SLIT table was processed successfully. */
1606 if (lgrp_plat_slit_error == 0)
1607 return;
1610 * Probe to determine latency between NUMA nodes when SLIT
1611 * doesn't exist or make sense
1613 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;
1616 * Specify whether to probe using vendor ID register or page copy
1617 * if it hasn't been specified already or is overspecified
1619 probe_op = lgrp_plat_probe_flags &
1620 (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1622 if (probe_op == 0 ||
1623 probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
1624 lgrp_plat_probe_flags &=
1625 ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
1626 if (is_opteron())
1627 lgrp_plat_probe_flags |=
1628 LGRP_PLAT_PROBE_VENDOR;
1629 else
1630 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
1634 * Probing errors can mess up the lgroup topology and
1635 * force us to fall back to a 2 level lgroup topology.
1636 * Here we bound how tall the lgroup topology can grow
1637 * in hopes of avoiding any anomalies in probing from
1638 * messing up the lgroup topology by limiting the
1639 * accuracy of the latency topology.
1641 * Assume that nodes will at least be configured in a
1642 * ring, so limit height of lgroup topology to be less
1643 * than number of nodes on a system with 4 or more
1644 * nodes
1646 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
1647 lgrp_topo_ht_limit_default())
1648 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
1653 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
1654 * be considered the same
1656 #define LGRP_LAT_TOLERANCE_SHIFT 4
1658 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;
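/*
 * For example, with the default shift of 4 the tolerance is 1/16 (about 6%):
 * probe times of 160 and 168 would be treated as close enough to be
 * normalized to the same value, while 160 and 200 would not (example values
 * only).
 */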
1662 * Adjust latencies between nodes to be symmetric, normalize latencies between
1663 * any nodes that are within some tolerance to be same, and make local
1664 * latencies be same
1666 static void
1667 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
1668 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
1670 int i;
1671 int j;
1672 int k;
1673 int l;
1674 u_longlong_t max;
1675 u_longlong_t min;
1676 u_longlong_t t;
1677 u_longlong_t t1;
1678 u_longlong_t t2;
1679 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
1680 int lat_corrected[MAX_NODES][MAX_NODES];
1683 * Nothing to do when this is an UMA machine or the needed args are missing
1685 if (max_mem_nodes == 1)
1686 return;
1688 ASSERT(memnode_info != NULL && lat_stats != NULL &&
1689 probe_stats != NULL);
1692 * Make sure that latencies are symmetric between any two nodes
1693 * (ie. latency(node0, node1) == latency(node1, node0))
1695 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1696 if (!memnode_info[i].exists)
1697 continue;
1699 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1700 if (!memnode_info[j].exists)
1701 continue;
1703 t1 = lat_stats->latencies[i][j];
1704 t2 = lat_stats->latencies[j][i];
1706 if (t1 == 0 || t2 == 0 || t1 == t2)
1707 continue;
1710 * Latencies should be same
1711 * - Use minimum of two latencies which should be same
1712 * - Track suspect probe times not within tolerance of
1713 * min value
1714 * - Remember how much values are corrected by
1716 if (t1 > t2) {
1717 t = t2;
1718 probe_stats->probe_errors[i][j] += t1 - t2;
1719 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
1720 probe_stats->probe_suspect[i][j]++;
1721 probe_stats->probe_suspect[j][i]++;
1723 } else if (t2 > t1) {
1724 t = t1;
1725 probe_stats->probe_errors[j][i] += t2 - t1;
1726 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
1727 probe_stats->probe_suspect[i][j]++;
1728 probe_stats->probe_suspect[j][i]++;
1732 lat_stats->latencies[i][j] =
1733 lat_stats->latencies[j][i] = t;
1734 lgrp_config(cflag, t1, t);
1735 lgrp_config(cflag, t2, t);
1740 * Keep track of which latencies get corrected
1742 for (i = 0; i < MAX_NODES; i++)
1743 for (j = 0; j < MAX_NODES; j++)
1744 lat_corrected[i][j] = 0;
1747 * For every two nodes, see whether there is another pair of nodes which
1748 * are about the same distance apart and make the latencies be the same
1749 * if they are close enough together
1751 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1752 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1753 if (!memnode_info[j].exists)
1754 continue;
1756 * Pick one pair of nodes (i, j)
1757 * and get latency between them
1759 t1 = lat_stats->latencies[i][j];
1762 * Skip this pair of nodes if there isn't a latency
1763 * for it yet
1765 if (t1 == 0)
1766 continue;
1768 for (k = 0; k < lgrp_plat_node_cnt; k++) {
1769 for (l = 0; l < lgrp_plat_node_cnt; l++) {
1770 if (!memnode_info[l].exists)
1771 continue;
1773 * Pick another pair of nodes (k, l)
1774 * not same as (i, j) and get latency
1775 * between them
1777 if (k == i && l == j)
1778 continue;
1780 t2 = lat_stats->latencies[k][l];
1783 * Skip this pair of nodes if there
1784 * isn't a latency for it yet
1787 if (t2 == 0)
1788 continue;
1791 * Skip nodes (k, l) if they already
1792 * have same latency as (i, j) or
1793 * their latency isn't close enough to
1794 * be considered/made the same
1796 if (t1 == t2 || (t1 > t2 && t1 - t2 >
1797 t1 >> lgrp_plat_probe_lt_shift) ||
1798 (t2 > t1 && t2 - t1 >
1799 t2 >> lgrp_plat_probe_lt_shift))
1800 continue;
1803 * Make latency(i, j) same as
1804 * latency(k, l), try to use latency
1805 * that has been adjusted already to get
1806 * more consistency (if possible), and
1807 * remember which latencies were
1808 * adjusted for next time
1810 if (lat_corrected[i][j]) {
1811 t = t1;
1812 lgrp_config(cflag, t2, t);
1813 t2 = t;
1814 } else if (lat_corrected[k][l]) {
1815 t = t2;
1816 lgrp_config(cflag, t1, t);
1817 t1 = t;
1818 } else {
1819 if (t1 > t2)
1820 t = t2;
1821 else
1822 t = t1;
1823 lgrp_config(cflag, t1, t);
1824 lgrp_config(cflag, t2, t);
1825 t1 = t2 = t;
1828 lat_stats->latencies[i][j] =
1829 lat_stats->latencies[k][l] = t;
1831 lat_corrected[i][j] =
1832 lat_corrected[k][l] = 1;
1839 * Local latencies should be same
1840 * - Find min and max local latencies
1841 * - Make all local latencies be minimum
1843 min = -1;
1844 max = 0;
1845 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1846 if (!memnode_info[i].exists)
1847 continue;
1848 t = lat_stats->latencies[i][i];
1849 if (t == 0)
1850 continue;
1851 if (min == -1 || t < min)
1852 min = t;
1853 if (t > max)
1854 max = t;
1856 if (min != max) {
1857 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1858 int local;
1860 if (!memnode_info[i].exists)
1861 continue;
1863 local = lat_stats->latencies[i][i];
1864 if (local == 0)
1865 continue;
1868 * Track suspect probe times that aren't within
1869 * tolerance of minimum local latency and how much
1870 * probe times are corrected by
1872 if (local - min > min >> lgrp_plat_probe_lt_shift)
1873 probe_stats->probe_suspect[i][i]++;
1875 probe_stats->probe_errors[i][i] += local - min;
1878 * Make local latencies be minimum
1880 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
1881 lat_stats->latencies[i][i] = min;
1886 * Determine max probe time again since just adjusted latencies
1888 lat_stats->latency_max = 0;
1889 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1890 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1891 if (!memnode_info[j].exists)
1892 continue;
1893 t = lat_stats->latencies[i][j];
1894 if (t > lat_stats->latency_max)
1895 lat_stats->latency_max = t;
1902 * Verify following about latencies between nodes:
1904 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a))
1905 * - Local latencies same
1906 * - Local < remote
1907 * - Number of latencies seen is reasonable
1908 * - Number of occurrences of a given latency should be more than 1
1910 * Returns:
1911 * 0 Success
1912 * -1 Not symmetric
1913 * -2 Local latencies not same
1914 * -3 Local >= remote
1916 static int
1917 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
1918 lgrp_plat_latency_stats_t *lat_stats)
1920 int i;
1921 int j;
1922 u_longlong_t t1;
1923 u_longlong_t t2;
1925 ASSERT(memnode_info != NULL && lat_stats != NULL);
1928 * Nothing to do when this is an UMA machine, lgroup topology is
1929 * limited to 2 levels, or there aren't any probe times yet
1931 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
1932 lat_stats->latencies[0][0] == 0)
1933 return (0);
1936 * Make sure that latencies are symmetric between any two nodes
1937 * (ie. latency(node0, node1) == latency(node1, node0))
1939 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1940 if (!memnode_info[i].exists)
1941 continue;
1942 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1943 if (!memnode_info[j].exists)
1944 continue;
1945 t1 = lat_stats->latencies[i][j];
1946 t2 = lat_stats->latencies[j][i];
1948 if (t1 == 0 || t2 == 0 || t1 == t2)
1949 continue;
1951 return (-1);
1956 * Local latencies should be same
1958 t1 = lat_stats->latencies[0][0];
1959 for (i = 1; i < lgrp_plat_node_cnt; i++) {
1960 if (!memnode_info[i].exists)
1961 continue;
1963 t2 = lat_stats->latencies[i][i];
1964 if (t2 == 0)
1965 continue;
1967 if (t1 == 0) {
1968 t1 = t2;
1969 continue;
1972 if (t1 != t2)
1973 return (-2);
1977 * Local latencies should be less than remote
1979 if (t1) {
1980 for (i = 0; i < lgrp_plat_node_cnt; i++) {
1981 for (j = 0; j < lgrp_plat_node_cnt; j++) {
1982 if (!memnode_info[j].exists)
1983 continue;
1984 t2 = lat_stats->latencies[i][j];
1985 if (i == j || t2 == 0)
1986 continue;
1988 if (t1 >= t2)
1989 return (-3);
1994 return (0);
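/*
 * Illustrative sketch: a minimal restatement of the symmetry rule checked
 * above.  Two measured latencies for the same pair of nodes must agree in
 * both directions; a zero entry just means "not measured yet" and is
 * skipped, matching lgrp_plat_latency_verify().  example_latency_symmetric()
 * is a hypothetical helper, not called elsewhere in this file.
 */
static int
example_latency_symmetric(u_longlong_t a_to_b, u_longlong_t b_to_a)
{
	if (a_to_b == 0 || b_to_a == 0)
		return (1);	/* unmeasured pairs are never flagged */
	return (a_to_b == b_to_a);
}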
1999 * Platform-specific initialization
2001 static void
2002 lgrp_plat_main_init(void)
2004 int curnode;
2005 int ht_limit;
2006 int i;
2009 * Print a notice that MPO is disabled when memory is interleaved
2010 * across nodes.... We would do this when it is discovered, but can't
2011 * because it happens way too early during boot....
2013 if (lgrp_plat_mem_intrlv)
2014 cmn_err(CE_NOTE,
2015 "MPO disabled because memory is interleaved\n");
2018 * Don't bother to do any probing if it is disabled, there is only one
2019 * node, or the height of the lgroup topology is less than or equal to 2
2021 ht_limit = lgrp_topo_ht_limit();
2022 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
2023 max_mem_nodes == 1 || ht_limit <= 2) {
2025 * Setup lgroup latencies for 2 level lgroup topology
2026 * (ie. local and remote only) if they haven't been set yet
2028 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
2029 lgrp_plat_lat_stats.latency_max == 0)
2030 lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
2031 return;
2034 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2036 * Should have been able to probe from CPU 0 when it was added
2037 * to lgroup hierarchy, but may not have been able to then
2038 * because it happens so early in boot that gethrtime() hasn't
2039 * been initialized. (:-(
2041 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
2042 lgrp_plat_cpu_node_nentries);
2043 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
2044 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
2045 lgrp_plat_probe();
2047 return;
2051 * When probing memory to determine lgroup topology, use one page
2052 * for every sample and take multiple samples
2054 if (lgrp_plat_probe_mem_config.probe_memsize == 0)
2055 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
2056 lgrp_plat_probe_nsamples;
2059 * Map memory in each node needed for probing to determine latency
2060 * topology
2062 for (i = 0; i < lgrp_plat_node_cnt; i++) {
2063 int mnode;
2066 * Skip this node and leave its probe page NULL
2067 * if it doesn't have any memory
2069 mnode = i;
2070 if (!mem_node_config[mnode].exists) {
2071 lgrp_plat_probe_mem_config.probe_va[i] = NULL;
2072 continue;
2076 * Allocate kernel virtual address space for probing (one page per sample)
2078 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
2079 lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
2080 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
2081 cmn_err(CE_WARN,
2082 "lgrp_plat_main_init: couldn't allocate memory");
2083 return;
2087 * Get PFN for first page in each node
2089 lgrp_plat_probe_mem_config.probe_pfn[i] =
2090 mem_node_config[mnode].physbase;
2093 * Map virtual page to first page in node
2095 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
2096 lgrp_plat_probe_mem_config.probe_memsize,
2097 lgrp_plat_probe_mem_config.probe_pfn[i],
2098 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
2099 HAT_LOAD_NOCONSIST);
2103 * Probe from current CPU
2105 lgrp_plat_probe();
2110 * Return the number of free, allocatable, or installed
2111 * pages in an lgroup
2112 * This is a copy of the MAX_MEM_NODES == 1 version of the routine
2113 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup
2115 static pgcnt_t
2116 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query)
2118 _NOTE(ARGUNUSED(lgrphand));
2120 struct memlist *mlist;
2121 pgcnt_t npgs = 0;
2122 extern struct memlist *phys_avail;
2123 extern struct memlist *phys_install;
2125 switch (query) {
2126 case LGRP_MEM_SIZE_FREE:
2127 return ((pgcnt_t)freemem);
2128 case LGRP_MEM_SIZE_AVAIL:
2129 memlist_read_lock();
2130 for (mlist = phys_avail; mlist; mlist = mlist->ml_next)
2131 npgs += btop(mlist->ml_size);
2132 memlist_read_unlock();
2133 return (npgs);
2134 case LGRP_MEM_SIZE_INSTALL:
2135 memlist_read_lock();
2136 for (mlist = phys_install; mlist; mlist = mlist->ml_next)
2137 npgs += btop(mlist->ml_size);
2138 memlist_read_unlock();
2139 return (npgs);
2140 default:
2141 return ((pgcnt_t)0);
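/*
 * Illustrative sketch: the AVAIL and INSTALL cases above both walk a
 * memlist and total its extents in pages; this is the same traversal in
 * isolation, assuming btop() is the usual byte-to-page conversion.
 * example_memlist_pages() is a hypothetical helper, not part of this
 * file's interface.
 */
static pgcnt_t
example_memlist_pages(struct memlist *ml)
{
	pgcnt_t npgs = 0;

	for (; ml != NULL; ml = ml->ml_next)
		npgs += btop(ml->ml_size);	/* bytes -> pages */
	return (npgs);
}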
2147 * Update node to proximity domain mappings for given domain and return node ID
2149 static int
2150 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt,
2151 uint32_t domain)
2153 uint_t node;
2154 uint_t start;
2157 * Hash proximity domain ID into node to domain mapping table (array)
2158 * and add an entry for it at the first empty or matching slot found
2160 node = start = NODE_DOMAIN_HASH(domain, node_cnt);
2161 do {
2163 * Entry doesn't exist yet, so create one for this proximity
2164 * domain and return node ID which is index into mapping table.
2166 if (!node_domain[node].exists) {
2167 node_domain[node].prox_domain = domain;
2168 membar_producer();
2169 node_domain[node].exists = 1;
2170 return (node);
2174 * Entry exists for this proximity domain already, so just
2175 * return node ID (index into table).
2177 if (node_domain[node].prox_domain == domain)
2178 return (node);
2179 node = NODE_DOMAIN_HASH(node + 1, node_cnt);
2180 } while (node != start);
2183 * Ran out of supported number of entries which shouldn't happen....
2185 ASSERT(node != start);
2186 return (-1);
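/*
 * Illustrative sketch: a minimal, stand-alone version of the
 * open-addressed lookup used above.  A plain modulo stands in for
 * NODE_DOMAIN_HASH(), and example_domain_lookup() is a hypothetical
 * helper: hash the proximity domain ID into the table, then probe
 * linearly until a matching or empty slot turns up; -1 means the table
 * is full.
 */
static int
example_domain_lookup(node_domain_map_t *tbl, int node_cnt, uint32_t domain)
{
	uint_t node, start;

	node = start = domain % node_cnt;
	do {
		if (!tbl[node].exists || tbl[node].prox_domain == domain)
			return (node);
		node = (node + 1) % node_cnt;
	} while (node != start);

	return (-1);	/* more proximity domains than table slots */
}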
2190 * Update node memory information for given proximity domain with specified
2191 * starting and ending physical address range (and return non-negative values
2192 * for success and negative ones for errors)
2194 static int
2195 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt,
2196 memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start,
2197 uint64_t end, uint32_t domain, uint32_t device_id)
2199 int node, mnode;
2202 * Get node number for proximity domain
2204 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain);
2205 if (node == -1) {
2206 node = lgrp_plat_node_domain_update(node_domain, node_cnt,
2207 domain);
2208 if (node == -1)
2209 return (-1);
2213 * This function is called during boot if device_id is
2214 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for
2215 * memory DR operations.
2217 if (device_id != ACPI_MEMNODE_DEVID_BOOT) {
2218 ASSERT(lgrp_plat_max_mem_node <= memnode_cnt);
2220 for (mnode = lgrp_plat_node_cnt;
2221 mnode < lgrp_plat_max_mem_node; mnode++) {
2222 if (memnode_info[mnode].exists &&
2223 memnode_info[mnode].prox_domain == domain &&
2224 memnode_info[mnode].device_id == device_id) {
2225 if (btop(start) < memnode_info[mnode].start)
2226 memnode_info[mnode].start = btop(start);
2227 if (btop(end) > memnode_info[mnode].end)
2228 memnode_info[mnode].end = btop(end);
2229 return (1);
2233 if (lgrp_plat_max_mem_node >= memnode_cnt) {
2234 return (-3);
2235 } else {
2236 lgrp_plat_max_mem_node++;
2237 memnode_info[mnode].start = btop(start);
2238 memnode_info[mnode].end = btop(end);
2239 memnode_info[mnode].prox_domain = domain;
2240 memnode_info[mnode].device_id = device_id;
2241 memnode_info[mnode].lgrphand = node;
2242 membar_producer();
2243 memnode_info[mnode].exists = 1;
2244 return (0);
2249 * Create entry in table for node if it doesn't exist
2251 ASSERT(node < memnode_cnt);
2252 if (!memnode_info[node].exists) {
2253 memnode_info[node].start = btop(start);
2254 memnode_info[node].end = btop(end);
2255 memnode_info[node].prox_domain = domain;
2256 memnode_info[node].device_id = device_id;
2257 memnode_info[node].lgrphand = node;
2258 membar_producer();
2259 memnode_info[node].exists = 1;
2260 return (0);
2264 * Entry already exists for this proximity domain
2266 * There may be more than one SRAT memory entry for a domain, so we may
2267 * need to update existing start or end address for the node.
2269 if (memnode_info[node].prox_domain == domain) {
2270 if (btop(start) < memnode_info[node].start)
2271 memnode_info[node].start = btop(start);
2272 if (btop(end) > memnode_info[node].end)
2273 memnode_info[node].end = btop(end);
2274 return (1);
2276 return (-2);
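/*
 * Illustrative sketch: shows how an additional SRAT memory range, given in
 * bytes, is folded into an existing memnode's page-frame range as done
 * above, assuming btop() converts a byte address to a page frame number.
 * example_merge_range() is a hypothetical helper for illustration only.
 */
static void
example_merge_range(memnode_phys_addr_map_t *mn, uint64_t start, uint64_t end)
{
	if (btop(start) < mn->start)
		mn->start = btop(start);	/* extend range downward */
	if (btop(end) > mn->end)
		mn->end = btop(end);		/* extend range upward */
}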
2281 * Have to sort nodes by starting physical address because plat_mnode_xcheck()
2282 * assumes and expects memnodes to be sorted in ascending order by physical
2283 * address.
2285 static void
2286 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt,
2287 cpu_node_map_t *cpu_node, int cpu_count,
2288 memnode_phys_addr_map_t *memnode_info)
2290 boolean_t found;
2291 int i;
2292 int j;
2293 int n;
2294 boolean_t sorted;
2295 boolean_t swapped;
2297 if (!lgrp_plat_node_sort_enable || node_cnt <= 1 ||
2298 node_domain == NULL || memnode_info == NULL)
2299 return;
2302 * Sorted already?
2304 sorted = B_TRUE;
2305 for (i = 0; i < node_cnt - 1; i++) {
2307 * Skip entries that don't exist
2309 if (!memnode_info[i].exists)
2310 continue;
2313 * Try to find next existing entry to compare against
2315 found = B_FALSE;
2316 for (j = i + 1; j < node_cnt; j++) {
2317 if (memnode_info[j].exists) {
2318 found = B_TRUE;
2319 break;
2324 * Done if no more existing entries to compare against
2326 if (found == B_FALSE)
2327 break;
2330 * Not sorted if starting address of current entry is bigger
2331 * than starting address of next existing entry
2333 if (memnode_info[i].start > memnode_info[j].start) {
2334 sorted = B_FALSE;
2335 break;
2340 * Don't need to sort if sorted already
2342 if (sorted == B_TRUE)
2343 return;
2346 * Just use bubble sort since number of nodes is small
2348 n = node_cnt;
2349 do {
2350 swapped = B_FALSE;
2351 n--;
2352 for (i = 0; i < n; i++) {
2354 * Skip entries that don't exist
2356 if (!memnode_info[i].exists)
2357 continue;
2360 * Try to find next existing entry to compare against
2362 found = B_FALSE;
2363 for (j = i + 1; j <= n; j++) {
2364 if (memnode_info[j].exists) {
2365 found = B_TRUE;
2366 break;
2371 * Done if no more existing entries to compare against
2373 if (found == B_FALSE)
2374 break;
2376 if (memnode_info[i].start > memnode_info[j].start) {
2377 memnode_phys_addr_map_t save_addr;
2378 node_domain_map_t save_node;
2381 * Swap node to proximity domain ID assignments
2383 bcopy(&node_domain[i], &save_node,
2384 sizeof (node_domain_map_t));
2385 bcopy(&node_domain[j], &node_domain[i],
2386 sizeof (node_domain_map_t));
2387 bcopy(&save_node, &node_domain[j],
2388 sizeof (node_domain_map_t));
2391 * Swap node to physical memory assignments
2393 bcopy(&memnode_info[i], &save_addr,
2394 sizeof (memnode_phys_addr_map_t));
2395 bcopy(&memnode_info[j], &memnode_info[i],
2396 sizeof (memnode_phys_addr_map_t));
2397 bcopy(&save_addr, &memnode_info[j],
2398 sizeof (memnode_phys_addr_map_t));
2399 swapped = B_TRUE;
2402 } while (swapped == B_TRUE);
2405 * Check to make sure that CPUs are assigned to the correct node IDs now
2406 * since node to proximity domain ID assignments may have been changed above
2408 if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1)
2409 return;
2410 for (i = 0; i < cpu_count; i++) {
2411 int node;
2413 node = lgrp_plat_domain_to_node(node_domain, node_cnt,
2414 cpu_node[i].prox_domain);
2415 if (cpu_node[i].node != node)
2416 cpu_node[i].node = node;
2423 * Return time needed to probe from current CPU to memory in given node
2425 static hrtime_t
2426 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries,
2427 lgrp_plat_probe_mem_config_t *probe_mem_config,
2428 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
2430 caddr_t buf;
2431 hrtime_t elapsed;
2432 hrtime_t end;
2433 int from;
2434 int i;
2435 int ipl;
2436 hrtime_t max;
2437 hrtime_t min;
2438 hrtime_t start;
2439 extern int use_sse_pagecopy;
2442 * Determine ID of node containing current CPU
2444 from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries);
2445 ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
2448 * Do common work for probing main memory
2450 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) {
2452 * Skip probing any nodes without memory and
2453 * set probe time to 0
2455 if (probe_mem_config->probe_va[to] == NULL) {
2456 lat_stats->latencies[from][to] = 0;
2457 return (0);
2461 * Invalidate caches once instead of once every sample
2462 * which should cut cost of probing by a lot
2464 probe_stats->flush_cost = gethrtime();
2465 invalidate_cache();
2466 probe_stats->flush_cost = gethrtime() -
2467 probe_stats->flush_cost;
2468 probe_stats->probe_cost_total += probe_stats->flush_cost;
2472 * Probe from current CPU to given memory using specified operation
2473 * and take specified number of samples
2475 max = 0;
2476 min = -1;
2477 for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
2478 probe_stats->probe_cost = gethrtime();
2481 * Can't measure probe time if gethrtime() isn't working yet
2483 if (probe_stats->probe_cost == 0 && gethrtime() == 0)
2484 return (0);
2486 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
2488 * Measure how long it takes to read vendor ID from
2489 * Northbridge
2491 elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
2492 } else {
2494 * Measure how long it takes to copy page
2495 * on top of itself
2497 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE);
2499 kpreempt_disable();
2500 ipl = splhigh();
2501 start = gethrtime();
2502 if (use_sse_pagecopy)
2503 hwblkpagecopy(buf, buf);
2504 else
2505 bcopy(buf, buf, PAGESIZE);
2506 end = gethrtime();
2507 elapsed = end - start;
2508 splx(ipl);
2509 kpreempt_enable();
2512 probe_stats->probe_cost = gethrtime() -
2513 probe_stats->probe_cost;
2514 probe_stats->probe_cost_total += probe_stats->probe_cost;
2516 if (min == -1 || elapsed < min)
2517 min = elapsed;
2518 if (elapsed > max)
2519 max = elapsed;
2523 * Update minimum and maximum probe times between
2524 * these two nodes
2526 if (min < probe_stats->probe_min[from][to] ||
2527 probe_stats->probe_min[from][to] == 0)
2528 probe_stats->probe_min[from][to] = min;
2530 if (max > probe_stats->probe_max[from][to])
2531 probe_stats->probe_max[from][to] = max;
2533 return (min);
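/*
 * Illustrative sketch: the probing loop above keeps the minimum of several
 * timed samples, on the assumption that interference (interrupts, cache
 * effects) only ever makes a sample slower, so the minimum is the best
 * estimate of the true latency.  probe_one_sample() is a hypothetical
 * stand-in for one timed page copy or vendor ID read.
 */
static hrtime_t
example_min_probe(hrtime_t (*probe_one_sample)(void), int nsamples)
{
	hrtime_t elapsed;
	hrtime_t min = -1;
	int i;

	for (i = 0; i < nsamples; i++) {
		elapsed = probe_one_sample();
		if (min == -1 || elapsed < min)
			min = elapsed;
	}
	return (min);
}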
2538 * Read boot property with CPU to APIC ID array, fill in CPU to node ID
2539 * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
2540 * and return number of CPU APIC IDs.
2542 * NOTE: This code assumes that CPU IDs are assigned in the order that they
2543 * appear in the cpu_apicid_array boot property, which is based on and
2544 * follows the same ordering as the processor list in ACPI MADT. If the code in
2545 * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns
2546 * CPU IDs ever changes, then this code will need to change too....
2548 static int
2549 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
2551 int boot_prop_len;
2552 char *boot_prop_name = BP_CPU_APICID_ARRAY;
2553 uint32_t *cpu_apicid_array;
2554 int i;
2555 int n;
2558 * Check length of property value
2560 boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
2561 if (boot_prop_len <= 0)
2562 return (-1);
2565 * Calculate number of entries in array and return when the system is
2566 * not very interesting for NUMA. It's not interesting for NUMA if
2567 * the system has only one CPU and doesn't support CPU hotplug.
2569 n = boot_prop_len / sizeof (*cpu_apicid_array);
2570 if (n == 1 && !plat_dr_support_cpu())
2571 return (-2);
2573 cpu_apicid_array = (uint32_t *)BOP_ALLOC(bootops, NULL, boot_prop_len,
2574 sizeof (*cpu_apicid_array));
2576 * Get CPU to APIC ID property value
2578 if (cpu_apicid_array == NULL ||
2579 BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
2580 return (-3);
2583 * Just return number of CPU APIC IDs if CPU to node mapping table is
2584 * NULL
2586 if (cpu_node == NULL) {
2587 if (plat_dr_support_cpu() && n >= boot_ncpus) {
2588 return (boot_ncpus);
2589 } else {
2590 return (n);
2595 * Fill in CPU to node ID mapping table with APIC ID for each CPU
2597 for (i = 0; i < n; i++) {
2598 /* Only add boot CPUs into the map if CPU DR is enabled. */
2599 if (plat_dr_support_cpu() && i >= boot_ncpus)
2600 break;
2601 cpu_node[i].exists = 1;
2602 cpu_node[i].apicid = cpu_apicid_array[i];
2603 cpu_node[i].prox_domain = UINT32_MAX;
2604 cpu_node[i].node = UINT_MAX;
2608 * Return number of CPUs based on number of APIC IDs
2610 return (i);
2615 * Read ACPI System Locality Information Table (SLIT) to determine how far each
2616 * NUMA node is from each other
2618 static int
2619 lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
2620 node_domain_map_t *node_domain, uint_t node_cnt,
2621 memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
2623 int i;
2624 int j;
2625 int src;
2626 int dst;
2627 int localities;
2628 hrtime_t max;
2629 hrtime_t min;
2630 int retval;
2631 uint8_t *slit_entries;
2633 if (tp == NULL || !lgrp_plat_slit_enable)
2634 return (1);
2636 if (lat_stats == NULL)
2637 return (2);
2639 localities = tp->LocalityCount;
2641 min = lat_stats->latency_min;
2642 max = lat_stats->latency_max;
2645 * Fill in latency matrix based on SLIT entries
2647 slit_entries = tp->Entry;
2648 for (i = 0; i < localities; i++) {
2649 src = lgrp_plat_domain_to_node(node_domain,
2650 node_cnt, i);
2651 if (src == -1)
2652 continue;
2654 for (j = 0; j < localities; j++) {
2655 uint8_t latency;
2657 dst = lgrp_plat_domain_to_node(node_domain,
2658 node_cnt, j);
2659 if (dst == -1)
2660 continue;
2662 latency = slit_entries[(i * localities) + j];
2663 lat_stats->latencies[src][dst] = latency;
2664 if (latency < min || min == -1)
2665 min = latency;
2666 if (latency > max)
2667 max = latency;
2672 * Verify that latencies/distances given in SLIT look reasonable
2674 retval = lgrp_plat_latency_verify(memnode_info, lat_stats);
2676 if (retval) {
2678 * Reinitialize (zero) latency table since SLIT doesn't look
2679 * right
2681 for (i = 0; i < localities; i++) {
2682 for (j = 0; j < localities; j++)
2683 lat_stats->latencies[i][j] = 0;
2685 } else {
2687 * Update min and max latencies seen since SLIT looks valid
2689 lat_stats->latency_min = min;
2690 lat_stats->latency_max = max;
2693 return (retval);
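/*
 * Illustrative sketch: the SLIT entry matrix walked above is row-major, so
 * the relative distance from locality i to locality j lives at
 * Entry[i * LocalityCount + j].  example_slit_distance() is a hypothetical
 * helper and the two-node values in its comment are made-up examples.
 */
static uint8_t
example_slit_distance(const uint8_t *entries, int localities, int i, int j)
{
	/*
	 * e.g. entries = { 10, 21, 21, 10 } with localities = 2:
	 * (0,0) -> 10, (0,1) -> 21, (1,0) -> 21, (1,1) -> 10
	 */
	return (entries[(i * localities) + j]);
}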
2698 * Update lgrp latencies according to information returned by ACPI _SLI method.
2700 static int
2701 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
2702 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
2703 lgrp_plat_latency_stats_t *lat_stats)
2705 int i;
2706 int src, dst;
2707 uint8_t latency;
2708 hrtime_t max, min;
2710 if (lat_stats == NULL || sli_info == NULL ||
2711 sli_cnt == 0 || domain_id >= sli_cnt)
2712 return (-1);
2714 src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
2715 if (src == -1) {
2716 src = lgrp_plat_node_domain_update(node_domain, node_cnt,
2717 domain_id);
2718 if (src == -1)
2719 return (-1);
2723 * Don't update latency info if topology has been flattened to 2 levels.
2725 if (lgrp_plat_topo_flatten != 0) {
2726 return (0);
2730 * Latency information for proximity domain is ready.
2731 * TODO: support adjusting latency information at runtime.
2733 if (lat_stats->latencies[src][src] != 0) {
2734 return (0);
2737 /* Validate latency information. */
2738 for (i = 0; i < sli_cnt; i++) {
2739 if (i == domain_id) {
2740 if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
2741 sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
2742 return (-1);
2744 } else {
2745 if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
2746 sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
2747 sli_info[i] != sli_info[sli_cnt + i]) {
2748 return (-1);
2753 min = lat_stats->latency_min;
2754 max = lat_stats->latency_max;
2755 for (i = 0; i < sli_cnt; i++) {
2756 dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
2757 if (dst == -1)
2758 continue;
2760 ASSERT(sli_info[i] == sli_info[sli_cnt + i]);
2762 /* Update row in latencies matrix. */
2763 latency = sli_info[i];
2764 lat_stats->latencies[src][dst] = latency;
2765 if (latency < min || min == -1)
2766 min = latency;
2767 if (latency > max)
2768 max = latency;
2770 /* Update column in latencies matrix. */
2771 latency = sli_info[sli_cnt + i];
2772 lat_stats->latencies[dst][src] = latency;
2773 if (latency < min || min == -1)
2774 min = latency;
2775 if (latency > max)
2776 max = latency;
2778 lat_stats->latency_min = min;
2779 lat_stats->latency_max = max;
2781 return (0);
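/*
 * Illustrative sketch: the _SLI buffer handled above is assumed to hold
 * 2 * sli_cnt bytes -- the first sli_cnt entries are the distances from the
 * given domain to every domain (one matrix row) and the next sli_cnt
 * entries are the distances from every domain back to it (one matrix
 * column), which is why the code indexes sli_info[i] and
 * sli_info[sli_cnt + i].  example_sli_symmetric() is a hypothetical helper.
 */
static int
example_sli_symmetric(uchar_t *sli_info, uint32_t sli_cnt, uint32_t i)
{
	uint8_t to_i = sli_info[i];			/* row entry */
	uint8_t from_i = sli_info[sli_cnt + i];		/* column entry */

	return (to_i == from_i);
}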
2786 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
2787 * and memory are local to each other in the same NUMA node and return number
2788 * of nodes
2790 static int
2791 lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
2792 uint32_t *prox_domain_min, node_domain_map_t *node_domain,
2793 cpu_node_map_t *cpu_node, int cpu_count,
2794 memnode_phys_addr_map_t *memnode_info)
2796 ACPI_SUBTABLE_HEADER *item, *srat_end;
2797 int i;
2798 int node_cnt;
2799 int proc_entry_count;
2800 int rc;
2803 * Nothing to do when no SRAT or disabled
2805 if (tp == NULL || !lgrp_plat_srat_enable)
2806 return (-1);
2809 * Try to get domain information from MSCT table.
2810 * ACPI4.0: OSPM will use information provided by the MSCT only
2811 * when the System Resource Affinity Table (SRAT) exists.
2813 node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
2814 if (node_cnt <= 0) {
2816 * Determine number of nodes by counting number of proximity
2817 * domains in SRAT.
2819 node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
2822 * Return if the number of nodes is 1 or less since there is no need to read the SRAT.
2824 if (node_cnt == 1)
2825 return (1);
2826 else if (node_cnt <= 0)
2827 return (-2);
2830 * Walk through SRAT, examining each CPU and memory entry to determine
2831 * which CPUs and memory belong to which node.
2833 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
2834 srat_end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
2835 proc_entry_count = 0;
2836 while (item < srat_end) {
2837 uint32_t apic_id;
2838 uint32_t domain;
2839 uint64_t end;
2840 uint64_t length;
2841 uint64_t start;
2843 switch (item->Type) {
2844 case ACPI_SRAT_TYPE_CPU_AFFINITY: { /* CPU entry */
2845 ACPI_SRAT_CPU_AFFINITY *cpu =
2846 (ACPI_SRAT_CPU_AFFINITY *) item;
2848 if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2849 cpu_node == NULL)
2850 break;
2853 * Calculate domain (node) ID and fill in APIC ID to
2854 * domain/node mapping table
2856 domain = cpu->ProximityDomainLo;
2857 for (i = 0; i < 3; i++) {
2858 domain += cpu->ProximityDomainHi[i] <<
2859 ((i + 1) * 8);
2861 apic_id = cpu->ApicId;
2863 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2864 cpu_node, cpu_count, apic_id, domain);
2865 if (rc < 0)
2866 return (-3);
2867 else if (rc == 0)
2868 proc_entry_count++;
2869 break;
2871 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: { /* memory entry */
2872 ACPI_SRAT_MEM_AFFINITY *mem =
2873 (ACPI_SRAT_MEM_AFFINITY *)item;
2875 if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED) ||
2876 memnode_info == NULL)
2877 break;
2880 * Get domain (node) ID and fill in domain/node
2881 * to memory mapping table
2883 domain = mem->ProximityDomain;
2884 start = mem->BaseAddress;
2885 length = mem->Length;
2886 end = start + length - 1;
2889 * According to ACPI 4.0, both ENABLE and HOTPLUG flags
2890 * may be set for memory address range entries in SRAT
2891 * table which are reserved for memory hot plug.
2892 * We intersect memory address ranges in SRAT table
2893 * with memory ranges in physinstalled to filter out
2894 * memory address ranges reserved for hot plug.
2896 if (mem->Flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
2897 uint64_t rstart = UINT64_MAX;
2898 uint64_t rend = 0;
2899 struct memlist *ml;
2900 extern struct bootops *bootops;
2902 memlist_read_lock();
2903 for (ml = bootops->boot_mem->physinstalled;
2904 ml; ml = ml->ml_next) {
2905 uint64_t tstart = ml->ml_address;
2906 uint64_t tend;
2908 tend = ml->ml_address + ml->ml_size;
2909 if (tstart > end || tend < start)
2910 continue;
2911 if (start > tstart)
2912 tstart = start;
2913 if (rstart > tstart)
2914 rstart = tstart;
2915 if (end < tend)
2916 tend = end;
2917 if (rend < tend)
2918 rend = tend;
2920 memlist_read_unlock();
2921 start = rstart;
2922 end = rend;
2923 /* Skip this entry if no memory installed. */
2924 if (start > end)
2925 break;
2928 if (lgrp_plat_memnode_info_update(node_domain,
2929 node_cnt, memnode_info, node_cnt,
2930 start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
2931 return (-4);
2932 break;
2934 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: { /* x2apic CPU */
2935 ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
2936 (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
2938 if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
2939 cpu_node == NULL)
2940 break;
2943 * Calculate domain (node) ID and fill in APIC ID to
2944 * domain/node mapping table
2946 domain = x2cpu->ProximityDomain;
2947 apic_id = x2cpu->ApicId;
2949 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
2950 cpu_node, cpu_count, apic_id, domain);
2951 if (rc < 0)
2952 return (-3);
2953 else if (rc == 0)
2954 proc_entry_count++;
2955 break;
2957 default:
2958 break;
2961 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
2965 * Should have seen at least as many SRAT processor entries as CPUs
2967 if (proc_entry_count < cpu_count)
2968 return (-5);
2971 * Need to sort nodes by starting physical address since VM system
2972 * assumes and expects memnodes to be sorted in ascending order by
2973 * physical address
2975 lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
2976 memnode_info);
2978 return (node_cnt);
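/*
 * Illustrative sketch: shows how the 32-bit proximity domain is assembled
 * from the split ProximityDomainLo byte and ProximityDomainHi[3] bytes of
 * an ACPI 1.0-style SRAT CPU affinity entry, mirroring the loop in
 * lgrp_plat_process_srat() above.  example_srat_domain() is a hypothetical
 * helper and the byte values in its comment are made up.
 */
static uint32_t
example_srat_domain(uint8_t lo, const uint8_t hi[3])
{
	uint32_t domain;
	int i;

	domain = lo;					/* bits 7-0 */
	for (i = 0; i < 3; i++)
		domain += (uint32_t)hi[i] << ((i + 1) * 8);	/* bits 31-8 */

	/* e.g. lo = 0x02, hi = { 0x01, 0x00, 0x00 } yields domain 0x102 */
	return (domain);
}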
2983 * Allocate permanent memory for any temporary memory that we needed to
2984 * allocate using BOP_ALLOC() before kmem_alloc() and the VM system were
2985 * initialized, and copy everything from temporary to permanent memory since
2986 * temporary boot memory will eventually be released during boot
2988 static void
2989 lgrp_plat_release_bootstrap(void)
2991 void *buf;
2992 size_t size;
2994 if (lgrp_plat_cpu_node_nentries > 0) {
2995 size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
2996 buf = kmem_alloc(size, KM_SLEEP);
2997 bcopy(lgrp_plat_cpu_node, buf, size);
2998 lgrp_plat_cpu_node = buf;
3004 * Return number of proximity domains given in ACPI SRAT
3006 static int
3007 lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp, uint32_t *prox_domain_min)
3009 int domain_cnt;
3010 uint32_t domain_min;
3011 ACPI_SUBTABLE_HEADER *item, *end;
3012 int i;
3013 node_domain_map_t node_domain[MAX_NODES];
3016 if (tp == NULL || !lgrp_plat_srat_enable)
3017 return (1);
3020 * Walk through SRAT to find minimum proximity domain ID
3022 domain_min = UINT32_MAX;
3023 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3024 end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3025 while (item < end) {
3026 uint32_t domain;
3028 switch (item->Type) {
3029 case ACPI_SRAT_TYPE_CPU_AFFINITY: { /* CPU entry */
3030 ACPI_SRAT_CPU_AFFINITY *cpu =
3031 (ACPI_SRAT_CPU_AFFINITY *) item;
3033 if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3034 item = (ACPI_SUBTABLE_HEADER *)
3035 ((uintptr_t)item + item->Length);
3036 continue;
3038 domain = cpu->ProximityDomainLo;
3039 for (i = 0; i < 3; i++) {
3040 domain += cpu->ProximityDomainHi[i] <<
3041 ((i + 1) * 8);
3043 break;
3045 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: { /* memory entry */
3046 ACPI_SRAT_MEM_AFFINITY *mem =
3047 (ACPI_SRAT_MEM_AFFINITY *)item;
3049 if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3050 item = (ACPI_SUBTABLE_HEADER *)
3051 ((uintptr_t)item + item->Length);
3052 continue;
3054 domain = mem->ProximityDomain;
3055 break;
3057 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: { /* x2apic CPU */
3058 ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3059 (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3061 if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3062 item = (ACPI_SUBTABLE_HEADER *)
3063 ((uintptr_t)item + item->Length);
3064 continue;
3066 domain = x2cpu->ProximityDomain;
3067 break;
3069 default:
3070 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3071 item->Length);
3072 continue;
3076 * Keep track of minimum proximity domain ID
3078 if (domain < domain_min)
3079 domain_min = domain;
3081 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3083 if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
3084 *prox_domain_min = domain_min;
3087 * Walk through SRAT, examining each CPU and memory entry to determine
3088 * proximity domain ID for each.
3090 domain_cnt = 0;
3091 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
3092 end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
3093 bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
3094 while (item < end) {
3095 uint32_t domain;
3096 boolean_t overflow;
3097 uint_t start;
3099 switch (item->Type) {
3100 case ACPI_SRAT_TYPE_CPU_AFFINITY: { /* CPU entry */
3101 ACPI_SRAT_CPU_AFFINITY *cpu =
3102 (ACPI_SRAT_CPU_AFFINITY *) item;
3104 if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3105 item = (ACPI_SUBTABLE_HEADER *)
3106 ((uintptr_t)item + item->Length);
3107 continue;
3109 domain = cpu->ProximityDomainLo;
3110 for (i = 0; i < 3; i++) {
3111 domain += cpu->ProximityDomainHi[i] <<
3112 ((i + 1) * 8);
3114 break;
3116 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: { /* memory entry */
3117 ACPI_SRAT_MEM_AFFINITY *mem =
3118 (ACPI_SRAT_MEM_AFFINITY *)item;
3120 if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
3121 item = (ACPI_SUBTABLE_HEADER *)
3122 ((uintptr_t)item + item->Length);
3123 continue;
3125 domain = mem->ProximityDomain;
3126 break;
3128 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: { /* x2apic CPU */
3129 ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
3130 (ACPI_SRAT_X2APIC_CPU_AFFINITY *) item;
3132 if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
3133 item = (ACPI_SUBTABLE_HEADER *)
3134 ((uintptr_t)item + item->Length);
3135 continue;
3137 domain = x2cpu->ProximityDomain;
3138 break;
3140 default:
3141 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
3142 item->Length);
3143 continue;
3147 * Count and keep track of which proximity domain IDs seen
3149 start = i = domain % MAX_NODES;
3150 overflow = B_TRUE;
3151 do {
3153 * Create entry for proximity domain and increment
3154 * count when no entry exists where proximity domain
3155 * hashed
3157 if (!node_domain[i].exists) {
3158 node_domain[i].exists = 1;
3159 node_domain[i].prox_domain = domain;
3160 domain_cnt++;
3161 overflow = B_FALSE;
3162 break;
3166 * Nothing to do when proximity domain seen already
3167 * and its entry exists
3169 if (node_domain[i].prox_domain == domain) {
3170 overflow = B_FALSE;
3171 break;
3175 * An entry exists where this proximity domain hashed, but
3176 * for a different proximity domain, so keep searching for an
3177 * empty slot to put it in or a matching entry, whichever
3178 * comes first.
3180 i = (i + 1) % MAX_NODES;
3181 } while (i != start);
3184 * Didn't find an empty or matching entry, which means there are
3185 * more proximity domains than supported nodes (:-(
3187 ASSERT(overflow != B_TRUE);
3188 if (overflow == B_TRUE)
3189 return (-1);
3191 item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
3193 return (domain_cnt);
3198 * Parse domain information in ACPI Maximum System Capability Table (MSCT).
3199 * MSCT table has been verified in function process_msct() in fakebop.c.
3201 static int
3202 lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp, uint32_t *prox_domain_min)
3204 int last_seen = 0;
3205 uint32_t proxmin = UINT32_MAX;
3206 ACPI_MSCT_PROXIMITY *item, *end;
3208 if (tp == NULL || lgrp_plat_msct_enable == 0)
3209 return (-1);
3211 if (tp->MaxProximityDomains >= MAX_NODES) {
3212 cmn_err(CE_CONT,
3213 "?lgrp: too many proximity domains (%d), max %d supported, "
3214 "disable support of CPU/memory DR operations.",
3215 tp->MaxProximityDomains + 1, MAX_NODES);
3216 plat_dr_disable_cpu();
3217 plat_dr_disable_memory();
3218 return (-1);
3221 if (prox_domain_min != NULL) {
3222 end = (void *)(tp->Header.Length + (uintptr_t)tp);
3223 for (item = (void *)((uintptr_t)tp +
3224 tp->ProximityOffset); item < end;
3225 item = (void *)(item->Length + (uintptr_t)item)) {
3226 if (item->RangeStart < proxmin) {
3227 proxmin = item->RangeStart;
3230 last_seen = item->RangeEnd - item->RangeStart + 1;
3232 * Break out if all proximity domains have been
3233 * processed. Some BIOSes may have unused items
3234 * at the end of MSCT table.
3236 if (last_seen > tp->MaxProximityDomains) {
3237 break;
3240 *prox_domain_min = proxmin;
3243 return (tp->MaxProximityDomains + 1);
3248 * Set lgroup latencies for 2 level lgroup topology
3250 static void
3251 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
3253 int i, j;
3255 ASSERT(lat_stats != NULL);
3257 if (lgrp_plat_node_cnt >= 4)
3258 cmn_err(CE_NOTE,
3259 "MPO only optimizing for local and remote\n");
3260 for (i = 0; i < lgrp_plat_node_cnt; i++) {
3261 for (j = 0; j < lgrp_plat_node_cnt; j++) {
3262 if (i == j)
3263 lat_stats->latencies[i][j] = 2;
3264 else
3265 lat_stats->latencies[i][j] = 3;
3268 lat_stats->latency_min = 2;
3269 lat_stats->latency_max = 3;
3270 /* TODO: check it. */
3271 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
3272 lgrp_plat_topo_flatten = 1;
3277 * The following Opteron specific constants, macros, types, and routines define
3278 * PCI configuration space registers and how to read them to determine the NUMA
3279 * configuration of *supported* Opteron processors. They provide the same
3280 * information that may be gotten from the ACPI System Resource Affinity Table
3281 * (SRAT) if it exists on the machine of interest.
3283 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
3284 * of interest describes all of these registers and their contents. The main
3285 * registers used by this code to determine the NUMA configuration of the
3286 * machine are the node ID register for the number of NUMA nodes and the DRAM
3287 * address map registers for the physical address range of each node.
3289 * NOTE: The format and how to determine the NUMA configuration using PCI
3290 * config space registers may change or may not be supported in future
3291 * Opteron processor families.
3295 * How many bits to shift Opteron DRAM Address Map base and limit registers
3296 * to get actual value
3298 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */
3299 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */
3301 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */
3302 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */
3304 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */
3307 * Macros to derive addresses from Opteron DRAM Address Map registers
3309 #define OPT_DRAMADDR_HI(reg) \
3310 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
3311 OPT_DRAMADDR_HI_LSHIFT_ADDR)
3313 #define OPT_DRAMADDR_LO(reg) \
3314 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
3315 OPT_DRAMADDR_LO_LSHIFT_ADDR)
3317 #define OPT_DRAMADDR(high, low) \
3318 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
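/*
 * Illustrative sketch: a worked example of the address reconstruction
 * macros above.  example_dram_base() is a hypothetical helper and its
 * register values are made up; on real hardware they come from the DRAM
 * Address Map registers read in opt_get_numa_config() below.
 */
static uint64_t
example_dram_base(void)
{
	uint32_t base_hi = 0x1;		/* address bits 47-40 */
	uint32_t base_lo = 0x00400000;	/* bits 31-16 hold address bits 39-24 */

	/*
	 * OPT_DRAMADDR_HI(0x1)        = 0x1ULL << 40    = 0x010000000000
	 * OPT_DRAMADDR_LO(0x00400000) = 0x00400000 << 8 = 0x000040000000
	 * OPT_DRAMADDR(hi, lo)        = 0x010040000000  (1 TB + 1 GB)
	 */
	return (OPT_DRAMADDR(base_hi, base_lo));
}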
3321 * Bit masks defining what's in Opteron DRAM Address Map base register
3323 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */
3324 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */
3325 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */
3328 * Bit masks defining what's in Opteron DRAM Address Map limit register
3330 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */
3331 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */
3335 * Opteron Node ID register in PCI configuration space contains
3336 * number of nodes in system, etc. for Opteron K8. The following
3337 * constants and macros define its contents, structure, and access.
3341 * Bit masks defining what's in Opteron Node ID register
3343 #define OPT_NODE_MASK_ID 0x7 /* node ID */
3344 #define OPT_NODE_MASK_CNT 0x70 /* node count */
3345 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */
3346 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */
3347 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */
3350 * How many bits in Opteron Node ID register to shift right to get actual value
3352 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */
3355 * Macros to get values from Opteron Node ID register
3357 #define OPT_NODE_CNT(reg) \
3358 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
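/*
 * Illustrative sketch: a worked example of OPT_NODE_CNT() with a made-up
 * register value.  The field holds the number of nodes minus one, which is
 * why opt_get_numa_config() below adds 1 to the macro's result.
 * example_node_cnt() is a hypothetical helper.
 */
static uint_t
example_node_cnt(void)
{
	uint_t reg = 0x30;	/* node count field (bits 6-4) = 3 */

	/* OPT_NODE_CNT(0x30) = (0x30 & 0x70) >> 4 = 3, so 4 nodes */
	return (OPT_NODE_CNT(reg) + 1);
}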
3361 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
3362 * "in/out" instructions
3364 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
3365 * other uses should just do MMIO to access PCI ECS.
3366 * Must enable special bit in Northbridge Configuration Register on
3367 * Greyhound for extended CF8 space access to be able to access PCI ECS
3368 * using "in/out" instructions and restore special bit after done
3369 * accessing PCI ECS.
3371 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \
3372 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \
3373 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
3374 ((((reg) >> 8) & 0xf) << 24))
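/*
 * Illustrative sketch: a worked example of OPT_PCI_ECS_ADDR() showing
 * where the extended register bits land.  For bus 0, device 24 (node 0),
 * function 1 (address map) and register 0x140 (high DRAM base), register
 * bits 7-2 stay in the low byte of the address as usual while the
 * extended register bits 11-8 move up into bits 27-24.
 * example_ecs_addr() is a hypothetical helper.
 */
static uint32_t
example_ecs_addr(void)
{
	/*
	 * = PCI_CONE | (24 << 11) | (1 << 8) | 0x40 | (0x1 << 24)
	 * = PCI_CONE | 0x0100c140
	 */
	return (OPT_PCI_ECS_ADDR(0, 24, 1, 0x140));
}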
3377 * PCI configuration space registers accessed by specifying
3378 * a bus, device, function, and offset. The following constants
3379 * define the values needed to access Opteron K8 configuration
3380 * info to determine its node topology
3383 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */
3386 * Opteron PCI configuration space register function values
3388 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */
3389 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */
3390 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */
3391 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */
3394 * PCI Configuration Space register offsets
3396 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */
3397 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */
3398 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */
3399 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */
3402 * Opteron PCI Configuration Space device IDs for nodes
3404 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */
3408 * Opteron DRAM address map gives base and limit for physical memory in a node
3410 typedef struct opt_dram_addr_map {
3411 uint32_t base_hi;
3412 uint32_t base_lo;
3413 uint32_t limit_hi;
3414 uint32_t limit_lo;
3415 } opt_dram_addr_map_t;
3419 * Supported AMD processor families
3421 #define AMD_FAMILY_HAMMER 15
3422 #define AMD_FAMILY_GREYHOUND 16
3425 * Whether to have is_opteron() return 1 even when processor isn't supported
3427 uint_t is_opteron_override = 0;
3430 * AMD processor family for current CPU
3432 uint_t opt_family = 0;
3436 * Determine whether we're running on a supported AMD Opteron since reading
3437 * node count and DRAM address map registers may have a different format or
3438 * may not be supported across processor families
3440 static int
3441 is_opteron(void)
3444 if (x86_vendor != X86_VENDOR_AMD)
3445 return (0);
3447 opt_family = cpuid_getfamily(CPU);
3448 if (opt_family == AMD_FAMILY_HAMMER ||
3449 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
3450 return (1);
3451 else
3452 return (0);
3457 * Determine NUMA configuration for Opteron from registers that live in PCI
3458 * configuration space
3460 static void
3461 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
3462 memnode_phys_addr_map_t *memnode_info)
3464 uint_t bus;
3465 uint_t dev;
3466 struct opt_dram_addr_map dram_map[MAX_NODES];
3467 uint_t node;
3468 uint_t node_info[MAX_NODES];
3469 uint_t off_hi;
3470 uint_t off_lo;
3471 uint64_t nb_cfg_reg;
3474 * Read configuration registers from PCI configuration space to
3475 * determine node information, which memory is in each node, etc.
3477 * Write to PCI configuration space address register to specify
3478 * which configuration register to read and read/write PCI
3479 * configuration space data register to get/set contents
3481 bus = OPT_PCS_BUS_CONFIG;
3482 dev = OPT_PCS_DEV_NODE0;
3483 off_hi = OPT_PCS_OFF_DRAMBASE_HI;
3484 off_lo = OPT_PCS_OFF_DRAMBASE_LO;
3487 * Read node ID register for node 0 to get node count
3489 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
3490 OPT_PCS_OFF_NODEID);
3491 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
3494 * If number of nodes is more than maximum supported, then set node
3495 * count to 1 and treat system as UMA instead of NUMA.
3497 if (*node_cnt > MAX_NODES) {
3498 *node_cnt = 1;
3499 return;
3503 * For Greyhound, PCI Extended Configuration Space must be enabled to
3504 * read high DRAM address map base and limit registers
3506 if (opt_family == AMD_FAMILY_GREYHOUND) {
3507 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
3508 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3509 wrmsr(MSR_AMD_NB_CFG,
3510 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
3513 for (node = 0; node < *node_cnt; node++) {
3514 uint32_t base_hi;
3515 uint32_t base_lo;
3516 uint32_t limit_hi;
3517 uint32_t limit_lo;
3520 * Read node ID register (except for node 0 which we just read)
3522 if (node > 0) {
3523 node_info[node] = pci_getl_func(bus, dev,
3524 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
3528 * Read DRAM base and limit registers which specify
3529 * physical memory range of each node
3531 if (opt_family != AMD_FAMILY_GREYHOUND)
3532 base_hi = 0;
3533 else {
3534 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3535 OPT_PCS_FUNC_ADDRMAP, off_hi));
3536 base_hi = dram_map[node].base_hi =
3537 inl(PCI_CONFDATA);
3539 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
3540 OPT_PCS_FUNC_ADDRMAP, off_lo);
3542 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
3543 mem_intrlv)
3544 *mem_intrlv = *mem_intrlv + 1;
3546 off_hi += 4; /* high limit register offset */
3547 if (opt_family != AMD_FAMILY_GREYHOUND)
3548 limit_hi = 0;
3549 else {
3550 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
3551 OPT_PCS_FUNC_ADDRMAP, off_hi));
3552 limit_hi = dram_map[node].limit_hi =
3553 inl(PCI_CONFDATA);
3556 off_lo += 4; /* low limit register offset */
3557 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
3558 dev, OPT_PCS_FUNC_ADDRMAP, off_lo);
3561 * Increment device number to next node and register offsets
3562 * for DRAM base register of next node
3564 off_hi += 4;
3565 off_lo += 4;
3566 dev++;
3569 * Both read and write enable bits must be enabled in DRAM
3570 * address map base register for physical memory to exist in
3571 * node
3573 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
3574 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
3576 * Mark node memory as non-existent and set start and
3577 * end addresses to be same in memnode_info[]
3579 memnode_info[node].exists = 0;
3580 memnode_info[node].start = memnode_info[node].end =
3581 (pfn_t)-1;
3582 continue;
3586 * Mark node memory as existing and remember physical address
3587 * range of each node for use later
3589 memnode_info[node].exists = 1;
3591 memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));
3593 memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
3594 OPT_DRAMADDR_LO_MASK_OFF);
3598 * Restore PCI Extended Configuration Space enable bit
3600 if (opt_family == AMD_FAMILY_GREYHOUND) {
3601 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
3602 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
3608 * Return the average time it takes the current CPU to read the vendor ID
3609 * register from the Northbridge on the specified destination node, over N reads
3611 static hrtime_t
3612 opt_probe_vendor(int dest_node, int nreads)
3614 int cnt;
3615 uint_t dev;
3616 /* LINTED: set but not used in function */
3617 volatile uint_t dev_vendor;
3618 hrtime_t elapsed;
3619 hrtime_t end;
3620 int ipl;
3621 hrtime_t start;
3623 dev = OPT_PCS_DEV_NODE0 + dest_node;
3624 kpreempt_disable();
3625 ipl = spl8();
3626 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
3627 OPT_PCS_OFF_VENDOR));
3628 start = gethrtime();
3629 for (cnt = 0; cnt < nreads; cnt++)
3630 dev_vendor = inl(PCI_CONFDATA);
3631 end = gethrtime();
3632 elapsed = (end - start) / nreads;
3633 splx(ipl);
3634 kpreempt_enable();
3635 return (elapsed);